File: TrimSequence.h

package info (click to toggle)
libstatgen 1.0.15-8
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,588 kB
  • sloc: cpp: 49,624; ansic: 1,408; makefile: 320; sh: 60
file content (139 lines) | stat: -rw-r--r-- 5,013 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/*
 *  Copyright (C) 2010  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef _TRIMSEQUENCE_H
#define _TRIMSEQUENCE_H

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#ifndef __WIN32__
#include <unistd.h>
#endif

///
/// TrimSequence is a templated function to find bases
/// which are below a certain moving mean threshold,
/// and can be applied to either end of the sequence string.
///
/// @param sequence is the input sequence
/// @param meanValue is the value below which we wish to trim.
/// @return the iterator of the location at which untrimmed values begin
///
/// Details:
///
/// trimFromLeft is a bool indicating which direction we wish
/// to trim.  true -> left to right, false is right to left.
///
/// The code is convoluted enough here, so for implementation
/// and testing sanity, the following definitions are made:
///
/// When trimFromLeft is true:
///    result == sequence.begin() implies no trimming
///    result == sequence.end() implies all values are trimmed
///
/// When trimFromLeft is false:
///    result == sequence.begin() implies all values are trimmed
///    result == sequence.end() no values are trimmed
///
/// result will always be in the range [sequence.begin() , sequence.end())
/// (begin is inclusive, end is exclusive).
///
/// NOTE: See TrimSequence.h and test/TrimSequence_test.cpp for examples
///
/// THIS CODE IS EXCEPTIONALLY FRAGILE.  DO NOT ATTEMPT TO FIX OR
/// IMPROVE WITHOUT INCLUDING DOCUMENTED, UNDERSTANABLE TEST CASES THAT CLEARLY
/// SHOW WHY OR WHY NOT SOMETHING WORKS.
///
template<typename sequenceType, typename meanValueType>
typename sequenceType::iterator trimSequence(sequenceType &sequence, meanValueType meanValue, const bool trimFromLeft)
{
    const int howManyValues = 4;    // this is used in signed arithmetic below
    int windowThreshold = howManyValues * meanValue;
    int64_t sumOfWindow = 0;
    typename sequenceType::iterator it;

    //
    // Sanity check to weed out what otherwise would be
    // a large number of boundary checks below.  If the input
    // is too small, just punt it back to the caller.  Technically,
    // we can still trim, but we'd just do the simple iteration
    // loop.  Save that for when we care.
    //

    if (sequence.size() < (size_t) howManyValues)
        return trimFromLeft? sequence.begin() : sequence.end();

    typename sequenceType::iterator sequenceBegin;
    typename sequenceType::iterator sequenceEnd;

    // The algorithm should be clear and efficient
    // so it does not bother to write codes for two directions.
    // It that way, we avoid thinking trimming from left and right interchangably.
    if (trimFromLeft)
    {
        // sequenceBegin is inclusive, sequenceEnd is exclusive,
        sequenceBegin = sequence.begin();
        sequenceEnd = sequence.end();

        for (it = sequenceBegin; it < sequenceBegin + howManyValues; it++)
            sumOfWindow += *it;

        for (; it < sequenceEnd; it ++)
        {
            if (sumOfWindow > windowThreshold)
                break;
            sumOfWindow += *it;
            sumOfWindow -= *(it - howManyValues);
        }
        // here it is in the range of [sequenceBegin+howManyValues, sequenceEnd] inclusively
        // the range is also [sequence.begin() + howManyValues, sequence.end()]
        while (*(it-1) >= meanValue && (it-1) >= sequenceBegin)
            it--;
    }
    else
    {
        sequenceBegin = sequence.end() - 1;
        sequenceEnd = sequence.begin() - 1;

        for (it = sequenceBegin; it > sequenceBegin - howManyValues; it--)
            sumOfWindow += *it;

        for (; it > sequenceEnd; it--)
        {
            if (sumOfWindow > windowThreshold)
                break;
            sumOfWindow += *it;
            sumOfWindow -= *(it + howManyValues);
        }

        // here it is in the range of [sequenceEnd, sequenceBegin - howManyValues] inclusively
        // the range is also [sequence.begin() -1, sequence.end() - 1 - howManyValues]
        while (*(it+1) >= meanValue && (it+1) <= sequenceBegin)
            it ++;
        // note, the return value should in the range [sequence.begin(), sequence.end()]
        it += 1;
    }

    // 'it' may be sequence.end() in some cases
    assert(it >= sequence.begin() && it <= sequence.end());

    return it;
}

#endif