File: alignment.hh

package info (click to toggle)
augustus 3.4.0%2Bdfsg2-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 758,480 kB
  • sloc: cpp: 65,451; perl: 21,436; python: 3,927; ansic: 1,240; makefile: 1,032; sh: 189; javascript: 32
file content (414 lines) | stat: -rw-r--r-- 14,221 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
/*
 * alignment.hh
 *
 * License: Artistic License, see file LICENSE.TXT or 
 *          https://opensource.org/licenses/artistic-license-1.0
 */

#ifndef _ALIGNMENT
#define _ALIGNMENT

#include "types.hh"

#include <vector>
#include <list>
#include <iostream>
#include <exception>
#include <climits>

class MsaSignature; // forward declaration

/**
 * @brief gapless alignment fragment
 * 
 * @author Mario Stanke
 */
class fragment {
public:
    fragment(int chrPos, int aliPos, int len) : chrPos(chrPos), aliPos(aliPos), len(len) {}
    fragment() {}
    int chrEnd() const { return chrPos + len - 1; }
    int chrPos; // chromosomal start position of fragment, 0-based
    int aliPos; // start position of fragment in alignment, 0-based
    int len;    // fragment length

    /*
    *   added by Giovanna Migliorelli 14.05.2020 
    *   code responsible for serialization
    *   aknowledgement : https://www.boost.org/doc/libs/1_70_0/libs/serialization/doc/tutorial.html
    */

    #ifdef TESTING
    friend class boost::serialization::access;
    template<class Archive>
    void serialize(Archive & ar, const unsigned int version)
    {
        ar & chrPos; 
        ar & aliPos;
        ar & len; 
    }
    #endif    
};

/**
 * @brief global multiple sequence alignment
 * 
 * @details alignment does not contain the sequence itself, but the info, where gaps are.<br>
 * Generation of exon candidates.
 * 
 * @author Mario Stanke
 */
class AlignmentRow {
public:
    AlignmentRow() : cumFragLen(0) {}
    AlignmentRow(string seqID, int chrPos, Strand strand, string rowbuffer);
    ~AlignmentRow(){}

    int chrStart() const;
    int chrEnd() const;
    int aliEnd() const;
    int getSeqLen() const { return chrEnd() - chrStart() + 1; }
    int getCumFragLen() const { return cumFragLen; }
    void setCumFragLen(int len) { cumFragLen = len;} // use with care to ensure consistency
    int gapLenAfterFrag(size_t i) const {
	if (i+1 >= frags.size())
	    return 0;
	return frags[i+1].chrPos - frags[i].chrPos - frags[i].len;
    }
    void addFragment(int chrPos, int aliPos, int len);
    void addFragment(fragment &f) { addFragment(f.chrPos, f.aliPos, f.len); }
    string getSignature() const {return seqID + ((strand == minusstrand)? "-" : "+");}
    void pack();
    friend ostream& operator<< (ostream& strm, const AlignmentRow &row);
    friend void appendRow(AlignmentRow **r1, const AlignmentRow *r2, int aliLen1, string sigstr);

    /** convert from chromosomal to alignment position
     * start search from the fragment 'from' on, i.e. assume that aliPos is not to the left of fragment *from
     * return -1, if chrPos is outside the range of these fragments
     * return -2, if position is otherwise not mappable: no fragment contains the chrPos, i.e. chrPos is in a gap
     */
    int getAliPos(int chrPos, vector<fragment>::const_iterator from);
    int getAliPos(int chrPos, vector<fragment>::const_iterator *from); // variant from Patrick Balmerth

    int getAliPos(int chrPos) { return getAliPos(chrPos, frags.begin()); }

    // convert from alignment to chromosomal position (inverse function of getAliPos())
    int getChrPos(int aliPos, vector<fragment>::const_iterator from);
    int getChrPos(int aliPos) { return getChrPos(aliPos, frags.begin()); }
    
    // data members
    string seqID; // e.g. chr21
    
    #ifdef TESTING
    int seqIDarchive;   // GM temporarily added to come around some problem with string serialization
    int chrLen;         // GM added to properly set rsa chr lengths during deserialization
    #endif

    Strand strand;
    vector<fragment> frags; // fragments are sorted by alignment positions AND by chromosomal positions (assumption for now)
private:
    int cumFragLen; // total length of all fragments, becomes a nonredundant attribute after merging of alignments

    /*
    *   added by Giovanna Migliorelli 14.05.2020 
    *   code responsible for serialization
    *   Acknowledegment : https://www.boost.org/doc/libs/1_70_0/libs/serialization/doc/tutorial.html
    */

    #ifdef TESTING
    friend class boost::serialization::access;
    template<class Archive>
    void serialize(Archive & ar, const unsigned int version)
    {
        ar & cumFragLen;
        ar & seqIDarchive; 
        ar & strand;
        ar & frags; 
        ar & chrLen;    
    }
    #endif
};



/**
 * @brief global multiple sequence alignment with efficiently stored long gaps.
 * 
 * @details Generation of exon candidates
 * 
 *          1000      1010       1020      1030       1040      1050    chromosomal
 * chr21       |.........|..........|.........|..........|.........|    positions
 *               *********                 ************                 in species 1
 *                \       \               /          /
 * align chr21     *********-------------************------------
 * ment   chr7     ----*****-----------------********************
 *                     |   |                  \    a fragment    \
 *                     *****                   ********************
 * chr7           |.........|..........|.........|..........|.........| chromosomal
 *             2000      2010       2020      2030       2040      2050 positions
 *                                                                      in species 2
 * Here: Alignment a;
 *       a.aliLen = 46
 *       a.rows[0].frags = ((1002, 0, 9), (1027, 23, 12))
 *       a.rows[0].chrStart() = 1002
 *       a.rows[0].chrEnd() = 1037
 *
 * Coordinates are LEFT TO RIGHT, for reverse strand alignments, they refer to the REVERSE COMPLEMENT of the sequence.
 * 
 * @author Mario Stanke
 */
class Alignment {
public:
    Alignment(size_t k) : aliLen(0), rows(k, NULL) {} // initialize with NULL, which stand for missing AlignmentRows
    ~Alignment(){
	for (unsigned i=0; i<rows.size(); i++) 
	    delete rows[i];	
    }
    friend bool mergeable (Alignment *a1, Alignment *a2, int maxGapLen, float mergeableFrac, bool strong);
    friend ostream& operator<< (ostream& strm, const Alignment &a);
    void printTextGraph(ostream& strm);
    void merge(Alignment *other, const MsaSignature *sig = NULL); // append 'other' Alignment to this
    friend Alignment* mergeAliList(list<Alignment*> alis,  const MsaSignature *sig);
    friend void capAliSize(list<Alignment*> &alis, int maxRange);
    friend void reduceOvlpRanges(list<Alignment*> &alis, size_t maxCov, float covPen);
    int maxRange(); // chromosomal range, maximized over rows
    int numRows() const { return rows.size(); }
    int numFilledRows() const; // number of nonempty rows
    int getCumFragLen() const; // total length of all fragments
    int getMaxSeqIdLen() const;
    friend int medianChrStartEndDiff(Alignment *a1, Alignment *a2);
    string getSignature() const;
    int numFitting(const MsaSignature *sig) const;
    void shiftAliPositions(int offset);
    void pack(); // merge pairs of fragments without gap into one fragment making the alignment representation more compact

    #ifdef TESTING
    void convertAlignment(int r, int start, int end);
    #endif

public: // should rather be private
    int aliLen; // all aligned sequences are this long when gaps are included
    vector<AlignmentRow*> rows;

private:
    /*
    *   added by Giovanna Migliorelli 14.05.2020 
    *   code responsible for serialization and default constructor required in serialization
    *   aknowledgement : https://www.boost.org/doc/libs/1_70_0/libs/serialization/doc/tutorial.html
    */

    #ifdef TESTING
    friend class boost::serialization::access;
    template<class Archive>
    void serialize(Archive & ar, const unsigned int version)
    {
        ar & aliLen;
        ar & rows;
    }
    
    Alignment(){};
    #endif
};

int medianChrStartEndDiff(Alignment *a1, Alignment *a2);

// sorting operator, with respect to a given species index
struct SortCriterion {
    SortCriterion(size_t speciesIdx) : s(speciesIdx) {};
    bool operator() (Alignment* const a1, Alignment* const a2){
	// alignments, where row s is missing come last
	if (a2->rows[s] == NULL)
	    return true;
	if (a1->rows[s] == NULL)
	    return false;
	if (a1->rows[s]->seqID < a2->rows[s]->seqID)
	    return true;
	if (a1->rows[s]->seqID > a2->rows[s]->seqID)
	    return false;
	// same sequence, compare chromosomal start positions
	return (a1->rows[s]->chrStart() < a2->rows[s]->chrStart());
    }
    size_t s;
};


// sorting operator, with respect to a given species index, index version
// for sorting a vector or list of indices to another vector that holds the Alignments
struct IdxSortCriterion {
    IdxSortCriterion(vector<Alignment*> const a_, size_t speciesIdx) : s(speciesIdx), a(a_) {};
    bool operator() (int i, int j){
	// alignments, where row s is missing come last
	if (a[j]->rows[s] == NULL && a[i]->rows[s]) // this conjunction makes sorting stable where the sequence is missing
	    return true;
	if (a[i]->rows[s] == NULL)
	    return false;
	if (a[i]->rows[s]->seqID < a[j]->rows[s]->seqID)
	    return true;
	if (a[i]->rows[s]->seqID > a[j]->rows[s]->seqID)
	    return false;
	// same sequence, compare chromosomal start positions
	return (a[i]->rows[s]->chrStart() < a[j]->rows[s]->chrStart());
    }
    size_t s;
    vector<Alignment*> const &a;
};

/*
 * b1 and b2 can be merged in that order because they are very similar and right next to each other.
 * In at least 'mergeableFrac' of the alignment rows the aligned sequenes are
 * strong=false: - refer to the same terget sequence, are on the same strand and satisfy 0 <= gaplen <= maxGapLen
 *                 if both are present
 * strong=true:  - refer to the same terget sequence, are on the same strand and satisfy 0 <= gaplen <= maxGapLen
 *                 if at least one is present
 */
bool mergeable (Alignment *b1, Alignment *b2, int maxGapLen, float mergeableFrac, bool strong);

inline bool isGap(char c){
    return (c == '-' || c == '.');
}

/**
 * @brief MsaSignature is a summary of the seqId/strand combinations of the alignment
 *
 * @author Mario Stanke
 */
class MsaSignature {
public:
    string sigstr() const{
	string str;
	for (unsigned s = 0; s < sigrows.size(); ++s)
	    if (sigrows[s] != "")
		str += itoa(s) + ":" + sigrows[s];
	return str;
    }
    vector<string> sigrows; // each row contains seqID and strand, e.g. chr21+
    int numAli;
    int sumAliLen;
    int sumCumFragLen;
    int depth;
    int color;
    bool operator< (const MsaSignature &other) const {
	return (sumCumFragLen > other.sumCumFragLen);
	// before: sorting by 1) number of species and 2) sumAliLen
	// return (depth > other.depth || (depth == other.depth && sumAliLen > other.sumAliLen));
    }
    bool fits(const Alignment &a, size_t s) const {
	return a.rows[s] && (sigrows[s] == a.rows[s]->seqID + strandChar(a.rows[s]->strand));
    }
    static int maxSigStrLen;
};

bool cmpSigPtr(const MsaSignature *s1, const MsaSignature *s2);

typedef pair<size_t, int> BoundaryFragment; // (s, chrPos), where s= species index

/**
 * 
 * @author Mario Stanke
 */
class CompareBoundaryFragment {
public:
    bool operator()(BoundaryFragment& bf1, BoundaryFragment& bf2) {
	return bf1.second > bf2.second; // => sort by increasing chromosomal position
    }
};


/**
 * MsaInsertion, specifies a string to be inserted into a StringAlignment
 */
class MsaInsertion {
public:
    MsaInsertion(size_t s, size_t insertpos, string insert): s(s), insertpos(insertpos), insert(insert){ }
    MsaInsertion() { }
    /**
     * Insertion sorting criterion: Do them right-to-left.
     * Do insertions at the same position from long to short.
     */
    bool operator<(const MsaInsertion& other) const {
        if (insertpos > other.insertpos)
            return true;
        if (insertpos < other.insertpos)
            return false;
        if (insert.length() > other.insert.length())
            return true;
        if (insert.length() < other.insert.length())
            return false;
        if (s < other.s)
            return true;
        return false;
    }
    size_t s; //! index of species
    size_t insertpos; //! at which position to insert
    string insert; //! what to insert
};


/**
 * @brief global multiple sequence alignment in (standard) string representation
 * @author Mario Stanke
 */
class StringAlignment {
public:
    StringAlignment(size_t numrows) : rows(numrows, ""), k(numrows), len(0) {} // initialize with empty strings
    ~StringAlignment(){len = 0;}

    /**
     * Insert unaligned sequences into an alignment.
     * The insertion positions all refer to the MSA before insertions.
     * Example:
     *
     * a-tt-g   insert((0, 0, "ggg"), (2, 4, "gaga"),    ggga-tt-----g
     * --ctgg          (1, 4, "ttt"))                    -----ctttt-gg
     * a-ttgc   ===================================>     ---a-ttgagagc
     *
     * @param[in] insList a list of inserts at certain positions
     */
    void insert(std::list<MsaInsertion> &insList, int maxInsertLen = INT_MAX);

    /**
     * Remove all columns which in which each row that is
     * present has a gap. Example
     * 
     * ggga-tt-----g           gggatt-----g
     * -----ctttt-gg   ====>   ----ctttt-gg
     * ---a-ttgagagc           ---attgagagc
     *
     * @param[in] insList a list of inserts at certain positions
     */
    size_t removeGapOnlyCols();

    bool isGapOnlyCol(size_t col){
        for (size_t s = 0; s < k; ++s)
            if (!rows[s].empty() && rows[s].at(col) != '-')
                return false;
        return true;
    }

    /**
     * Ensure all rows have the same length and compute and set this length.
     */
    void computeLen(){
        size_t m = 0, rowlen;
        for (size_t s = 0; s < k; ++s){
            rowlen = rows[s].length();
            if (rowlen > 0){
                if (m == 0)
                    m = rowlen;
                else if (m != rowlen)
                    throw length_error("StringAlignment with rows of differing lengths");
            }
        }
        len = m;
    }

    friend ostream& operator<< (ostream& strm, const StringAlignment &msa);

// data members
    vector<string> rows;
    size_t k;
    size_t len;
};


#endif  // _ALIGNMENT