File: GappedXdropAligner.hh

package info (click to toggle)
last-align 1651-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 14,692 kB
  • sloc: cpp: 44,419; python: 5,217; ansic: 1,938; sh: 710; makefile: 457
file content (409 lines) | stat: -rw-r--r-- 14,674 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
// Copyright 2011, 2012, 2013 Martin C. Frith

// These routines extend an alignment in a given direction (forward or
// reverse) from given start points in two sequences.

// The start points point at the first positions we'll try to align.

// To use: first call "align", which calculates the alignment but only
// returns its score.  To get the actual alignment, call
// "getNextChunk" to get each gapless chunk.

// The sequences had better end with sentinels: the score for matching
// a sentinel with anything should be -INF.

// The gap parameters correspond to "generalized affine gap costs"
// (Proteins 1998 32(1):88-96).
// gapExistenceCost = a
// gapExtensionCost = b
// gapUnalignedCost = c
// When c >= a + 2b, it reduces to standard affine gap costs:
// gap cost = gapExistenceCost + gapExtensionCost * (gap length).

// The insertion and deletion costs may differ.  Typically:
// delExistenceCost = insExistenceCost, and
// delExtensionCost = insExtensionCost.

// The algorithm proceeds antidiagonal-by-antidiagonal, similarly to
// Section 2 in J Comput Biol. 2000 7(1-2):203-14.  It does not allow
// the score to drop by more than maxScoreDrop below the highest score
// in any previous antidiagonal.

// If "globality" is 0, local alignment is performed: this finds the
// highest-scoring alignment ending anywhere.  Otherwise, overlap
// alignment is performed: this seeks the highest-scoring alignment
// ending at the end of either sequence.  If overlap alignment reaches
// the end of neither sequence (due to maxScoreDrop), -INF is
// returned.

// The parameter maxMatchScore should be the highest possible score
// for matching 2 letters.  This parameter is not actually necessary,
// but it provides some optimization opportunities.  If you give it a
// too-high value, the results will not change, but the run time may
// increase.

#ifndef GAPPED_XDROP_ALIGNER_HH
#define GAPPED_XDROP_ALIGNER_HH

#include "mcf_big_seq.hh"
#include "mcf_contiguous_queue.hh"
#include "mcf_reverse_queue.hh"
#include "mcf_gap_costs.hh"
#include "mcf_simd.hh"
#include "ScoreMatrixRow.hh"

#include <iosfwd>
#include <stddef.h>  // size_t
#include <vector>

namespace cbrc {

using namespace mcf;

typedef unsigned char uchar;
typedef const int *const_int_ptr;

typedef int Score;
typedef uchar TinyScore;

class TwoQualityScoreMatrix;

const int xdropPadLen = simdBytes;

const int droppedTinyScore = UCHAR_MAX;

class GappedXdropAligner {
 public:
  int align(BigPtr seq1,  // start point in the 1st sequence
            const uchar *seq2,  // start point in the 2nd sequence
            bool isForward,  // forward or reverse extension?
	    int globality,
            const ScoreMatrixRow *scorer,  // the substitution score matrix
	    int delExistenceCost,
	    int delExtensionCost,
	    int insExistenceCost,
	    int insExtensionCost,
            int gapUnalignedCost,
	    bool isAffine,
            int maxScoreDrop,
            int maxMatchScore);

  // Like "align", but it aligns a sequence to a PSSM.
  int alignPssm(BigPtr seq,
                const ScoreMatrixRow *pssm,
                bool isForward,
		int globality,
		int delExistenceCost,
		int delExtensionCost,
		int insExistenceCost,
		int insExtensionCost,
                int gapUnalignedCost,
		bool isAffine,
                int maxScoreDrop,
                int maxMatchScore);

  // Like "align", but both sequences have quality scores.
  int align2qual(const uchar *seq1,
                 const uchar *qual1,
                 const uchar *seq2,
                 const uchar *qual2,
                 bool isForward,
		 int globality,
                 const TwoQualityScoreMatrix &scorer,
		 int delExistenceCost,
		 int delExtensionCost,
		 int insExistenceCost,
		 int insExtensionCost,
                 int gapUnalignedCost,
		 bool isAffine,
                 int maxScoreDrop,
                 int maxMatchScore);

  // Like "align", but maybe faster for DNA.  Assumes that
  // scorer[i<4][j<4] fits in signed char.  Each sequence element is
  // first mapped through "toUnmasked".  If an unmasked sequence
  // element >= 4 appears, alignDna falls back to a slower algorithm.
  int alignDna(BigPtr seq1,
	       const uchar *seq2,
	       bool isForward,
	       const ScoreMatrixRow *scorer,
	       int delExistenceCost,
	       int delExtensionCost,
	       int insExistenceCost,
	       int insExtensionCost,
	       int maxScoreDrop,
	       int maxMatchScore,
	       const uchar *toUnmasked);

  // Call this repeatedly to get each gapless chunk of the alignment.
  // The chunks are returned in far-to-near order.  The chunk's end
  // coordinates in each sequence (relative to the start of extension)
  // and length are returned in the first 3 parameters.  If there are
  // no more chunks, the 3 parameters are unchanged and "false" is
  // returned.
  bool getNextChunk(size_t &end1,
                    size_t &end2,
                    size_t &length,
		    int delExistenceCost,
		    int delExtensionCost,
		    int insExistenceCost,
		    int insExtensionCost,
                    int gapUnalignedCost);

  // After "alignDna", must use this instead of "getNextChunk"
  bool getNextChunkDna(size_t &end1,
		       size_t &end2,
		       size_t &length,
		       int delExistenceCost,
		       int delExtensionCost,
		       int insExistenceCost,
		       int insExtensionCost);

  // Like "align", but it aligns a protein sequence to a DNA sequence.
  // The DNA should be provided as 3 protein sequences, one for each
  // reading frame.  seq2frame0 is the in-frame start point.
  // seq2frame1 is shifted by 1 in the direction of alignment
  // extension.  seq2frame2 is shifted by 1 in the opposite direction.
  // The algorithm is "3-frame alignment", as described in J Comput
  // Biol. 1997 4(3):339-49.
  int align3(const uchar *seq1,
             const uchar *seq2frame0,
             const uchar *seq2frame1,  // the +1 frame
             const uchar *seq2frame2,  // the -1 frame
             bool isForward,
             const ScoreMatrixRow *scorer,
             int gapExistenceCost,
             int gapExtensionCost,
             int gapUnalignedCost,
             int frameshiftCost,
             int maxScoreDrop,
             int maxMatchScore);

  // Like "align3", but it aligns a protein sequence to a 3-frame PSSM.
  int align3pssm(const uchar *seq,
                 const ScoreMatrixRow *pssmFrame0,
                 const ScoreMatrixRow *pssmFrame1,
                 const ScoreMatrixRow *pssmFrame2,
                 bool isForward,
                 int gapExistenceCost,
                 int gapExtensionCost,
                 int gapUnalignedCost,
                 int frameshiftCost,
                 int maxScoreDrop,
                 int maxMatchScore);

  // Use this version of getNextChunk for protein-versus-DNA
  // alignment.  The end1 parameter receives a protein coordinate, and
  // end2 receives a DNA coordinate relative to the in-frame start.
  // The length parameter receives the number of residues/codons, not
  // bases.
  bool getNextChunk3(size_t &end1,
                     size_t &end2,
                     size_t &length,
                     int gapExistenceCost,
                     int gapExtensionCost,
                     int gapUnalignedCost,
                     int frameshiftCost);

  // Like "align3", but does "new style" DNA-protein alignment.
  int alignFrame(const uchar *protein,
		 const uchar *frame0,  // translated DNA,  0 frame
		 const uchar *frame1,  // translated DNA, +1 frame
		 const uchar *frame2,  // translated DNA, +2 frame
		 bool isForward,
		 const ScoreMatrixRow *scorer,
		 const GapCosts &gapCosts,
		 int maxScoreDrop);

  // Use this after alignFrame.  The cost of the unaligned region
  // between this chunk and the next chunk is returned in the 4th
  // parameter.
  bool getNextChunkFrame(size_t &end1,
			 size_t &end2,
			 size_t &length,
			 int &costOfNearerGap,
			 const GapCosts &gapCosts);

  void writeShape(std::ostream &out) const;

  // The next few functions are for use by Centroid.  If the Centroid
  // code gets updated, it might make sense to change these functions too.

  // The number of antidiagonals, excluding dummy ones at the beginning.
  size_t numAntidiagonals() const
  { return numOfAntidiagonals; }

  size_t numCellsAndPads(size_t antidiagonal) const
  { return scoreEndsAndOrigins[2 * (antidiagonal + 3)]
      -    scoreEndsAndOrigins[2 * (antidiagonal + 2)]; }

  size_t scoreEndIndex(size_t antidiagonal) const
  { return scoreEndsAndOrigins[2 * (antidiagonal + 2)]; }

  // start of the x-drop region (i.e. number of skipped seq1 letters
  // before the x-drop region) for this antidiagonal
  size_t seq1start(size_t antidiagonal) const {
    size_t a = 2 * (antidiagonal + 2);
    return scoreEndsAndOrigins[a] + xdropPadLen - scoreEndsAndOrigins[a + 1];
  }

  size_t scoreOrigin(size_t antidiagonalIncludingDummies) const
  { return scoreEndsAndOrigins[2 * antidiagonalIncludingDummies + 1]; }

  // The index in the score vectors, of the previous "horizontal" cell.
  size_t hori(size_t antidiagonal, size_t seq1coordinate) const
  { return scoreOrigin(antidiagonal + 1) + seq1coordinate - 1; }

  // The index in the score vectors, of the previous "vertical" cell.
  size_t vert(size_t antidiagonal, size_t seq1coordinate) const
  { return scoreOrigin(antidiagonal + 1) + seq1coordinate; }

  // The index in the score vectors, of the previous "diagonal" cell.
  size_t diag(size_t antidiagonal, size_t seq1coordinate) const
  { return scoreOrigin(antidiagonal) + seq1coordinate - 1; }

  // The index in the score vectors, of the previous in-frame horizontal cell.
  size_t hori3(size_t antidiagonal, size_t seq1coordinate) const
  { return scoreOrigin(antidiagonal - 3) + seq1coordinate - 1; }

  // The index in the score vectors, of the previous in-frame vertical cell.
  size_t vert3(size_t antidiagonal, size_t seq1coordinate) const
  { return scoreOrigin(antidiagonal - 3) + seq1coordinate; }

  // The index in the score vectors, of the previous in-frame diagonal cell.
  size_t diag3(size_t antidiagonal, size_t seq1coordinate) const
  { return scoreOrigin(antidiagonal - 6) + seq1coordinate - 1; }

 private:
  std::vector<Score> xScores;  // best score ending with aligned letters
  std::vector<Score> yScores;  // best score ending with insertion in seq1
  std::vector<Score> zScores;  // best score ending with insertion in seq2

  std::vector<size_t> scoreEndsAndOrigins;  // data for each antidiagonal
  size_t numOfAntidiagonals;

  ContiguousQueue<const int *> pssmQueue;
  ContiguousQueue<uchar> seq1queue;
  ReverseQueue<uchar> seq2queue;

  // Our position during the trace-back:
  size_t bestAntidiagonal;
  size_t bestSeq1position;

  void resizeScoresIfSmaller(size_t size) {
    if (xScores.size() < size) {
      xScores.resize(size);
      yScores.resize(size);
      zScores.resize(size);
    }
  }

  void initAntidiagonal(size_t antidiagonalIncludingDummies,
			size_t seq1end, size_t thisEnd, int numCells) {
    const SimdInt mNegInf = simdFill(-INF);
    size_t nextEnd = thisEnd + xdropPadLen + numCells;

    size_t a = 2 * (antidiagonalIncludingDummies + 1);
    if (scoreEndsAndOrigins.size() <= a) {
      scoreEndsAndOrigins.resize(a + 1);
    }
    scoreEndsAndOrigins[a - 1] = nextEnd - seq1end;
    scoreEndsAndOrigins[a] = nextEnd;

    resizeScoresIfSmaller(nextEnd + (simdLen-1));
    for (int i = 0; i < xdropPadLen; i += simdLen) {
      simdStore(&xScores[thisEnd + i], mNegInf);
      simdStore(&yScores[thisEnd + i], mNegInf);
      simdStore(&zScores[thisEnd + i], mNegInf);
    }
  }

  // Puts 2 "dummy" antidiagonals at the start, so that we can safely
  // look-back from subsequent antidiagonals
  void init() {
    initAntidiagonal(0, 0, 0, 0);
    initAntidiagonal(1, 0, xdropPadLen, 0);
    xScores[xdropPadLen - 1] = 0;
    bestAntidiagonal = 0;
  }

  void initAntidiagonal3(size_t antidiagonalIncludingDummies,
			 size_t seq1end, size_t scoreEnd) {
    size_t a = 2 * (antidiagonalIncludingDummies + 1);
    if (scoreEndsAndOrigins.size() <= a) {
      scoreEndsAndOrigins.resize(a + 1);
    }
    scoreEndsAndOrigins[a - 1] = scoreEnd - seq1end;
    scoreEndsAndOrigins[a] = scoreEnd;
    resizeScoresIfSmaller(scoreEnd + (simdLen-1));
  }

  void updateBest(int &bestScore, int score, size_t antidiagonal,
                  const Score *x0, const Score *x0ori);

  void calcBestSeq1position(int bestScore, size_t numOfDummyAntidiagonals) {
    size_t seq1beg = seq1start(bestAntidiagonal + numOfDummyAntidiagonals - 2);
    const Score *x2 = &xScores[diag(bestAntidiagonal, seq1beg)];
    const Score *x2beg = x2;
    while (*x2 != bestScore) ++x2;
    bestSeq1position = x2 - x2beg + seq1beg;
  }

  void init3();
  void initFrame();

  // Everything below here is for alignDna & getNextChunkDna
  std::vector<TinyScore> xTinyScores;
  std::vector<TinyScore> yTinyScores;
  std::vector<TinyScore> zTinyScores;

  std::vector<int> scoreRises;  // increase of best score, per antidiagonal

  void resizeTinyScoresIfSmaller(size_t size) {
    if (xTinyScores.size() < size) {
      xTinyScores.resize(size);
      yTinyScores.resize(size);
      zTinyScores.resize(size);
    }
  }

  void initAntidiagonalTiny(size_t antidiagonalIncludingDummies,
			    size_t seq1end, size_t thisEnd, int numCells) {
    const SimdUint1 mNegInf = simdOnes1();
    size_t nextEnd = thisEnd + xdropPadLen + numCells;

    size_t a = 2 * (antidiagonalIncludingDummies + 1);
    if (scoreRises.size() <= antidiagonalIncludingDummies) {
      scoreEndsAndOrigins.resize(a + 1);
      scoreRises.resize(antidiagonalIncludingDummies + 1);
    }
    scoreEndsAndOrigins[a - 1] = nextEnd - seq1end;
    scoreEndsAndOrigins[a] = nextEnd;

    resizeTinyScoresIfSmaller(nextEnd + (simdBytes-1));
    simdStore1(&xTinyScores[thisEnd], mNegInf);
    simdStore1(&yTinyScores[thisEnd], mNegInf);
    simdStore1(&zTinyScores[thisEnd], mNegInf);
  }

  void initTiny(int scoreOffset) {
    initAntidiagonalTiny(0, 0, 0, 0);
    initAntidiagonalTiny(1, 0, xdropPadLen, 0);
    xTinyScores[xdropPadLen - 1] = scoreOffset;
    bestAntidiagonal = 2;
  }

  void calcBestSeq1positionTiny(int scoreOffset) {
    size_t seq1beg = seq1start(bestAntidiagonal);
    const TinyScore *x2 = &xTinyScores[diag(bestAntidiagonal, seq1beg)];
    const TinyScore *x2beg = x2;
    int target = scoreOffset - scoreRises[bestAntidiagonal] -
      scoreRises[bestAntidiagonal + 1] - scoreRises[bestAntidiagonal + 2];
    while (*x2 != target) ++x2;
    bestSeq1position = x2 - x2beg + seq1beg;
  }
};

}

#endif