File: pp_fastBlockSearcher.hh

package info (click to toggle)
augustus 3.2.3%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 289,676 kB
  • sloc: cpp: 48,711; perl: 13,339; ansic: 1,251; makefile: 859; sh: 58
file content (435 lines) | stat: -rw-r--r-- 13,894 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
/*****************************************************************************\
 * Filename : pp_fastBlockSearcher.hh
 * Author   : Oliver Keller
 * Project  : Gene Prediction with Protein Family Patterns
 *
 * Description: A fast block search class to determine sequence parts
 *              relevant for the ppx extension
 *
 *
 * Date       |   Author              |  Changes
 *------------|-----------------------|---------------------------------
 * 09.09.09   | Oliver Keller         | creation of the file
 \******************************************************************************/

#ifndef __PP_BLOCKSEARCHER_HH
#define __PP_BLOCKSEARCHER_HH

// project includes
#include "pp_profile.hh"

// standard C/C++ includes
#include <deque>
#include <vector>
#include <map>
#include <ostream>
#include <iostream>
#include <iomanip>
#include <cstdlib>

using namespace std;

namespace PP {
    
    /*
     * FsSeedCollection: Contains, for each triplet of residues, a 
     *                   set of positions that this tuple is likely
     *                   to appear as part of a block.
     *                   The decision whether a triplet is considered
     *                   a seed for a certain position depends on two
     *                   parameters: 
     *                   - the maximum number of seeds (chosen
     *                     as a subset of 8000 possible triplets) at a
     *                     certain position, reversely proportional to
     *                     the length of the block (so that the total
     *                     number of seeds used for a block is independent
     *                     of its length)
     *                   - if, according to the profile, a high probability
     *                     that a given triplet is a seed can be reached with
     *                     a smaller number of seeds, this reduced number is
     *                     used.
     *                   Given this number, the most likely (i.e. highest
     *                   scoring) triplets are taken into the collection of 
     *                   seeds.
     *
     * The use of a seed collection has the advantage that it can be calculated
     * once and used for filtering during the sequence analysis just by looking
     * at each 9-tuples of nucleotides in the sequence without calculating scores
     * at each point. Furthermore, the decision whether a 9-tuple translates to 
     * a seed can be made simultaneously for all offsets of all blocks at once.
     *
     */
    
    class FsSeedCollection {
//	multimap<string, Position> seeds;
	vector<int> vindex;
	vector<Position> vseeds;
	
	static int expSeedCount;   // maximal number of seeds per block
	static double maxCoverage; // position-specific coverage considered sufficient
//	multimap<string, Position>::iterator current;  // used internally in getNext()
	int current;
	int currend;

#ifdef DEBUG	    
	static double blockP(const PP::Block& blk, string s, int from) {
	    double result = 1;
	    for (int i=0;  i<s.length(); i++) 
		result *= blk[from+i][GeneticCode::get_aa_from_symbol(s[i])];
	    return result;
	}
	static double blockQ(const Block& blk, string s, int from) {
	    double result = 1;
	    for (int i=0;  i<s.length(); i++) 
		result *= blk[from+i].Q(GeneticCode::get_aa_from_symbol(s[i])).doubleValue();
	    return result;
	}
#endif

	static double blockP(const PP::Block& blk, int aas, int from) {

	    double result = 1;
	    for (int i=from+2;  i>=from; i--) {
		result *= blk[i][aas % 20];
		aas /= 20;
	    }
	    return result;
	}
	static double blockQ(const Block& blk, int aas, int from) {
	    double result = 1;
	    for (int i=from+2;  i>=from; i--) {
		result *= blk[i].Q(aas % 20).doubleValue();
		aas /= 20;
	    }
	    return result;
	}

    public:
	// create a seed collection
	FsSeedCollection(const PP::Profile& prfl) :
	    vindex(1,0), current(0), currend(0) 
	{
	    multimap<int, Position> seeds;
	    multimap<int, Position>::iterator it;


	    for (int b = 0; b<prfl.blockCount(); b++) {
		const Block& blk = prfl[b];
#ifdef VERBOSE_SEEDS
		cerr << "Determining seeds for block " << b << " (" << blk.id << ", " << blk.size() << " columns).\n";
#endif
		int maxcount = expSeedCount / blk.size();

		for (int i=0; i<=blk.size()-3; i++) {
		    multimap<double,int> currtriples;
		    string s(3, 0);
		    int val=0;
		    for (int t1 = 0; t1<NUM_AA; t1++) {
			s[0] = GeneticCode::aa_symbols[t1];
			for (int t2 = 0; t2<NUM_AA; t2++) {
			    s[1] = GeneticCode::aa_symbols[t2];
			    for (int t3 = 0; t3<NUM_AA; t3++) {
				s[2] = GeneticCode::aa_symbols[t3];
				double q_val = blockQ(blk, val, i);
				currtriples.insert(make_pair(q_val, val));
				val++;
			    }
			}
		    }
		    int count = 0;
		    double p = 0.0;
		    map<double,int>::reverse_iterator it = currtriples.rbegin();
		    for (count = 0; count < maxcount && p < maxCoverage; count++) {
			p += blockP(blk, it->second, i);
			seeds.insert(make_pair(it->second, Position(b,i)));
			++it;
		    }
#ifdef DEBUG
		    cerr << "[" << i << ".." << (i+2) << "]: coverage=" << p << "   minBlockQ=" << (--it)->first << " count=" << count << endl;
#endif
		}
	    }
	    vseeds.reserve(seeds.size());
	    vindex.reserve(8001);
	    for (it=seeds.begin(); it != seeds.end(); ++it) {
		while (vindex.size() <= it->first) 
		    vindex.push_back(vseeds.size());
		vseeds.push_back(it->second);
	    }
	    while (vindex.size()<=8000) 
		vindex.push_back(vseeds.size());
	}
	
	// return positions for which s is a seed (or null if there are none)
	// the first
	void setFirst(int patt) {
	    current = vindex[patt];
	    currend = vindex[patt+1];
	}
	bool hasNext() {
	    return current < currend;
	}
	// the next
	PP::Position& getNext() {
	    return vseeds[current++];
	}
	
	int size() {
	    return vseeds.size();
	}
    }; // PP::FsSeedCollection

    /*
     * FsHitType
     */
    struct FsHitType;
    typedef deque<FsHitType*> HitQueue;

    struct FsHitType {
	FsHitType(int p, int b, bool r, PartScoreType sc) :
	    pos(p),
	    head(this),
	    blockNo(b),
	    reverse(r),
	    score(sc.score), 
	    blockfrom(sc.from),
	    blockto(sc.to),
	    pathScore(sc.score), 
	    predecessor(0) {}
	int pos;
	int start() {
	    return head->pos;
	}
	FsHitType* head;
	int blockNo;
	bool reverse;
	double score;
	int blockfrom;
	int blockto;
	double pathScore;
	FsHitType* predecessor;
	void linkTo(HitQueue& queue) {
	    while(!queue.empty() && queue.front()->pos < pos - maxIntronLen) 
		queue.pop_front();
	    if (!queue.empty()) {
		predecessor = queue.front();
		head = predecessor->head;
		pathScore = predecessor->pathScore - intronMalus*(pos - predecessor->pos) + score;
	    }
	}
	void pushOn(HitQueue& queue) {
	    while (!queue.empty()) {
		FsHitType* ht = queue.back();
		if (ht->pathScore < pathScore + intronMalus*(pos - ht->pos))
		    queue.pop_back();
		else
		    break;
	    }
	    queue.push_back(this);
	}
	static int maxIntronLen;
	static double intronMalus;
    }; // PP::FsHitType

    /*
     * FsHitCollection
     */
    class FsHitCollection {
	vector<HitQueue> pendingHits[2];
	vector<FsHitType*> finalResult;
	vector<FsHitType*> allHits;
	int size;
    
    public:
	FsHitCollection(int n) : size(n) {
	    pendingHits[0].resize(n);
	    pendingHits[1].resize(n);
	}
	~FsHitCollection() {
	    for (int i=0; i<allHits.size(); i++) 
		delete allHits[i];
	}
	int allHitCount() {
	    return allHits.size();
	}
	int resultCount() {
	    return finalResult.size();
	}
	   
	void newHit(FsHitType*);
	void storeBestResults(multimap<double, FsHitType>&, int mincount, double threshold);
    }; // PP::FsHitCollection



    /*
     * PP::CandidateCollection: collects "candidates" for block hits
     * 
     * Each substring of the input sequence is a potential candidate. 
     * The collection is initialized with the profile (sequence of blocks)
     * and the seed collection (computed once in advance) containing the
     * seeds (see above)
     * 
     */

    // this type represents the results for a particular part of the
    // input sequence when translated and aligned to a block
    // - how many residues of the translation are part of a seed 
    //   (with respect to the corresponding block position)
    // - which is the rightmost residue that is part of seed
    struct ScoringCandidate {
	ScoringCandidate() : inSeedCount(0), lastOffset(-10) {}
	int inSeedCount;
	int lastOffset;
    };

    struct twomer {
	signed char v[2];
	twomer() { v[0]=v[1]=-1; }
	signed char& operator[] (int n) {
	    return v[n];
	}
	int val() { 
	    return (v[0]<0 || v[1]<0) ? -1 : (v[0]*20 + v[1]);
	}
    };
       

    struct CandidateCollection {
	typedef deque<ScoringCandidate> SeedCountsQueue;
	// we have, for both strands and each block a queue
	// of scoring candidates; the index for the queue is
	// given by the distance of the block match start to
	// the last observed nucleotide
	vector<SeedCountsQueue> theCandidates[2];
	FsSeedCollection& seedColl;
	const Profile& prfl;
	
	vector<int> maxseedcount; // maximum of inSeedCounts
	vector<int> seedhitsizes; // sum of inSeedCounts 
	vector<int> candcounts;   // for how many locations bestPartialLogScore is called
	vector<int> realhitcounts; // ...and successful
	
	// pending 2-mers of amino acids, one waiting for each frame/strand
	deque<twomer> patterns;


	CandidateCollection(const Profile& p, FsSeedCollection& coll) :
	    seedColl(coll),
	    prfl(p),
	    maxseedcount(p.blockCount(), 0),
	    seedhitsizes(p.blockCount(), 0),
	    candcounts(p.blockCount(), 0),
	    realhitcounts(p.blockCount(), 0),
	    patterns(6, twomer())
	{
	    // start the computation with a queue of emtpy scoring candidates
	    for (int b=0; b<p.blockCount(); b++) 
		theCandidates[0].push_back(SeedCountsQueue(p.blockSize(b) * 3 -8, 
							ScoringCandidate()));
	    theCandidates[1] = theCandidates[0];
	}
	    
	// put SeedsToEntries: given a 3-aminoacid-motif, add it to those scoring
	// candidates for which it represents a seed at the corresponding position
	void putSeedsToEntries(int currpat, bool reverse) {
	    seedColl.setFirst(currpat);
	    while (seedColl.hasNext()) {
		PP::Position& pos = seedColl.getNext();
		int index = (reverse ? prfl.blockSize(pos.b) - pos.i - 3 : pos.i) * 3;
		    
		ScoringCandidate& pf = theCandidates[reverse][pos.b][index];
#ifdef DEBUG
		if (pf.lastOffset >= 0 && (pf.lastOffset == pos.i || reverse == (pf.lastOffset < pos.i)))
		    cerr << "ERROR: last offset (" << pf.lastOffset << ") " << (reverse ? "<" : ">") 
			 << " current offset (" << pos.i << ") !" << endl;
#endif
		int range = abs(pf.lastOffset-pos.i);
		pf.inSeedCount += range < 3 ? range : 3;
		pf.lastOffset = pos.i;
	    }
	}

	// pushSeeds: move forward the sequence: for each strand, the next pending
	// pair of aminoacids is concatenated to the next residue found and putSeedsToEntries
	// is called with the resulting triple; and new pairs are added to the pending queue
	void pushSeeds(int t) {
	    signed char aa, aab;
	    twomer nextPattern = patterns.back();
	    patterns.pop_back();
	    twomer revPattern = patterns.back();
	    patterns.pop_back();
	    try {
		aa = GeneticCode::map[Seq2Int(3)(PP::DNA::sequence + t-2)];
		aab = GeneticCode::map[Seq2Int(3).rc(PP::DNA::sequence + t-2)];
		int forval = nextPattern.val()*20;
		int revval = revPattern.val();
		if (aa >= 0 && forval >= 0) 
		    putSeedsToEntries(forval + aa, false);
		if (aab >= 0 && revval >= 0)
		    putSeedsToEntries(revval + 400*aab, true);
	    } catch (InvalidNucleotideError e) {
		aa = aab = -2;
	    }
	    nextPattern[0] = nextPattern[1];
	    nextPattern[1] = aa;
	    patterns.push_front(nextPattern);
	    revPattern[1] = revPattern[0];
	    revPattern[0] = aab;
	    patterns.push_front(revPattern);
	}

	// takeCandidates: move forward the candidate queues (one for each block and strand)
	// if a scoring candidate is complete (fully evaluated) and its seed count
	// is above a threshold (here: around 25% of the block width), call it a hit candidate,
	// compute log scores for it, and if there is one non-negative, call it a hit, and
	// add it to the FsHitCollection target
	void takeCandidates(int t, FsHitCollection& target, bool reverse) {
	    for (int b=0; b < prfl.blockCount(); b++) {
		theCandidates[reverse][b].push_front(ScoringCandidate());
		int hc = theCandidates[reverse][b].back().inSeedCount;
		theCandidates[reverse][b].pop_back();
		seedhitsizes[b] += hc;
		if (hc < 0) 
 		    cerr << "ERROR: seedhitcount < 0!\n";
		if (maxseedcount[b] < hc)
		    maxseedcount[b] = hc;
		if (hc > 4 + prfl.blockSize(b)/4) {
		    int blstart = t - prfl.blockSize(b) * 3;
		    if (blstart < 0)
			continue;
		    candcounts[b]++;
		    PP::PartScoreType pt;
		    prfl[b].bestPartialLogScore(reverse, blstart, pt);
		    int width = pt.to - pt.from;
		    if (pt.score >= 0 && 
			width >= MIN_BLOCKSIZE && 
			width >= 0.3*prfl.blockSize(b)) {
			FsHitType* newHit = new FsHitType(blstart, b, reverse, pt);
			target.newHit(newHit);
			realhitcounts[b]++;
		    }
		}
	    }
	}

	// this is the entry point to the candidate collection
	// calls takeCandidates and pushSeeds
	void proceed(int t, FsHitCollection& target) {
	    takeCandidates(t, target, false);
	    takeCandidates(t, target, true);
	    pushSeeds(t);
	}

	void debugging_output() {
	    for (int b=0; b<prfl.blockCount(); b++) {
		cerr << "Block " << b << " (" << prfl[b].id << ")" << endl;
		cerr << "    len: " << prfl.blockSize(b) << endl;
		cerr << "   av c: " << double(seedhitsizes[b])/DNA::len;
		cerr << "  max c: " << maxseedcount[b] << endl;
		cerr << "  % c>t: " << fixed << setw(4) << setprecision(2) << double(candcounts[b]*100)/DNA::len << endl;
	    }
	}
    }; // class PP::CandidateCollection
} // namespace PP
#endif // __PP_BLOCKSEARCHER_HH