File: Terminator.cpp

package info (click to toggle)
mapsembler2 2.2.4%2Bdfsg1-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 7,344 kB
  • sloc: cpp: 51,204; ansic: 13,165; sh: 542; makefile: 396; asm: 271; python: 28
file content (410 lines) | stat: -rw-r--r-- 13,582 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
//
//  Terminator.cpp
//

#ifndef ASSERTS
#define NDEBUG // disable asserts, they're computationnally intensive
#endif 

#include <iostream>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <algorithm> // for max

#include "Terminator.h"
#include "Traversal.h" // for extensions() 

using namespace::std;
bool Terminator::verbose = true;

// common terminator functions (actually, more like Kmer64 + bloom + debloom)

// we define a structure 2x4 bits which indicates which nucleotides correspond to in- and out-branching of a kmer
// let's see an example (bidirected debruijn graph):
// 
//     ACT [G]  -\/->   [C] CAG [T]  <--  [C] CCA
//     ---       X          ---               ---
//[C]  AGT     -/\- >   [A] CTG [G]   -->     TGG [G]
//
// structure for the node CAG/CTG: (forward:)0010(reverse:)0001 (the order is ACTG); T forward (to the right) and G reverse (to the right)
unsigned char Terminator::branching_structure(kmer_type graine)
{
    assert(graine<=revcomp(graine));
    kmer_type new_graine;
    unsigned char result = 0;
    int nt;
    int strand;

    for(nt=0; nt<4; nt++) 
    {
        // forward right extensions 
        strand=0;
        new_graine = next_kmer(graine,nt,&strand);
        if(bloom_solid_kmers->contains(new_graine) && !debloom->contains(new_graine)){
            result|=1<<nt;
        }

        // reverse right extensions
        strand=1;
        new_graine = next_kmer(graine,nt,&strand);
        if(bloom_solid_kmers->contains(new_graine) && !debloom->contains(new_graine)){
            result|=1<<(nt+4);
        }
    }
    return result;
}

// determines if a kmer is branching or not
bool Terminator::is_branching(kmer_type graine)
{
    assert(graine<=revcomp(graine));

    // method specific to order=0 (remember that order>0 is never used)
    if (order == 0)
    {
        // cannot really be optimized, because most kmers will be non-branching, hence computing branching_structure() takes optimal time
        int nb_forward_links = 0, nb_reverse_links = 0;
        int i;
        unsigned char branching = branching_structure(graine);

        for (i=0;i<4;i++)
            nb_forward_links += (branching>>i)&1;

        for (i=4;i<8;i++)
            nb_reverse_links += (branching>>i)&1;

        return !(nb_forward_links == 1 && nb_reverse_links == 1);
    }
    else
    {
        if (order==1 && is_tip(graine,bloom_solid_kmers,debloom))// algo fixme-order>0: order=1, tips are ignored as branching kmers. should find a more general code for order>1
            return false;

        // any order>=0: check that this node really yields a true branching, as opposed to branching yielding tips
        for (int strand=0;strand<2;strand++)
        {
            int osef;
            int nb_extensions = traversal_extensions(graine,strand,osef, bloom_solid_kmers, debloom);
            if (nb_extensions != 1)
                return true;
        }
    }
    return false;
}

// [branching kmers]-based terminator
// most kmers should have 1 in-neighbor and 1 out-neighbor
// this terminator indexes all the other kmers (ie., branching or dead-end kmers)
// it will miss circular regions tho..

// mark a kmer (the kmer has to be a branching kmer or a neighbor of one, else no effect)
void BranchingTerminator::mark(kmer_type graine)
{
    assert(graine<=revcomp(graine));
    bool could_mark = false;

    // if it is a branching kmer, mark it directly (it may have no branching neighbor)
    if (is_indexed(graine))
    {
        set_value_t val;
        branching_kmers->get(graine,&val);
        branching_kmers->set(graine,val|(1<<8));
        could_mark = true;
    }

    // enumerate all the neighbors
    for (int strand=0; strand < 2; strand++)
    {
        for(int nt=0; nt<4; nt++) 
        {
            // test if the neighbor is a branching kmer
            int neighbor_strand = strand;
            kmer_type neighbor = next_kmer(graine,nt,&neighbor_strand);
            if( (!bloom_solid_kmers->contains(neighbor)) || debloom->contains(neighbor) )
                continue;
            if (!is_indexed(neighbor))
                continue;

            // mark the kmer in that neighbor
            int revStrand ;
            int revNT;
            revStrand = 1-neighbor_strand;
            if (strand == 0)
                revNT = revcomp_int(code2nucleotide(graine,0));
            else
                revNT = code2nucleotide(graine,sizeKmer-1);

            mark(neighbor,revNT,revStrand);

            could_mark = true;
        }
    }
    if (could_mark)
        assert(is_marked(graine));
}

// record info into a branching kmer
void BranchingTerminator::mark(kmer_type graine, char nt, int strand)
{
    assert(nt<4);
    assert(strand<2);
    assert(graine<=revcomp(graine));

    // BranchingTerminator ignores non-branching kmers
    if (!is_indexed(graine))
        return;

    //int val = 0;
    set_value_t val=0;
    branching_kmers->get(graine,&val);

    // set a 1 at the right NT & strand position
    if (strand==0)
        val|=1<<(nt);
    else
        val|=1<<(nt+4);

    //    printf ("mark,  graine = %llx val: %x branching structure: %x\n",graine,val,branching_structure(graine));
    
    branching_kmers->set(graine,val); //was insert for Hash16

    assert(is_marked(graine,nt,strand));
}

// test if a kmer is marked, providing that the kmer is a branching kmer or a neighbor of one (else, considered unmarked)
bool BranchingTerminator::is_marked(kmer_type graine)
{
    assert(graine<=revcomp(graine));

    // if it is a branching kmer, read marking directly (it may have no branching neighbor)
    if (is_indexed(graine))
        return is_marked_branching(graine);

    // enumerate all the neighbors
    for (int strand=0; strand < 2; strand++)
    {
        for(int nt=0; nt<4; nt++) 
        {
            // test if the neighbor is a branching kmer
            int neighbor_strand = strand;
            kmer_type neighbor = next_kmer(graine,nt,&neighbor_strand);
            if( (!bloom_solid_kmers->contains(neighbor)) || debloom->contains(neighbor) )
                continue;
            if ( !is_indexed(neighbor) )
                continue;

            // test the kmer w.r.t that neighbor
            int revStrand ;
            int revNT;
            revStrand = 1-neighbor_strand;
            if (strand == 0)
                revNT = revcomp_int(code2nucleotide(graine,0));
            else
                revNT = code2nucleotide(graine,sizeKmer-1);

            if ( is_marked(neighbor,revNT,revStrand) ) 
                return true;
        }
    }
    return false;
}

// test if a branching kmer is marked (using the special marker for branching kmers only)
bool BranchingTerminator::is_marked_branching(kmer_type graine)
{
    assert(graine<=revcomp(graine));
    assert(is_branching(graine));
    assert(is_indexed(graine));
    
    set_value_t val;
    branching_kmers->get(graine,&val);
    return (val&(1<<8)) != 0;
}

// that function returns false for non-indexed kmers
bool BranchingTerminator::is_marked(kmer_type graine, char nt, int strand)
{
    assert(nt<4);
    assert(strand<2);
    assert(graine<=revcomp(graine));

    set_value_t val = 0;
    int is_present = branching_kmers->get(graine,&val);

    if (!is_present)
        return false;

    //printf ("is_marked,  graine = %llx val: %x branching structure: %x\n",graine,val,branching_structure(graine));

    int extension_nucleotide_marked;
    if (strand==0)
        extension_nucleotide_marked = (val>>nt)&1;
    else
        extension_nucleotide_marked = (val>>(nt+4))&1;

    return  extension_nucleotide_marked == 1;
}

bool BranchingTerminator::is_indexed(kmer_type graine)
{
  return branching_kmers->contains(graine);
}

BranchingTerminator::BranchingTerminator(BinaryBank *given_SolidKmers, uint64_t genome_size, Bloom *given_bloom, Set *given_debloom) : Terminator(given_SolidKmers,given_bloom,given_debloom)
{
    // estimate, from the first million of kmers, the number of branching kmers, extrapolating given the estimated genome size
    // TODO: erwan noticed that this code isn't useful anymore with AssocSet, feel free to remove it sometimes
    uint64_t nb_extrapolation = 3000000;
    SolidKmers->rewind_all();
    uint64_t nb_kmers = 0;
    nb_branching_kmers = 0;
    kmer_type kmer;
    uint64_t previous_estimated_nb_branching_kmers, estimated_nb_branching_kmers;
    while (SolidKmers->read_element(&kmer))
    {
        if (is_branching(kmer))
            nb_branching_kmers++;

        if ((nb_branching_kmers%1000)==0 && nb_branching_kmers>0)
        {
            previous_estimated_nb_branching_kmers = estimated_nb_branching_kmers;
            estimated_nb_branching_kmers = (uint64_t)((1.0*nb_branching_kmers)/nb_kmers * genome_size);
            // minor todo: stop when previous_.. - estimated < threshold (pourquoi pas = 10% estimated)
            fprintf (stderr,"%cExtrapolating the number of branching kmers from the first %dM kmers: %lld",13,(int)ceilf(nb_extrapolation/1024.0/1024.0),estimated_nb_branching_kmers);
        }

        if (nb_kmers++ == nb_extrapolation)
            break;
    }
    estimated_nb_branching_kmers = (uint64_t)((1.0*nb_branching_kmers)/nb_kmers * genome_size); // final estimation
    int estimated_NBITS_TERMINATOR = max( (int)ceilf(log2f(estimated_nb_branching_kmers)), 1);
    fprintf (stderr,"\n");

    // call Hash16 constructor
    // branching_kmers = new Hash16(estimated_NBITS_TERMINATOR);
    branching_kmers = new AssocSet();

    // index, once and for all, all the branching solid kmers
    SolidKmers->rewind_all();
    nb_branching_kmers = 0;
    uint64_t nb_solid_kmers = 0;
    while (SolidKmers->read_element(&kmer))
    {
        if (is_branching(kmer))
        {
	  // branching_kmers->insert(kmer,0);
            branching_kmers->insert(kmer);
            nb_branching_kmers++;
        }

        nb_solid_kmers++;
        if ((nb_branching_kmers%500)==0) fprintf (stderr,"%cIndexing branching kmers %lld / ~%lld",13,nb_branching_kmers,estimated_nb_branching_kmers);
    }

    if (nb_branching_kmers == 0)
        printf("\n**** Warning\n\nNo branching kmers were found in this dataset (it is either empty or a tiny circular genome) - Minia will not assemble anything.\n\n****\n\n");

	branching_kmers->finalize();
//	branching_kmers->print_total_size();
//    fprintf (stderr,"\n\nAllocated memory for marking: %lld branching kmers x (%lu+%lu) B \n",nb_branching_kmers,sizeof(kmer_type),sizeof(set_value_t));
//    fprintf (stderr," actual implementation:  (sizeof(kmer_type) = %lu B) + (sizeof(set_value_t) = %lu B) per entry:  %.2f bits / solid kmer\n",sizeof(kmer_type),sizeof(set_value_t),(nb_branching_kmers*((sizeof(kmer_type)+sizeof(set_value_t))*8.0))/nb_solid_kmers);

    // init branching_kmers iterator for what happens next
     branching_kmers->start_iterator(); 
}

// constructor that simply loads a dump of branching kmers
BranchingTerminator::BranchingTerminator(BinaryBank *branchingKmers, BinaryBank *given_SolidKmers, Bloom *given_bloom, Set *given_debloom) : Terminator(given_SolidKmers,given_bloom,given_debloom)
{
    nb_branching_kmers = branchingKmers->nb_elements();
    int NBITS_TERMINATOR = max( (int)ceilf(log2f(nb_branching_kmers)), 1);

    // call Hash16 constructor
    // branching_kmers = new Hash16(NBITS_TERMINATOR);
    branching_kmers = new AssocSet();

    // load branching kmers
    branchingKmers->rewind_all();
    kmer_type kmer;
    while (branchingKmers->read_element(&kmer))
      branching_kmers->insert(kmer); //,0 for Hash16

	branching_kmers->finalize();

    if (verbose)
        fprintf (stderr,"\nLoaded %lld branching kmers x %lu B =  %.1f MB\n",nb_branching_kmers,sizeof(cell<kmer_type>),((1<<NBITS_TERMINATOR)*sizeof(cell<kmer_type>)*1.0)/1024.0/1024.0);

    // init branching_kmers iterator for what happens next
    branching_kmers->start_iterator(); 
}

BranchingTerminator::~BranchingTerminator()
{
    delete branching_kmers;
}

bool BranchingTerminator::next(kmer_type *kmer)
{   
    if (branching_kmers->next_iterator())
    {
      //*kmer =  branching_kmers->iterator.cell_ptr->graine; //for Hash16
      *kmer =  *(branching_kmers->iterator);

        return true;
    }
    return false;
}

void BranchingTerminator::dump_branching_kmers(BinaryBank *BranchingKmers)
{
    // init branching_kmers iterator for what happens next
    branching_kmers->start_iterator(); 
    
    while (branching_kmers->next_iterator())
    {
      //kmer_type kmer =  branching_kmers->iterator.cell_ptr->graine;//for Hash16
      kmer_type kmer = *( branching_kmers->iterator);

        BranchingKmers->write_element(&kmer);
    }
    fprintf (stderr,"Dumped branching kmers\n");
}


// reset all marking information: everything is now unmarked
void BranchingTerminator::reset()
{
    branching_kmers->clear(); 
}

//--------------------------------------------------------


// bloom-based terminator (untested, totally unused)
BloomTerminator::BloomTerminator(int tai_Bloom)
{
    bloo2 = new Bloom(tai_Bloom); // to mark kmers already used for assembly  ... taille a changer
}

void BloomTerminator::mark(kmer_type graine, char nt, int strand)
{
    bloo2->add(graine); 
}

bool BloomTerminator::is_marked(kmer_type graine, char nt, int strand)
{
    return bloo2->contains(graine);
}

bool BloomTerminator::is_fully_marked(kmer_type graine)
{
    return is_marked(graine,0,0);
}

bool BloomTerminator::next(kmer_type *kmer)
{    
    return SolidKmers->read_element(kmer);
}