File: Fragment_Bank.cpp

package info (click to toggle)
mapsembler2 2.1.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 4,116 kB
  • ctags: 3,391
  • sloc: cpp: 27,549; ansic: 2,662; asm: 271; sh: 226; makefile: 153
file content (425 lines) | stat: -rw-r--r-- 17,720 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
//
//  Fragment_Banks.cpp
//
//
//  Created by Pierre Peterlongo on 25/10/2012
//
#include "Fragment_Bank.h"
extern int size_seeds;

Fragment_Bank::Fragment_Bank(){
    //seeds = hash_create(100000);  	test_alloc(seeds);
    
    seeds_count = hash_create_binarykey(100000);  	test_alloc(seeds_count);
}
Fragment_Bank::~Fragment_Bank(){
    //  hash_delete(seeds, list_of_generic_free);
    //free(seeds_count);
    //  hash_delete(seeds_count, list_of_generic_free);
    hash_free(seeds_count);
    all_fragments.clear(); // All the elements of the vector are dropped: their destructors are called, and then they are removed from the vector container, leaving the container with a size of 0.
    // free the array of couples:
    free(seed_table);
}


void Fragment_Bank::load_fragments(char * fragment_file){
    Bank * bank = new Bank(fragment_file);
    char * read;
    int size_read;
    while(bank->get_next_seq(&read,&size_read)){ // each fragment
        all_fragments.push_back(new Fragment_Starting(read));
    }
    delete bank;
}


int estimated_size_reads=256; // ATTENTION : this parameter is a source of problems... Take care it fits the maximal read size.
void Fragment_Bank::index_fragments_and_init_vector(){
    char * read;
    char seed [size_seeds+1];
    int size_fragment;
    int fragment_id=0;
    total_seeds = 0 ;
    
    long sum=0;
    
    // FIRST COUNT THE NUMBER OF OCCURRENCES OF EACH SEED
    for( int i=0; i<all_fragments.size();i++){
        read = all_fragments[i]->fragment_sequence;
        
        size_fragment=strlen(read);
        all_fragments[i]->starting_id_in_the_boolean_vector=sum+estimated_size_reads;
        sum+=size_fragment+estimated_size_reads;
        int stop = size_fragment-size_seeds+1;
        for (int i=0;i<stop;i++){ // each seed
            int validSeed=1;
            for(int j=0;j<size_seeds;j++) {
                seed[j]=read[i+j];// read the seed
                if(!valid_character(seed[j])){validSeed=0; break;}
            }
            if(validSeed) {
                seed[size_seeds]='\0';
                hash_incr_kmer_count(seeds_count,seed);
                total_seeds++;
            }
        } // end each seed
        fragment_id++;
    } // end each fragment
    printf("total seeds in fragments %lli  size %lli MB \n",total_seeds,total_seeds*sizeof(couple)/1024LL/1024LL);
    
    printf("sum = %ld\n", sum);
    bv = new BooleanVector(sum);
    
    fragment_id=0;
    seed_table = (couple *) calloc(total_seeds,sizeof(couple));
    test_alloc(seed_table);
    iterate_and_fill_offsets(seeds_count);
    
    // NOW FILL THE INFORMATION ABOUT EACH SEED
    for( int i=0; i<all_fragments.size();i++){
        read = all_fragments[i]->fragment_sequence;
        size_fragment=strlen(read);
        int stop = size_fragment-size_seeds+1;
        for (int i=0;i<stop;i++){ // each seed
            int validSeed=1;
            for(int j=0;j<size_seeds;j++) {
                seed[j]=read[i+j];// read the seed
                if(!valid_character(seed[j])){validSeed=0; break;}
            }
            if(validSeed) {
                seed[size_seeds]='\0';
                hash_fill_kmer_index(seeds_count,seed,seed_table, fragment_id, i);
            }
        } // end each seed
        fragment_id++;
    } // end each fragment
}



/**
 *  |<->|  pwi (may be negative)
 *       |<->| pseed (where seed mapped on fragment)
 *       --------------  fragment
 *           ||||||    seed
 *  *******************      read
 *
 *  Tests if the overlapping part between read and fragment do not have more than <code>subst_allowed</code> substitutions and if the overlap size is bigger or equal to <code>min_overlap</code>
 *  returns 1 if true between read and fragment, 0 else
 */
char read_coherent_enhanced(const int pwi, const int pseed, const char * fragment, const char * read, const int subst_allowed){
	int substitution_seen=0; // number of seen substitutions for now
	int pos_on_read, pos_on_fragment; // where to start
    
	/*
	 *  | pwi (negative)
	 *     --------------  fragment
	 *  *************      read
	 *     | we start here on the read
	 */
	if(pwi<0) {
		pos_on_fragment=0;
		pos_on_read=-pwi;
	}
    
	/*
	 *        | pwi (positive)
	 *     --------------  fragment
	 *        *************      read
	 *        | we start here on the read
	 */
	else{
		pos_on_fragment=pwi;
		pos_on_read=0;
        
	}
    
    
    
	// walk the read and the fragment together, detecting substitutions.
	// stop if the number of substitution is too high
	while(fragment[pos_on_fragment]!='\0' && read[pos_on_read]!='\0'){
        if(pos_on_fragment==pseed){ // we reached the seed, we can skip it, we know it contains no substitutions
#ifdef DEBUG_COHERENT
			printf("seeds (%d on fragment %d on read:\n", pseed, pseed-pwi);
			int z;
			for(z=pseed;z<pseed+size_seeds;z++) printf("%c", fragment[z]); printf("\n");
			for(z=pseed-pwi;z<pseed-pwi+size_seeds;z++) printf("%c", read[z]); printf("\n");
            
            
			// TEMP TODO: REMOVE THESE LINES ONCE SURE
			for(z=0;z<size_seeds;z++)
				if(fragment[z+pos_on_fragment]!=read[z+pos_on_read]){
					printf("fragment[%d]!=read[%d] (%c != %c)\n", z+pos_on_fragment, z+pos_on_read, fragment[z+pos_on_fragment],read[z+pos_on_read]);
					int space=30;
                    for(z=0;z<space;z++) printf(" "); printf("%s - pwi: %d - seed: %d\n", fragment, pwi, pseed);
                    for(z=0;z<space+pwi;z++) printf(" ");
                    printf("%s\n", read);
                    exit(1);
				}
			// END LAST TODO REMARQ
#endif
			pos_on_fragment+=size_seeds;
			pos_on_read+=size_seeds;
		}
		else{
            
			if(fragment[pos_on_fragment]!=read[pos_on_read]){ // one subsitution
				substitution_seen++;
				if(substitution_seen>subst_allowed) return 0;
			}
			pos_on_fragment++;
			pos_on_read++;
		}
	}
	return 1;
}


// checks if, for a fragment, a position mapped already (return 1) or none (return 0)
int exists_tuple_read_position(const vector<int> mapped_positions, const int pos){
    return find(mapped_positions.begin(), mapped_positions.end(), pos)!=mapped_positions.end();
}


// checks if, for a fragment, a tuple read/position mapped already a few position before of after (<min_dist) (return 0) or not (return 1)
char previous_mapping_position_far_enougth(const hash_t map, const char * read, const int pos){
	const int min_dist = 10;
	/***************
	 *  these two reads overlap on |read|-min_dist. Lets consider only one one them
	 *   +++++++++++++++++++++++++++++++++++++  fragment
	 *        ------------------------          R1
	 *           --------------------------     R2
	 *****************/
	list_ * read_coherent_positions;                 //for one couple (read/fragment) : list of coherent mapped positions (usually, one position)
	if(hash_entry_by_key(map, read, (void **) (&read_coherent_positions)) !=0){
		cell_mapsembler *c = ((list_ *)read_coherent_positions)->first;
		while(c != NULL){
			if((*(int *) (c->val)) != pos && // in case the same read exists twice in the read file, mapped twice at the same position.
               (*(int *) (c->val)) > pos - min_dist && (*(int *) (c->val)) < pos+min_dist)	{
# ifdef DEBUG_EXTENTIONS
				//				printf("pos %d to close to %d return 0\n", (*(int *) (c->val)), pos); // DEB
# endif
				return 0;
			}
			c = c->prox;
		}
	}
	return 1;
}

/**
 * test if a position has already been tested (for instance an anchoring position of a mapped read)
 */
bool Fragment_Bank::is_position_already_tested(const Fragment_Starting * fragment, int pos){
    return bv->is_boolean_vector_visited(fragment->starting_id_in_the_boolean_vector+pos);
}
/**
 * set a position has already been tested (for instance an anchoring position of a mapped read)
 */
void Fragment_Bank::set_position_tested(const Fragment_Starting * fragment, int pos){
    bv->set_boolean_vector_visited(fragment->starting_id_in_the_boolean_vector+pos);
}

void Fragment_Bank::map_reads(Bank * reads, const int substitutions_allowed){
    //////////////////////////////////////////////////////////////////////////
	/////////////// read all reads - storing those coherent with reads ///////
	//////////////////////////////////////////////////////////////////////////
    
    
	//char * fragment;
	//list * mapping_positions;             //offsets of seeds mapping positions for a read
	// working variables
	int pwi,  stop, read_coherence_value, extention_number=0;//
	//int read_id=0;
    uint64_t offset_seed;
    uint64_t nb_seeds;
    
    
    
    
	// book space for the revcomp read that going to be read.
	char * rev_comp_read = (char *)malloc(sizeof(char)*16384); //
	test_alloc(rev_comp_read);
    
	// read all the reads
	char seed [size_seeds+1];
	char * read;
	int size_read;
    
	while(reads->get_next_seq(&read,&size_read)){ // each read
        stop = size_read-size_seeds+1;
        // read all seeds present on the read:
        for(int direction=0;direction<2;direction++) { // 0 = forward, 1 = rev comp
            for (int i=0;i<stop;i++){
                int j;
                for(j=0;j<size_seeds;j++)
                    seed[j]=read[i+j]; seed[j]='\0';// read the seed
//                printf("query seed = %s\n", seed);
                // if the seed is indexed in the fragments:
                if(get_seed_info(seeds_count,seed,&offset_seed,&nb_seeds)){
                    //	    if(hash_entry_by_key(seeds, seed, (void **) (& mapping_positions)) !=0){
                    //	      cell_mapsembler * c = mapping_positions->first;
                    // for each occurrence of this seed on the fragment:
                    int ii;
                    for (ii=offset_seed; ii<offset_seed+nb_seeds; ii++) {
                        couple * value = &(seed_table[ii]);
                        Fragment_Starting* fragment = all_fragments[value->a];
                        
                        if(!fragment->Mapped){
                            fragment->Mapped=hash_create(1000);
                            test_alloc(fragment->Mapped);
                        }
                        
                        pwi = value->b-i; // starting position of the read on the fragment.
                        
//                        printf("with seed %s occurring pos %d on read and %d on fragment\n", seed, i, value->b);
                        
                        // overview of the situation:
                        
                        //        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;  fragment
                        //        <---------> b
                        //                   [--------]                     seed
                        //             ******************************       read
                        //             <----> i
                        //        <---> pwi
                        
                        // if the tuple read/fragment/position was not already mapped nor already detected as unmappable, try to map.
                        if(!is_position_already_tested(fragment, pwi)){// &&
                            //		   previous_mapping_position_far_enougth (fragment->Mapped, read, pwi)){
                            read_coherence_value = read_coherent_enhanced(pwi, value->b, fragment->fragment_sequence, read, substitutions_allowed);
                            //		  printf("test read coherence between %s and %s %d %d = %d\n", fragment->fragment_sequence, read, read_id, pwi, read_coherence_value);
                            //		  printf("with seed %s occurring pos %d on read and %d on fragment\n", seed, i, value->b);
                            if(read_coherence_value == 1){ // tuple read fragment->fragment_sequence position is read coherent
                                hash_add_int_to_list(fragment->Mapped, read, pwi);
                            } // end tuple read fragment->fragment_sequence position is read coherent
                            
                            set_position_tested(fragment, pwi);
                        } // end tuple read/fragment->fragment_sequence/position was not already mapped nor already detected as unmappable
                        //					else printf("%s %s %d %d already tested \n", fragment->fragment_sequence, read, read_id, pwi);
                        //		c=c->prox;
                    } // end for each occurrence of this seed on the fragment->fragment_sequence
                } // end if the seed is indexed in the fragment->fragment_sequences:
            } // end all seeds of the read
            
            bv->reinit();
            revcomp(read,size_read);
        } // end each direciton
    } // end each read
    
    
}

void Fragment_Bank::show_alignments(){
    
    //int print_fragment_cover(hash_t map, char * fragment, signed int left){
    int j,p, space=70;
    char *key;
    int *position_fragment;
    list_ *mapping_positions;
    hash_iter iterator;
    
    //  for(vector<Fragment>::iterator fragment_it=all_fragments.begin(); fragment_it!= all_fragments.end(); ++(fragment_it)){
    for(int i=0;i<all_fragments.size();i++){
        Fragment * fragment_it=all_fragments[i];
        char * fragment_seq = fragment_it->fragment_sequence;
        printf("\n");
        for(j=0;j<space;j++) printf(" "); printf("%s = Fragment", fragment_seq);
        printf("\n");
        if(fragment_it->Mapped==NULL){
            for(j=0;j<space;j++) printf(" "); printf("NO_MAPPED_READS\n");
            continue;
        }
        hash_t map = fragment_it->Mapped;
        iterator=hash_iterator_init(map); // get the first element of the Mapped hashmap
        // Go through the hashmap Mapped and for each fragment
        // get the reads that have seeds that match somewhere on this fragment
     
        if((long int)iterator!=-1){
            while(!hash_iterator_is_end(map, iterator)){     // go through all the mapped reads
                hash_iterator_return_entry(map, iterator, &key, (void**)&mapping_positions); // get key(read) and corresponding mapping positions (pwi)
                if(mapping_positions){
                    cell_mapsembler * c=mapping_positions->first;
                    //print mapping positions
                    while(c!=NULL){
                        //	    couple *value= (couple *) c->val;
                        position_fragment=(int *)c->val;
                        //position_fragment=(int) *(c->val);
                        for(j=0;j<space+(*position_fragment);j++) printf(" ");
                        for(p=0;p<(signed)strlen(key);p++){
                            if((*position_fragment)+p>=0 && (*position_fragment)+p<(signed)strlen(fragment_seq) && key[p] != fragment_seq[(*position_fragment)+p]) printf("%c", tolower(key[p]));
                            else printf("%c", key[p]);
                        }
                        printf(" %d\n", (*position_fragment));
                        
                        c=c->prox;
                    }
                }
                iterator=hash_iterator_next(map, iterator);
            }
        }
    }
}

void Fragment_Bank::output_mapped_reads(const char * output_file_name){
    
    //int print_fragment_cover(hash_t map, char * fragment, signed int left){
    int j,p;
    char *key;
    FILE * out = fopen(output_file_name, "w");
    if(out == NULL){
        fprintf(stderr, "Cannot open file %s for writing mapped reads, sorry\n", output_file_name);
        return;
    }
    int *position_fragment;
    list_ *mapping_positions;
    hash_iter iterator;
    
    //  for(vector<Fragment>::iterator fragment_it=all_fragments.begin(); fragment_it!= all_fragments.end(); ++(fragment_it)){
    for(int i=0;i<all_fragments.size();i++){
        Fragment * fragment_it=all_fragments[i];
        
        char * fragment_seq = fragment_it->fragment_sequence;
        if(fragment_it->Mapped==NULL){
            fprintf(out, ">Fragment %s NO_MAPPED_READS\n", fragment_seq);
            continue;
        }
        fprintf(out, ">Fragment %s\n", fragment_seq);
        
        
        hash_t map = fragment_it->Mapped;
        
        
        iterator=hash_iterator_init(map); // get the first element of the Mapped hashmap
        // Go through the hashmap Mapped and for each fragment
        // get the reads that have seeds that match somewhere on this fragment
    
        
        if((long int)iterator!=-1){
            while(!hash_iterator_is_end(map, iterator)){     // go through all the mapped reads
                hash_iterator_return_entry(map, iterator, &key, (void**)&mapping_positions); // get key(read) and corresponding mapping positions (pwi)
                if(mapping_positions){
                    cell_mapsembler * c=mapping_positions->first;
                    //print mapping positions
                    while(c!=NULL){
                        //	    couple *value= (couple *) c->val;
                        position_fragment=(int *)c->val;
                        //position_fragment=(int) *(c->val);
                        fprintf(out, ">read - occ %d\n", (*position_fragment));
                        for(p=0;p<(signed)strlen(key);p++){

                            if((*position_fragment)+p>=0 && (*position_fragment)+p<(signed)strlen(fragment_seq) && key[p] != fragment_seq[(*position_fragment)+p]) fprintf(out,"%c", tolower(key[p]));
                            else fprintf(out,"%c", key[p]);
                        }
                        fprintf(out,"\n");
                        
                        c=c->prox;
                    }
                }
                iterator=hash_iterator_next(map, iterator);
            }
        }
    }
    fclose(out);
}