1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425
|
//
// Fragment_Banks.cpp
//
//
// Created by Pierre Peterlongo on 25/10/2012
//
#include "Fragment_Bank.h"
extern int size_seeds;
Fragment_Bank::Fragment_Bank(){
//seeds = hash_create(100000); test_alloc(seeds);
seeds_count = hash_create_binarykey(100000); test_alloc(seeds_count);
}
Fragment_Bank::~Fragment_Bank(){
// hash_delete(seeds, list_of_generic_free);
//free(seeds_count);
// hash_delete(seeds_count, list_of_generic_free);
hash_free(seeds_count);
all_fragments.clear(); // All the elements of the vector are dropped: their destructors are called, and then they are removed from the vector container, leaving the container with a size of 0.
// free the array of couples:
free(seed_table);
}
void Fragment_Bank::load_fragments(char * fragment_file){
Bank * bank = new Bank(fragment_file);
char * read;
int size_read;
while(bank->get_next_seq(&read,&size_read)){ // each fragment
all_fragments.push_back(new Fragment_Starting(read));
}
delete bank;
}
int estimated_size_reads=256; // ATTENTION : this parameter is a source of problems... Take care it fits the maximal read size.
void Fragment_Bank::index_fragments_and_init_vector(){
char * read;
char seed [size_seeds+1];
int size_fragment;
int fragment_id=0;
total_seeds = 0 ;
long sum=0;
// FIRST COUNT THE NUMBER OF OCCURRENCES OF EACH SEED
for( int i=0; i<all_fragments.size();i++){
read = all_fragments[i]->fragment_sequence;
size_fragment=strlen(read);
all_fragments[i]->starting_id_in_the_boolean_vector=sum+estimated_size_reads;
sum+=size_fragment+estimated_size_reads;
int stop = size_fragment-size_seeds+1;
for (int i=0;i<stop;i++){ // each seed
int validSeed=1;
for(int j=0;j<size_seeds;j++) {
seed[j]=read[i+j];// read the seed
if(!valid_character(seed[j])){validSeed=0; break;}
}
if(validSeed) {
seed[size_seeds]='\0';
hash_incr_kmer_count(seeds_count,seed);
total_seeds++;
}
} // end each seed
fragment_id++;
} // end each fragment
printf("total seeds in fragments %lli size %lli MB \n",total_seeds,total_seeds*sizeof(couple)/1024LL/1024LL);
printf("sum = %ld\n", sum);
bv = new BooleanVector(sum);
fragment_id=0;
seed_table = (couple *) calloc(total_seeds,sizeof(couple));
test_alloc(seed_table);
iterate_and_fill_offsets(seeds_count);
// NOW FILL THE INFORMATION ABOUT EACH SEED
for( int i=0; i<all_fragments.size();i++){
read = all_fragments[i]->fragment_sequence;
size_fragment=strlen(read);
int stop = size_fragment-size_seeds+1;
for (int i=0;i<stop;i++){ // each seed
int validSeed=1;
for(int j=0;j<size_seeds;j++) {
seed[j]=read[i+j];// read the seed
if(!valid_character(seed[j])){validSeed=0; break;}
}
if(validSeed) {
seed[size_seeds]='\0';
hash_fill_kmer_index(seeds_count,seed,seed_table, fragment_id, i);
}
} // end each seed
fragment_id++;
} // end each fragment
}
/**
* |<->| pwi (may be negative)
* |<->| pseed (where seed mapped on fragment)
* -------------- fragment
* |||||| seed
* ******************* read
*
* Tests if the overlapping part between read and fragment do not have more than <code>subst_allowed</code> substitutions and if the overlap size is bigger or equal to <code>min_overlap</code>
* returns 1 if true between read and fragment, 0 else
*/
char read_coherent_enhanced(const int pwi, const int pseed, const char * fragment, const char * read, const int subst_allowed){
int substitution_seen=0; // number of seen substitutions for now
int pos_on_read, pos_on_fragment; // where to start
/*
* | pwi (negative)
* -------------- fragment
* ************* read
* | we start here on the read
*/
if(pwi<0) {
pos_on_fragment=0;
pos_on_read=-pwi;
}
/*
* | pwi (positive)
* -------------- fragment
* ************* read
* | we start here on the read
*/
else{
pos_on_fragment=pwi;
pos_on_read=0;
}
// walk the read and the fragment together, detecting substitutions.
// stop if the number of substitution is too high
while(fragment[pos_on_fragment]!='\0' && read[pos_on_read]!='\0'){
if(pos_on_fragment==pseed){ // we reached the seed, we can skip it, we know it contains no substitutions
#ifdef DEBUG_COHERENT
printf("seeds (%d on fragment %d on read:\n", pseed, pseed-pwi);
int z;
for(z=pseed;z<pseed+size_seeds;z++) printf("%c", fragment[z]); printf("\n");
for(z=pseed-pwi;z<pseed-pwi+size_seeds;z++) printf("%c", read[z]); printf("\n");
// TEMP TODO: REMOVE THESE LINES ONCE SURE
for(z=0;z<size_seeds;z++)
if(fragment[z+pos_on_fragment]!=read[z+pos_on_read]){
printf("fragment[%d]!=read[%d] (%c != %c)\n", z+pos_on_fragment, z+pos_on_read, fragment[z+pos_on_fragment],read[z+pos_on_read]);
int space=30;
for(z=0;z<space;z++) printf(" "); printf("%s - pwi: %d - seed: %d\n", fragment, pwi, pseed);
for(z=0;z<space+pwi;z++) printf(" ");
printf("%s\n", read);
exit(1);
}
// END LAST TODO REMARQ
#endif
pos_on_fragment+=size_seeds;
pos_on_read+=size_seeds;
}
else{
if(fragment[pos_on_fragment]!=read[pos_on_read]){ // one subsitution
substitution_seen++;
if(substitution_seen>subst_allowed) return 0;
}
pos_on_fragment++;
pos_on_read++;
}
}
return 1;
}
// checks if, for a fragment, a position mapped already (return 1) or none (return 0)
int exists_tuple_read_position(const vector<int> mapped_positions, const int pos){
return find(mapped_positions.begin(), mapped_positions.end(), pos)!=mapped_positions.end();
}
// checks if, for a fragment, a tuple read/position mapped already a few position before of after (<min_dist) (return 0) or not (return 1)
char previous_mapping_position_far_enougth(const hash_t map, const char * read, const int pos){
const int min_dist = 10;
/***************
* these two reads overlap on |read|-min_dist. Lets consider only one one them
* +++++++++++++++++++++++++++++++++++++ fragment
* ------------------------ R1
* -------------------------- R2
*****************/
list_ * read_coherent_positions; //for one couple (read/fragment) : list of coherent mapped positions (usually, one position)
if(hash_entry_by_key(map, read, (void **) (&read_coherent_positions)) !=0){
cell_mapsembler *c = ((list_ *)read_coherent_positions)->first;
while(c != NULL){
if((*(int *) (c->val)) != pos && // in case the same read exists twice in the read file, mapped twice at the same position.
(*(int *) (c->val)) > pos - min_dist && (*(int *) (c->val)) < pos+min_dist) {
# ifdef DEBUG_EXTENTIONS
// printf("pos %d to close to %d return 0\n", (*(int *) (c->val)), pos); // DEB
# endif
return 0;
}
c = c->prox;
}
}
return 1;
}
/**
* test if a position has already been tested (for instance an anchoring position of a mapped read)
*/
bool Fragment_Bank::is_position_already_tested(const Fragment_Starting * fragment, int pos){
return bv->is_boolean_vector_visited(fragment->starting_id_in_the_boolean_vector+pos);
}
/**
* set a position has already been tested (for instance an anchoring position of a mapped read)
*/
void Fragment_Bank::set_position_tested(const Fragment_Starting * fragment, int pos){
bv->set_boolean_vector_visited(fragment->starting_id_in_the_boolean_vector+pos);
}
void Fragment_Bank::map_reads(Bank * reads, const int substitutions_allowed){
//////////////////////////////////////////////////////////////////////////
/////////////// read all reads - storing those coherent with reads ///////
//////////////////////////////////////////////////////////////////////////
//char * fragment;
//list * mapping_positions; //offsets of seeds mapping positions for a read
// working variables
int pwi, stop, read_coherence_value, extention_number=0;//
//int read_id=0;
uint64_t offset_seed;
uint64_t nb_seeds;
// book space for the revcomp read that going to be read.
char * rev_comp_read = (char *)malloc(sizeof(char)*16384); //
test_alloc(rev_comp_read);
// read all the reads
char seed [size_seeds+1];
char * read;
int size_read;
while(reads->get_next_seq(&read,&size_read)){ // each read
stop = size_read-size_seeds+1;
// read all seeds present on the read:
for(int direction=0;direction<2;direction++) { // 0 = forward, 1 = rev comp
for (int i=0;i<stop;i++){
int j;
for(j=0;j<size_seeds;j++)
seed[j]=read[i+j]; seed[j]='\0';// read the seed
// printf("query seed = %s\n", seed);
// if the seed is indexed in the fragments:
if(get_seed_info(seeds_count,seed,&offset_seed,&nb_seeds)){
// if(hash_entry_by_key(seeds, seed, (void **) (& mapping_positions)) !=0){
// cell_mapsembler * c = mapping_positions->first;
// for each occurrence of this seed on the fragment:
int ii;
for (ii=offset_seed; ii<offset_seed+nb_seeds; ii++) {
couple * value = &(seed_table[ii]);
Fragment_Starting* fragment = all_fragments[value->a];
if(!fragment->Mapped){
fragment->Mapped=hash_create(1000);
test_alloc(fragment->Mapped);
}
pwi = value->b-i; // starting position of the read on the fragment.
// printf("with seed %s occurring pos %d on read and %d on fragment\n", seed, i, value->b);
// overview of the situation:
// ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; fragment
// <---------> b
// [--------] seed
// ****************************** read
// <----> i
// <---> pwi
// if the tuple read/fragment/position was not already mapped nor already detected as unmappable, try to map.
if(!is_position_already_tested(fragment, pwi)){// &&
// previous_mapping_position_far_enougth (fragment->Mapped, read, pwi)){
read_coherence_value = read_coherent_enhanced(pwi, value->b, fragment->fragment_sequence, read, substitutions_allowed);
// printf("test read coherence between %s and %s %d %d = %d\n", fragment->fragment_sequence, read, read_id, pwi, read_coherence_value);
// printf("with seed %s occurring pos %d on read and %d on fragment\n", seed, i, value->b);
if(read_coherence_value == 1){ // tuple read fragment->fragment_sequence position is read coherent
hash_add_int_to_list(fragment->Mapped, read, pwi);
} // end tuple read fragment->fragment_sequence position is read coherent
set_position_tested(fragment, pwi);
} // end tuple read/fragment->fragment_sequence/position was not already mapped nor already detected as unmappable
// else printf("%s %s %d %d already tested \n", fragment->fragment_sequence, read, read_id, pwi);
// c=c->prox;
} // end for each occurrence of this seed on the fragment->fragment_sequence
} // end if the seed is indexed in the fragment->fragment_sequences:
} // end all seeds of the read
bv->reinit();
revcomp(read,size_read);
} // end each direciton
} // end each read
}
void Fragment_Bank::show_alignments(){
//int print_fragment_cover(hash_t map, char * fragment, signed int left){
int j,p, space=70;
char *key;
int *position_fragment;
list_ *mapping_positions;
hash_iter iterator;
// for(vector<Fragment>::iterator fragment_it=all_fragments.begin(); fragment_it!= all_fragments.end(); ++(fragment_it)){
for(int i=0;i<all_fragments.size();i++){
Fragment * fragment_it=all_fragments[i];
char * fragment_seq = fragment_it->fragment_sequence;
printf("\n");
for(j=0;j<space;j++) printf(" "); printf("%s = Fragment", fragment_seq);
printf("\n");
if(fragment_it->Mapped==NULL){
for(j=0;j<space;j++) printf(" "); printf("NO_MAPPED_READS\n");
continue;
}
hash_t map = fragment_it->Mapped;
iterator=hash_iterator_init(map); // get the first element of the Mapped hashmap
// Go through the hashmap Mapped and for each fragment
// get the reads that have seeds that match somewhere on this fragment
if((long int)iterator!=-1){
while(!hash_iterator_is_end(map, iterator)){ // go through all the mapped reads
hash_iterator_return_entry(map, iterator, &key, (void**)&mapping_positions); // get key(read) and corresponding mapping positions (pwi)
if(mapping_positions){
cell_mapsembler * c=mapping_positions->first;
//print mapping positions
while(c!=NULL){
// couple *value= (couple *) c->val;
position_fragment=(int *)c->val;
//position_fragment=(int) *(c->val);
for(j=0;j<space+(*position_fragment);j++) printf(" ");
for(p=0;p<(signed)strlen(key);p++){
if((*position_fragment)+p>=0 && (*position_fragment)+p<(signed)strlen(fragment_seq) && key[p] != fragment_seq[(*position_fragment)+p]) printf("%c", tolower(key[p]));
else printf("%c", key[p]);
}
printf(" %d\n", (*position_fragment));
c=c->prox;
}
}
iterator=hash_iterator_next(map, iterator);
}
}
}
}
void Fragment_Bank::output_mapped_reads(const char * output_file_name){
//int print_fragment_cover(hash_t map, char * fragment, signed int left){
int j,p;
char *key;
FILE * out = fopen(output_file_name, "w");
if(out == NULL){
fprintf(stderr, "Cannot open file %s for writing mapped reads, sorry\n", output_file_name);
return;
}
int *position_fragment;
list_ *mapping_positions;
hash_iter iterator;
// for(vector<Fragment>::iterator fragment_it=all_fragments.begin(); fragment_it!= all_fragments.end(); ++(fragment_it)){
for(int i=0;i<all_fragments.size();i++){
Fragment * fragment_it=all_fragments[i];
char * fragment_seq = fragment_it->fragment_sequence;
if(fragment_it->Mapped==NULL){
fprintf(out, ">Fragment %s NO_MAPPED_READS\n", fragment_seq);
continue;
}
fprintf(out, ">Fragment %s\n", fragment_seq);
hash_t map = fragment_it->Mapped;
iterator=hash_iterator_init(map); // get the first element of the Mapped hashmap
// Go through the hashmap Mapped and for each fragment
// get the reads that have seeds that match somewhere on this fragment
if((long int)iterator!=-1){
while(!hash_iterator_is_end(map, iterator)){ // go through all the mapped reads
hash_iterator_return_entry(map, iterator, &key, (void**)&mapping_positions); // get key(read) and corresponding mapping positions (pwi)
if(mapping_positions){
cell_mapsembler * c=mapping_positions->first;
//print mapping positions
while(c!=NULL){
// couple *value= (couple *) c->val;
position_fragment=(int *)c->val;
//position_fragment=(int) *(c->val);
fprintf(out, ">read - occ %d\n", (*position_fragment));
for(p=0;p<(signed)strlen(key);p++){
if((*position_fragment)+p>=0 && (*position_fragment)+p<(signed)strlen(fragment_seq) && key[p] != fragment_seq[(*position_fragment)+p]) fprintf(out,"%c", tolower(key[p]));
else fprintf(out,"%c", key[p]);
}
fprintf(out,"\n");
c=c->prox;
}
}
iterator=hash_iterator_next(map, iterator);
}
}
}
fclose(out);
}
|