File: Starter.cpp

package info (click to toggle)
mapsembler2 2.1.6%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 4,116 kB
  • ctags: 3,391
  • sloc: cpp: 27,549; ansic: 2,662; asm: 271; sh: 226; makefile: 153
file content (250 lines) | stat: -rw-r--r-- 11,658 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250


#include "Starter.h"

extern int size_seeds;
extern char * prefix_trashable;

Starter::Starter(char *starter_sequence){fragment = new Fragment(starter_sequence);}
Starter::Starter(Fragment *fragment):fragment(fragment){}
Starter::~Starter(){
    delete fragment;
    for(int i=0;i<nb_substarters;i++)
        free(substarters_sequences[i]);
    free(substarters_sequences);
    free(right_kmers);
    free(left_kmers);
}


static char ** remove_redondancies (char ** extensions, int * number_of_extensions){
    if(*number_of_extensions<2) return extensions;
    extensions=sort_strings(extensions,*number_of_extensions);
    
    int i;
    int idx=1; // we keep the first extension, so we start at 1.
    
    for (i=1;i<(*number_of_extensions);i++)
        if(strcmp(extensions[i-1],extensions[i])!=0){ // store extensions[i] as it is different from extensions[i-1]
            extensions[idx++]=extensions[i];
        }
    
    *number_of_extensions=idx;
    extensions = (char **) realloc(extensions,sizeof(char *)*idx);
    return extensions;
}

void Starter::generate_substarter_no_mapping(Extension_Bank *extension_bank, const int size_kmers){
    
    char * kmer;// = (char *) malloc (sizeof(char)*(size_seeds+1)); test_alloc(kmer);
    right_kmers = (int *) malloc (sizeof(int)); test_alloc(right_kmers);
    left_kmers = (int *) malloc (sizeof(int)); test_alloc(left_kmers);
    substarters_sequences = (char **) malloc (sizeof(char *)); // a unique substarter
    test_alloc(substarters_sequences);
    substarters_sequences[0] = strdup(fragment->fragment_sequence);
    nb_substarters=1;
    // get the last kmer
    kmer=strndup(fragment->fragment_sequence+strlen(fragment->fragment_sequence)-size_kmers,size_kmers);
    // add it in the extension_bank of kmers to be extended
    right_kmers[0]=extension_bank->maybe_add_a_new_extension(kmer);
    free(kmer);
    // get the first kmer
    kmer=strndup(fragment->fragment_sequence,size_kmers);
    // reverse complement it (left extension = right extension of a reverse complemented kmer)
    revcomp(kmer,size_kmers);
    //   printf("left revcomp kmer = %s\n", kmer); // DEB
    // add it in the extension_bank of kmers to be extended
    left_kmers[0]=extension_bank->maybe_add_a_new_extension(kmer);
    free(kmer);
    
    
}

int Starter::generate_substarters(Extension_Bank * extension_bank, const int substitutions_allowed, const int min_coverage, const int error_correction_threshold, const int size_kmers){
    if(fragment->Mapped==NULL){
        // no read was mapped on this starter.
        substarters_sequences=NULL;
        nb_substarters=0;
        
    }
    else{
        substarters_sequences = read_coherent_homologs(fragment->Mapped,
                                                       fragment->fragment_sequence,
                                                       substitutions_allowed,
                                                       min_coverage,
                                                       error_correction_threshold,
                                                       size_kmers,
                                                       &nb_substarters);
        
        substarters_sequences = remove_redondancies(substarters_sequences, &nb_substarters);
    }
    right_kmers = (int *) malloc (sizeof(int)*nb_substarters); test_alloc(right_kmers);
    left_kmers = (int *) malloc (sizeof(int)*nb_substarters); test_alloc(left_kmers);
    if(nb_substarters) fragment->read_coherent=true;
    else fragment->read_coherent=false;
    
//    fprintf(stderr,"\nstarter_sequence = %s has %d substarters \n", fragment->fragment_sequence, nb_substarters);
    
    char * kmer;// = (char *) malloc (sizeof(char)*(size_seeds+1)); test_alloc(kmer);
    
    for( int i=0;i<nb_substarters;i++ ){
        // printf("substarter %s\n", substarters_sequences[i]); //DEB
        // get the last kmer
        kmer=strndup(substarters_sequences[i]+strlen(substarters_sequences[i])-size_kmers,size_kmers);
        // add it in the extension_bank of kmers to be extended
        right_kmers[i]=extension_bank->maybe_add_a_new_extension(kmer);
        free(kmer);
        // get the first kmer
        kmer=strndup(substarters_sequences[i],size_kmers);
        // reverse complement it (left extension = right extension of a reverse complemented kmer)
        revcomp(kmer,size_kmers);
        //   printf("left revcomp kmer = %s\n", kmer); // DEB
        // add it in the extension_bank of kmers to be extended
        left_kmers[i]=extension_bank->maybe_add_a_new_extension(kmer);
        free(kmer);
    }
    
    return nb_substarters;
}


//TODO : bank-> all extension
id_els Starter::starter_output_graph(Extension_Bank *bank, GraphOutput graph, id_els first_id_els){ 
//  printf("OUTPUT GRAPHS\n");
    long extremKmerRightId = 0;
    long extremKmerLeftId = 0;
    
     
    // graph.print_edges_end();
  // for each substarter:
  for(int i=0;i<nb_substarters;i++){
    long first_extending_left_node;
    long first_extending_right_node;
    //bool newExtensionRight = false;
    //bool newExtensionLeft = false;
   // bool writeHead = false;

    // RIGHT KMER
    // test if the extension was not allready created. In this case  bank->all_extensions[right_kmers[i]].node_id>=0

    if(bank->all_extensions[right_kmers[i]].node_id<0){ // This extension has never been seen before
      // set the node_id
      bank->all_extensions[right_kmers[i]].node_id=first_id_els.node;
      first_extending_right_node=first_id_els.node; 
      string right_kmer =  bank->all_extensions[right_kmers[i]].starting_kmer;
      graph.load_nodes_extremities((char *)(prefix_trashable+right_kmer).c_str());
      first_id_els=graph.construct_graph(prefix_trashable+right_kmer, "RIGHT");
      graph.first_id_els=first_id_els;
      //graph.print_head(bank->all_extensions[right_kmers[i]].node_id, bank->all_extensions[left_kmers[i]].node_id);
      //writeHead = true;
      //newExtensionRight = true;
      extremKmerRightId++;     
    }    
    else first_extending_right_node = bank->all_extensions[right_kmers[i]].node_id;

    // LEFT KMER
    // test if the extension was not allready created. In this case  bank->all_extensions[right_kmers[i]].node_id>=0 
    if(bank->all_extensions[left_kmers[i]].node_id<0){ // This extension has never been seen before
      // set the node_id
      bank->all_extensions[left_kmers[i]].node_id=first_id_els.node;
      first_extending_left_node=first_id_els.node; 
      string left_kmer =  bank->all_extensions[left_kmers[i]].starting_kmer; 
      graph.load_nodes_extremities((char *)(prefix_trashable+left_kmer).c_str());
      first_id_els=graph.construct_graph(prefix_trashable+left_kmer, "LEFT");
      graph.first_id_els=first_id_els;
      //if(!writeHead) graph.print_head(bank->all_extensions[right_kmers[i]].node_id, bank->all_extensions[left_kmers[i]].node_id);
       //newExtensionLeft = true;
      extremKmerLeftId++;
    }
    else first_extending_left_node = bank->all_extensions[left_kmers[i]].node_id;
	
    if(i==0){
	extremKmerRightId=0;
	extremKmerLeftId=1;
        
    }

    /*if(newExtensionRight){
	      	graph.print_sublink(first_id_els.edge, first_id_els.node, first_extending_right_node,  "ff","");
	      	first_id_els.edge++;
	     	 graph.print_sublink(first_id_els.edge, first_extending_right_node, first_id_els.node,  "rr","");
	     	 first_id_els.edge++;
    };
    if(newExtensionLeft){
		graph.print_sublink(first_id_els.edge, first_id_els.node, first_extending_left_node,  "rf",""); //TODO : double check these edges
		 first_id_els.edge++;
		 graph.print_sublink(first_id_els.edge, first_extending_left_node, first_id_els.node,  "rf","");
		first_id_els.edge++;
    };*/

	
    //graph.print_node(first_id_els.node, substarters_sequences[i], ", \"type\": \"real-starter\""); //TODO: need to extract sequence to starter.fa
    char data [200];
    sprintf (data, "\"length\":%d, \"extremGraphRight\":\"k%d\", \"extremGraphLeft\":\"k%d\"",strlen(substarters_sequences[i]), extremKmerRightId,  extremKmerLeftId);
    
    graph.print_substarter(i, substarters_sequences[i], string(data));
//TODO: Meta info for each sub-graph
//TODO: XGMML fomat

    
    //   // PRINT THE SUBSTARTER IN XML FILE
    //   // create a new node containing the substarter
    //   fprintf(graph.graph_file,"<node id=\"%ld\" label=\"%s\">\n</node>\n",  first_node_id, substarters_sequences[i]); 
    //   // link the new node containing the substarter to the first extending left node. We should take care as the first extended kmer was reverse complemented. So links are rf rf
    //   fprintf(graph.graph_file,"<edge source=\"%ld\" target=\"%ld\" label=\"%s\">\n</edge>\n",first_node_id, first_extending_left_node, "rf");
    //   fprintf(graph.graph_file,"<edge source=\"%ld\" target=\"%ld\" label=\"%s\">\n</edge>\n",first_extending_left_node, first_node_id, "rf");
    //   // link the new node containing the substarter to the first extending right node
    //   fprintf(graph.graph_file,"<edge source=\"%ld\" target=\"%ld\" label=\"%s\">\n</edge>\n",first_node_id, first_extending_right_node, "ff");
    //   fprintf(graph.graph_file,"<edge source=\"%ld\" target=\"%ld\" label=\"%s\">\n</edge>\n",first_extending_right_node, first_node_id, "rr");
    //   graph.first_id++; // A new node (substarter) was added

    // create a new node containing the substarter

    
  }
  graph.print_nodes_end();
  graph.print_substarters_end();  
  return first_id_els;
}

void Starter::starter_output_sequence(Extension_Bank *bank, int index, string output_res, bool erase){
    char * rseq;
    char * lseq;
    int readlen;
    FILE * output_res_file = fopen((char *) output_res.c_str(), erase?"w":"a");
    // for each substarter:
    for(int i=0;i<nb_substarters;i++){
        // get the right kmer
        string right_kmer =  bank->all_extensions[right_kmers[i]].starting_kmer;
        cout<<"opening "<<(char*)(prefix_trashable+right_kmer).c_str()<<endl;
        // get the sequence of the right extension
        Bank * Right = new Bank((char*)(prefix_trashable+right_kmer).c_str());
        // ! we should have a unique sequence in this file !
        //        Right->reset_max_readlen(1000000);
        assert(Right->get_next_seq(&rseq,&readlen));
//        printf("RIGHT: I read %s in %s\n", rseq, (char *)(prefix_trashable+right_kmer).c_str(), direction_right.c_str());
        
        // get the left kmer
        string left_kmer =  bank->all_extensions[left_kmers[i]].starting_kmer;
        //string direction_left =   bank->all_extensions[left_kmers[i]].direction;
        // get the sequence of the left extension
        Bank * Left = new Bank((char*)(prefix_trashable+left_kmer).c_str());
        // ! we should have a unique sequence in this file !
        //        Left->reset_max_readlen(1000000);
        assert(Left->get_next_seq(&lseq,&readlen));
//        printf("LEFT: I read %s in %s (direction %s)\n", lseq, (char *)(prefix_trashable+left_kmer).c_str(), direction_left.c_str());
        //    delete Left;
        // We should revese complement the left sequence
        
        revcomp(lseq,readlen);
        lseq[readlen-sizeKmer+1]='\0';
        
        fprintf(output_res_file, ">%s|starter_%d|substarter_%d\n", fragment->fragment_comment, index, i );
        fprintf(output_res_file, "%s%s%s\n", to_lower(lseq), substarters_sequences[i], to_lower(rseq+sizeKmer-1)); // to_lower(rseq+size_seeds-1) removes the overlapping k-1 mer
        Right->close();
        delete Right;
        Left->close();
        delete Left;
    }
  fclose(output_res_file); 
}