1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
|
#include "Starter.h"
extern int size_seeds;
extern char * prefix_trashable;
Starter::Starter(char *starter_sequence){fragment = new Fragment(starter_sequence);}
Starter::Starter(Fragment *fragment):fragment(fragment){}
Starter::~Starter(){
delete fragment;
for(int i=0;i<nb_substarters;i++)
free(substarters_sequences[i]);
free(substarters_sequences);
free(right_kmers);
free(left_kmers);
}
static char ** remove_redondancies (char ** extensions, int * number_of_extensions){
if(*number_of_extensions<2) return extensions;
extensions=sort_strings(extensions,*number_of_extensions);
int i;
int idx=1; // we keep the first extension, so we start at 1.
for (i=1;i<(*number_of_extensions);i++)
if(strcmp(extensions[i-1],extensions[i])!=0){ // store extensions[i] as it is different from extensions[i-1]
extensions[idx++]=extensions[i];
}
*number_of_extensions=idx;
extensions = (char **) realloc(extensions,sizeof(char *)*idx);
return extensions;
}
void Starter::generate_substarter_no_mapping(Extension_Bank *extension_bank, const int size_kmers){
char * kmer;// = (char *) malloc (sizeof(char)*(size_seeds+1)); test_alloc(kmer);
right_kmers = (int *) malloc (sizeof(int)); test_alloc(right_kmers);
left_kmers = (int *) malloc (sizeof(int)); test_alloc(left_kmers);
substarters_sequences = (char **) malloc (sizeof(char *)); // a unique substarter
test_alloc(substarters_sequences);
substarters_sequences[0] = strdup(fragment->fragment_sequence);
nb_substarters=1;
// get the last kmer
kmer=strndup(fragment->fragment_sequence+strlen(fragment->fragment_sequence)-size_kmers,size_kmers);
// add it in the extension_bank of kmers to be extended
right_kmers[0]=extension_bank->maybe_add_a_new_extension(kmer);
free(kmer);
// get the first kmer
kmer=strndup(fragment->fragment_sequence,size_kmers);
// reverse complement it (left extension = right extension of a reverse complemented kmer)
revcomp(kmer,size_kmers);
// printf("left revcomp kmer = %s\n", kmer); // DEB
// add it in the extension_bank of kmers to be extended
left_kmers[0]=extension_bank->maybe_add_a_new_extension(kmer);
free(kmer);
}
int Starter::generate_substarters(Extension_Bank * extension_bank, const int substitutions_allowed, const int min_coverage, const int error_correction_threshold, const int size_kmers){
if(fragment->Mapped==NULL){
// no read was mapped on this starter.
substarters_sequences=NULL;
nb_substarters=0;
}
else{
substarters_sequences = read_coherent_homologs(fragment->Mapped,
fragment->fragment_sequence,
substitutions_allowed,
min_coverage,
error_correction_threshold,
size_kmers,
&nb_substarters);
substarters_sequences = remove_redondancies(substarters_sequences, &nb_substarters);
}
right_kmers = (int *) malloc (sizeof(int)*nb_substarters); test_alloc(right_kmers);
left_kmers = (int *) malloc (sizeof(int)*nb_substarters); test_alloc(left_kmers);
if(nb_substarters) fragment->read_coherent=true;
else fragment->read_coherent=false;
// fprintf(stderr,"\nstarter_sequence = %s has %d substarters \n", fragment->fragment_sequence, nb_substarters);
char * kmer;// = (char *) malloc (sizeof(char)*(size_seeds+1)); test_alloc(kmer);
for( int i=0;i<nb_substarters;i++ ){
// printf("substarter %s\n", substarters_sequences[i]); //DEB
// get the last kmer
kmer=strndup(substarters_sequences[i]+strlen(substarters_sequences[i])-size_kmers,size_kmers);
// add it in the extension_bank of kmers to be extended
right_kmers[i]=extension_bank->maybe_add_a_new_extension(kmer);
free(kmer);
// get the first kmer
kmer=strndup(substarters_sequences[i],size_kmers);
// reverse complement it (left extension = right extension of a reverse complemented kmer)
revcomp(kmer,size_kmers);
// printf("left revcomp kmer = %s\n", kmer); // DEB
// add it in the extension_bank of kmers to be extended
left_kmers[i]=extension_bank->maybe_add_a_new_extension(kmer);
free(kmer);
}
return nb_substarters;
}
//TODO : bank-> all extension
id_els Starter::starter_output_graph(Extension_Bank *bank, GraphOutput graph, id_els first_id_els){
// printf("OUTPUT GRAPHS\n");
long extremKmerRightId = 0;
long extremKmerLeftId = 0;
// graph.print_edges_end();
// for each substarter:
for(int i=0;i<nb_substarters;i++){
long first_extending_left_node;
long first_extending_right_node;
//bool newExtensionRight = false;
//bool newExtensionLeft = false;
// bool writeHead = false;
// RIGHT KMER
// test if the extension was not allready created. In this case bank->all_extensions[right_kmers[i]].node_id>=0
if(bank->all_extensions[right_kmers[i]].node_id<0){ // This extension has never been seen before
// set the node_id
bank->all_extensions[right_kmers[i]].node_id=first_id_els.node;
first_extending_right_node=first_id_els.node;
string right_kmer = bank->all_extensions[right_kmers[i]].starting_kmer;
graph.load_nodes_extremities((char *)(prefix_trashable+right_kmer).c_str());
first_id_els=graph.construct_graph(prefix_trashable+right_kmer, "RIGHT");
graph.first_id_els=first_id_els;
//graph.print_head(bank->all_extensions[right_kmers[i]].node_id, bank->all_extensions[left_kmers[i]].node_id);
//writeHead = true;
//newExtensionRight = true;
extremKmerRightId++;
}
else first_extending_right_node = bank->all_extensions[right_kmers[i]].node_id;
// LEFT KMER
// test if the extension was not allready created. In this case bank->all_extensions[right_kmers[i]].node_id>=0
if(bank->all_extensions[left_kmers[i]].node_id<0){ // This extension has never been seen before
// set the node_id
bank->all_extensions[left_kmers[i]].node_id=first_id_els.node;
first_extending_left_node=first_id_els.node;
string left_kmer = bank->all_extensions[left_kmers[i]].starting_kmer;
graph.load_nodes_extremities((char *)(prefix_trashable+left_kmer).c_str());
first_id_els=graph.construct_graph(prefix_trashable+left_kmer, "LEFT");
graph.first_id_els=first_id_els;
//if(!writeHead) graph.print_head(bank->all_extensions[right_kmers[i]].node_id, bank->all_extensions[left_kmers[i]].node_id);
//newExtensionLeft = true;
extremKmerLeftId++;
}
else first_extending_left_node = bank->all_extensions[left_kmers[i]].node_id;
if(i==0){
extremKmerRightId=0;
extremKmerLeftId=1;
}
/*if(newExtensionRight){
graph.print_sublink(first_id_els.edge, first_id_els.node, first_extending_right_node, "ff","");
first_id_els.edge++;
graph.print_sublink(first_id_els.edge, first_extending_right_node, first_id_els.node, "rr","");
first_id_els.edge++;
};
if(newExtensionLeft){
graph.print_sublink(first_id_els.edge, first_id_els.node, first_extending_left_node, "rf",""); //TODO : double check these edges
first_id_els.edge++;
graph.print_sublink(first_id_els.edge, first_extending_left_node, first_id_els.node, "rf","");
first_id_els.edge++;
};*/
//graph.print_node(first_id_els.node, substarters_sequences[i], ", \"type\": \"real-starter\""); //TODO: need to extract sequence to starter.fa
char data [200];
sprintf (data, "\"length\":%d, \"extremGraphRight\":\"k%d\", \"extremGraphLeft\":\"k%d\"",strlen(substarters_sequences[i]), extremKmerRightId, extremKmerLeftId);
graph.print_substarter(i, substarters_sequences[i], string(data));
//TODO: Meta info for each sub-graph
//TODO: XGMML fomat
// // PRINT THE SUBSTARTER IN XML FILE
// // create a new node containing the substarter
// fprintf(graph.graph_file,"<node id=\"%ld\" label=\"%s\">\n</node>\n", first_node_id, substarters_sequences[i]);
// // link the new node containing the substarter to the first extending left node. We should take care as the first extended kmer was reverse complemented. So links are rf rf
// fprintf(graph.graph_file,"<edge source=\"%ld\" target=\"%ld\" label=\"%s\">\n</edge>\n",first_node_id, first_extending_left_node, "rf");
// fprintf(graph.graph_file,"<edge source=\"%ld\" target=\"%ld\" label=\"%s\">\n</edge>\n",first_extending_left_node, first_node_id, "rf");
// // link the new node containing the substarter to the first extending right node
// fprintf(graph.graph_file,"<edge source=\"%ld\" target=\"%ld\" label=\"%s\">\n</edge>\n",first_node_id, first_extending_right_node, "ff");
// fprintf(graph.graph_file,"<edge source=\"%ld\" target=\"%ld\" label=\"%s\">\n</edge>\n",first_extending_right_node, first_node_id, "rr");
// graph.first_id++; // A new node (substarter) was added
// create a new node containing the substarter
}
graph.print_nodes_end();
graph.print_substarters_end();
return first_id_els;
}
void Starter::starter_output_sequence(Extension_Bank *bank, int index, string output_res, bool erase){
char * rseq;
char * lseq;
int readlen;
FILE * output_res_file = fopen((char *) output_res.c_str(), erase?"w":"a");
// for each substarter:
for(int i=0;i<nb_substarters;i++){
// get the right kmer
string right_kmer = bank->all_extensions[right_kmers[i]].starting_kmer;
cout<<"opening "<<(char*)(prefix_trashable+right_kmer).c_str()<<endl;
// get the sequence of the right extension
Bank * Right = new Bank((char*)(prefix_trashable+right_kmer).c_str());
// ! we should have a unique sequence in this file !
// Right->reset_max_readlen(1000000);
assert(Right->get_next_seq(&rseq,&readlen));
// printf("RIGHT: I read %s in %s\n", rseq, (char *)(prefix_trashable+right_kmer).c_str(), direction_right.c_str());
// get the left kmer
string left_kmer = bank->all_extensions[left_kmers[i]].starting_kmer;
//string direction_left = bank->all_extensions[left_kmers[i]].direction;
// get the sequence of the left extension
Bank * Left = new Bank((char*)(prefix_trashable+left_kmer).c_str());
// ! we should have a unique sequence in this file !
// Left->reset_max_readlen(1000000);
assert(Left->get_next_seq(&lseq,&readlen));
// printf("LEFT: I read %s in %s (direction %s)\n", lseq, (char *)(prefix_trashable+left_kmer).c_str(), direction_left.c_str());
// delete Left;
// We should revese complement the left sequence
revcomp(lseq,readlen);
lseq[readlen-sizeKmer+1]='\0';
fprintf(output_res_file, ">%s|starter_%d|substarter_%d\n", fragment->fragment_comment, index, i );
fprintf(output_res_file, "%s%s%s\n", to_lower(lseq), substarters_sequences[i], to_lower(rseq+sizeKmer-1)); // to_lower(rseq+size_seeds-1) removes the overlapping k-1 mer
Right->close();
delete Right;
Left->close();
delete Left;
}
fclose(output_res_file);
}
|