iVar
suffix_tree.cpp
1 #include "iostream"
2 #include "algorithm"
3 #include "fstream"
4 #include "vector"
5 
6 #include "suffix_tree.h"
7 #include "alignment.h"
8 
9 suffix_node::suffix_node(int b, int *e, suffix_node *p, suffix_node *l){
10  this->children = new suffix_node*[MAX_CHAR];
11  for(unsigned int i = 0; i < MAX_CHAR;i++)
12  this->children[i] = 0;
13  this->begin = b;
14  this->end = e;
15  this->nchildren = 0;
16  this->parent = p;
17  this->suffix_link =l;
18 }
19 
20 bool suffix_node::is_leaf_node(){
21  return (nchildren == 0);
22 }
23 
24 bool suffix_node::contains_child(int ext){
25  return !(this->children[ext] == 0);
26 }
27 
28 int suffix_node::get_length(){
29  if(this->end == 0)
30  return 0; // For root
31  return *(this->end) - this->begin + 1;
32 }
33 
34 int suffix_node::get_depth(){
35  int ctr = 0;
36  suffix_node* n = this;
37  while(n->begin!=-1){
38  n = n->parent;
39  ctr++;
40  }
41  return ctr;
42 }
43 
44 int get_hamming_distance(std::string s1, std::string s2, int k){
45  int n = 0;
46  for(unsigned int i = 0; i < s1.length(); i++){
47  if(s1[i] != s2[i]){
48  n++;
49  }
50  if(n > k)
51  return -1;
52  }
53  return n;
54 }
55 
56 std::string suffix_node::get_longest_common_substring(std::string s1, std::string s2){
57  std::string str = s1+"#"+s2+"@";
58  std::string p="", tmp, longest_trvsl = "";
59  unsigned int max = 0, i;
60  suffix_node* node = this;
61  if(node->begin == -1)
62  p = "";
63  else{
64  tmp = node->get_path(str);
65  if(s1.find(p+tmp) != std::string::npos && s2.find(p+tmp) != std::string::npos)
66  p += tmp;
67  }
68  tmp = "";
69  for(i = 0; i < MAX_CHAR;i++){
70  if(node->children[i] == 0)
71  continue;
72  longest_trvsl = node->children[i]->get_longest_common_substring(s1, s2);
73  if(longest_trvsl.length() > max){
74  max = longest_trvsl.length();
75  tmp = longest_trvsl;
76  }
77  }
78  p += tmp;
79  return p;
80 }
81 
82 std::string suffix_node::get_path(std::string s){
83  if(this->begin == -1)
84  return "R";
85  return s.substr(this->begin, *(this->end) - this->begin + 1);
86 }
87 
88 void suffix_node::extend_path(int *e){
89  this->end = e;
90 }
91 
92 suffix_node* suffix_node::add_child(int ext, int b, int *e, suffix_node* l){
93  suffix_node *n = new suffix_node(b, e, this, l);
94  this->children[ext] = n;
95  this->nchildren++;
96  return n;
97 }
98 
99 void suffix_node::add_child(suffix_node* c, int ext){
100  this->children[ext] = c;
101  c->parent = this;
102  this->nchildren++;
103 }
104 
105 suffix_node* suffix_node::get_child(int ext){
106  return this->children[ext];
107 }
108 
109 bool suffix_node::contains_depth(int depth){
110  return this->get_depth() <= depth && depth <= (this->get_length() + this->get_depth());
111 }
112 
113 void suffix_node::print(std::string s){
114  for(int i = 0; i < this->get_depth(); i++){
115  std::cout << " ";
116  }
117  std::string t = (this->begin == -1) ? "R" : " "+s.substr(this->begin, *(this->end) - this->begin + 1);
118  std::cout << t;
119  if(this->suffix_link != 0)
120  std::cout << " --- ( " << this->suffix_link->parent->get_path(s) << " " << this->suffix_link->get_path(s) << ")";
121  if(this->begin!=-1)
122  std::cout << " - " << this->parent->get_path(s);
123  std::cout << std::endl;
124  for(unsigned int i = 0; i<alphabet.length();i++){
125  if(this->children[i]!=0)
126  this->children[i]->print(s);
127  }
128 }
129 
130 bool suffix_node::walk_next(int &beg, int &suffix_length){
131  suffix_node *node = this;
132  if(suffix_length >= node->get_length()){
133  beg += node->get_length();
134  suffix_length -= node->get_length();
135  return true;
136  }
137  return false;
138 }
139 
140 suffix_node* build_suffix_tree(std::string s){
141  suffix_node *root = new suffix_node(-1,0,0,0);
142  root->suffix_link = root;
143  root->parent = root;
144  suffix_node *cur_node = root, *new_node = 0, *new_cur_node = 0;
145  int str_ind[MAX_SIZE], suffix_length = 0, *leaf_end= new int, suffix_count = 0, beg = -1, *end;
146  unsigned int i = 0, n_str_ind = 0;
147  for(i = 0; i < s.length();i++){
148  str_ind[i] = alphabet.find(s[i]);
149  n_str_ind++;
150  }
151  *leaf_end = 0;
152  // root->add_child(str_ind[0], 0, leaf_end, root);
153  for(i = 0; i < n_str_ind;i++){
154  *leaf_end = i; // Handle Extension Rule 1
155  suffix_count++;
156  new_node = 0;
157  while(suffix_count > 0){
158  if(suffix_length == 0)
159  beg = i;
160  if(cur_node->children[str_ind[beg]] == 0){ // Rule 2
161  cur_node->add_child(str_ind[beg], beg, leaf_end, 0); // Trick 3
162  if(new_node != 0){
163  new_node->suffix_link = cur_node;
164  new_node = 0; // No internal node created in Rule 2 here.
165  }
166  } else {
167  new_cur_node = cur_node->children[str_ind[beg]];
168  if(new_cur_node->walk_next(beg, suffix_length)){
169  cur_node = new_cur_node;
170  continue;
171  }
172  if(str_ind[new_cur_node->begin + suffix_length] == str_ind[i]){ // Rule 3
173  if(new_node != 0 && cur_node->end != 0){
174  new_node->suffix_link = cur_node;
175  new_node = 0; // No internal node created in Rule 2 here.
176  }
177  suffix_length++;
178  break;
179  }
180  // Rule 2 - New internal node created
181  suffix_node *new_int_node;
182  end = new int;
183  *end = new_cur_node->begin + suffix_length - 1;
184  //New internal node
185  new_int_node = cur_node->add_child(str_ind[beg], new_cur_node->begin, end, 0);
186  new_int_node->add_child(str_ind[i], i, leaf_end, 0);
187  new_cur_node->begin += suffix_length;
188  new_int_node->add_child(new_cur_node, str_ind[new_cur_node->begin]);
189  if(new_node != 0)
190  new_node->suffix_link = new_int_node;
191  new_node = new_int_node;
192  }
193  suffix_count--;
194  if(cur_node->end==0 && suffix_length > 0){
195  suffix_length--;
196  beg = i - suffix_count + 1;
197  } else if(cur_node->end!=0){
198  if(cur_node->suffix_link != 0){
199  cur_node = cur_node->suffix_link;
200  } else {
201  cur_node = cur_node->parent->suffix_link;
202  }
203  } else if(cur_node->get_length() == 1){
204  cur_node = root;
205  }
206  }
207  }
208  return root;
209 }
210 
211 std::string get_reverse_complement(std::string rev_read){
212  char t;
213  for(unsigned int i = 0;i< rev_read.length();i++){
214  switch(rev_read[i]){
215  case 'A':
216  t = 'T';
217  break;
218  case 'G':
219  t='C';
220  break;
221  case 'C':
222  t='G';
223  break;
224  case 'T':
225  t='A';
226  break;
227  }
228  rev_read[i] = t;
229  }
230  std::reverse(rev_read.begin(), rev_read.end());
231  return rev_read;
232 }
233 
234 std::vector<std::string> read_adapters_from_fasta(std::string p){
235  std::vector<std::string> adp;
236  std::ifstream fin(p);
237  std::string line;
238  while (std::getline(fin, line)){
239  if(line[0] == '>')
240  continue;
241  adp.push_back(line);
242  }
243  return adp;
244 }
245 
246 std::string extend_till_mismatches(std::string cmn, std::string l, std::string adp){
247  if(cmn.length() < MIN_LENGTH){
248  return cmn;
249  }
250  int pos = adp.find(cmn);
251  int r_pos = l.find(cmn);
252  if(pos == 0) // Beginning of adapter match
253  return cmn;
254  int k = 0;
255  pos--;
256  r_pos--;
257  while(r_pos >= 0 && pos >= 0 && k <= MAX_MISMATCHES){
258  cmn = l[r_pos] + cmn;
259  if(l[r_pos]!=adp[pos])
260  k++;
261  r_pos--;
262  pos--;
263  }
264  return cmn;
265 }
266 
267 int trim_adapter(std::string f1, std::string f2, std::string adp_path, std::string p){
268  std::string l1, l2, l1_rc, l2_rc, s, cmn, trmd_read;
269  int *t = new int[2], beg;
270  unsigned int pos;
271  std::ifstream ff1(f1), ff2, pf(p);
272  std::ofstream out(p+".trimmed.fastq");
273  std::vector<std::string>::iterator it;
274  suffix_node *root = (suffix_node*) malloc(sizeof(suffix_node));
275  std::vector<std::string> adp = read_adapters_from_fasta(adp_path);
276  int i = -1, n = 0, adp_pos;
277  if(!f2.empty()){
278  ff2.open(f2);
279  while (std::getline(ff1, l1) && std::getline(ff2, l2)){
280  i++;
281  if((i - 1)%4 != 0)
282  continue;
283  }
284  } else {
285  while (std::getline(ff1, l1)){
286  i++;
287  if((i - 1)%4 != 0){
288  out << l1 << std::endl;
289  continue;
290  }
291  beg = 0;
292  pos = l1.length();
293  trmd_read = l1;
294  for(it = adp.begin(); it != adp.end(); ++it) {
295  s = l1 + "#" + *it + "@";
296  root = build_suffix_tree(s);
297  cmn = root->get_longest_common_substring(l1, *it);
298  cmn = extend_till_mismatches(cmn, l1, *it);
299  if(cmn.empty()){
300  l1_rc = get_reverse_complement(l1);
301  s = l1_rc + "#" + *it + "@";
302  root = build_suffix_tree(s);
303  cmn = root->get_longest_common_substring(l1_rc, *it);
304  cmn = extend_till_mismatches(cmn, l1_rc, *it);
305  pos = l1_rc.find(cmn);
306  beg = l1_rc.length() - pos;
307  adp_pos = (*it).find(cmn);
308  } else {
309  beg = 0;
310  pos = l1.find(cmn);
311  adp_pos = (*it).find(cmn);
312  }
313  if(cmn.length() >= MIN_LENGTH && adp_pos != 0){
314  beg = 0;
315  pos = l1.length();
316  t = align_seqs(l1, *it);
317  if(*t != -1){
318  beg = 0;
319  pos = *(t+1);
320  } else {
321  std::cout << "Reverse!" << std::endl;
322  l1_rc = get_reverse_complement(l1);
323  t = align_seqs(l1_rc, *it);
324  if(*t != -1){
325  pos = l1.length() - *(t+1);
326  beg = *(t+1);
327  }
328  }
329  }
330  std::cout << beg << " " << pos << std::endl;
331  if(beg != 0 || pos != l1.length()){
332  trmd_read = l1.substr(beg, pos);
333  n++;
334  break;
335  }
336  }
337  out << trmd_read << std::endl;
338  }
339  }
340  std::cout << "Number of Adapter Trimmed: :" << n << std::endl;
341  delete root;
342  delete[] t;
343  ff1.close();
344  ff2.close();
345  pf.close();
346  return 0;
347 }