6 #include "suffix_tree.h" 11 for(
unsigned int i = 0; i < MAX_CHAR;i++)
12 this->children[i] = 0;
20 bool suffix_node::is_leaf_node(){
21 return (nchildren == 0);
24 bool suffix_node::contains_child(
int ext){
25 return !(this->children[ext] == 0);
28 int suffix_node::get_length(){
31 return *(this->end) - this->begin + 1;
34 int suffix_node::get_depth(){
44 int get_hamming_distance(std::string s1, std::string s2,
int k){
46 for(
unsigned int i = 0; i < s1.length(); i++){
56 std::string suffix_node::get_longest_common_substring(std::string s1, std::string s2){
57 std::string str = s1+
"#"+s2+
"@";
58 std::string p=
"", tmp, longest_trvsl =
"";
59 unsigned int max = 0, i;
64 tmp = node->get_path(str);
65 if(s1.find(p+tmp) != std::string::npos && s2.find(p+tmp) != std::string::npos)
69 for(i = 0; i < MAX_CHAR;i++){
70 if(node->children[i] == 0)
72 longest_trvsl = node->children[i]->get_longest_common_substring(s1, s2);
73 if(longest_trvsl.length() > max){
74 max = longest_trvsl.length();
82 std::string suffix_node::get_path(std::string s){
85 return s.substr(this->begin, *(this->end) - this->begin + 1);
88 void suffix_node::extend_path(
int *e){
94 this->children[ext] = n;
99 void suffix_node::add_child(
suffix_node* c,
int ext){
100 this->children[ext] = c;
106 return this->children[ext];
109 bool suffix_node::contains_depth(
int depth){
110 return this->get_depth() <= depth && depth <= (this->get_length() + this->get_depth());
113 void suffix_node::print(std::string s){
114 for(
int i = 0; i < this->get_depth(); i++){
117 std::string t = (this->begin == -1) ?
"R" :
" "+s.substr(this->begin, *(this->end) - this->begin + 1);
119 if(this->suffix_link != 0)
120 std::cout <<
" --- ( " << this->suffix_link->parent->get_path(s) <<
" " << this->suffix_link->get_path(s) <<
")";
122 std::cout <<
" - " << this->parent->get_path(s);
123 std::cout << std::endl;
124 for(
unsigned int i = 0; i<alphabet.length();i++){
125 if(this->children[i]!=0)
126 this->children[i]->print(s);
130 bool suffix_node::walk_next(
int &beg,
int &suffix_length){
132 if(suffix_length >= node->get_length()){
133 beg += node->get_length();
134 suffix_length -= node->get_length();
142 root->suffix_link = root;
144 suffix_node *cur_node = root, *new_node = 0, *new_cur_node = 0;
145 int str_ind[MAX_SIZE], suffix_length = 0, *leaf_end=
new int, suffix_count = 0, beg = -1, *end;
146 unsigned int i = 0, n_str_ind = 0;
147 for(i = 0; i < s.length();i++){
148 str_ind[i] = alphabet.find(s[i]);
153 for(i = 0; i < n_str_ind;i++){
157 while(suffix_count > 0){
158 if(suffix_length == 0)
160 if(cur_node->children[str_ind[beg]] == 0){
161 cur_node->add_child(str_ind[beg], beg, leaf_end, 0);
163 new_node->suffix_link = cur_node;
167 new_cur_node = cur_node->children[str_ind[beg]];
168 if(new_cur_node->walk_next(beg, suffix_length)){
169 cur_node = new_cur_node;
172 if(str_ind[new_cur_node->begin + suffix_length] == str_ind[i]){
173 if(new_node != 0 && cur_node->end != 0){
174 new_node->suffix_link = cur_node;
183 *end = new_cur_node->begin + suffix_length - 1;
185 new_int_node = cur_node->add_child(str_ind[beg], new_cur_node->begin, end, 0);
186 new_int_node->add_child(str_ind[i], i, leaf_end, 0);
187 new_cur_node->begin += suffix_length;
188 new_int_node->add_child(new_cur_node, str_ind[new_cur_node->begin]);
190 new_node->suffix_link = new_int_node;
191 new_node = new_int_node;
194 if(cur_node->end==0 && suffix_length > 0){
196 beg = i - suffix_count + 1;
197 }
else if(cur_node->end!=0){
198 if(cur_node->suffix_link != 0){
199 cur_node = cur_node->suffix_link;
201 cur_node = cur_node->parent->suffix_link;
203 }
else if(cur_node->get_length() == 1){
211 std::string get_reverse_complement(std::string rev_read){
213 for(
unsigned int i = 0;i< rev_read.length();i++){
230 std::reverse(rev_read.begin(), rev_read.end());
234 std::vector<std::string> read_adapters_from_fasta(std::string p){
235 std::vector<std::string> adp;
236 std::ifstream fin(p);
238 while (std::getline(fin, line)){
246 std::string extend_till_mismatches(std::string cmn, std::string l, std::string adp){
247 if(cmn.length() < MIN_LENGTH){
250 int pos = adp.find(cmn);
251 int r_pos = l.find(cmn);
257 while(r_pos >= 0 && pos >= 0 && k <= MAX_MISMATCHES){
258 cmn = l[r_pos] + cmn;
259 if(l[r_pos]!=adp[pos])
267 int trim_adapter(std::string f1, std::string f2, std::string adp_path, std::string p){
268 std::string l1, l2, l1_rc, l2_rc, s, cmn, trmd_read;
269 int *t =
new int[2], beg;
271 std::ifstream ff1(f1), ff2, pf(p);
272 std::ofstream out(p+
".trimmed.fastq");
273 std::vector<std::string>::iterator it;
275 std::vector<std::string> adp = read_adapters_from_fasta(adp_path);
276 int i = -1, n = 0, adp_pos;
279 while (std::getline(ff1, l1) && std::getline(ff2, l2)){
285 while (std::getline(ff1, l1)){
288 out << l1 << std::endl;
294 for(it = adp.begin(); it != adp.end(); ++it) {
295 s = l1 +
"#" + *it +
"@";
296 root = build_suffix_tree(s);
297 cmn = root->get_longest_common_substring(l1, *it);
298 cmn = extend_till_mismatches(cmn, l1, *it);
300 l1_rc = get_reverse_complement(l1);
301 s = l1_rc +
"#" + *it +
"@";
302 root = build_suffix_tree(s);
303 cmn = root->get_longest_common_substring(l1_rc, *it);
304 cmn = extend_till_mismatches(cmn, l1_rc, *it);
305 pos = l1_rc.find(cmn);
306 beg = l1_rc.length() - pos;
307 adp_pos = (*it).find(cmn);
311 adp_pos = (*it).find(cmn);
313 if(cmn.length() >= MIN_LENGTH && adp_pos != 0){
316 t = align_seqs(l1, *it);
321 std::cout <<
"Reverse!" << std::endl;
322 l1_rc = get_reverse_complement(l1);
323 t = align_seqs(l1_rc, *it);
325 pos = l1.length() - *(t+1);
330 std::cout << beg <<
" " << pos << std::endl;
331 if(beg != 0 || pos != l1.length()){
332 trmd_read = l1.substr(beg, pos);
337 out << trmd_read << std::endl;
340 std::cout <<
"Number of Adapter Trimmed: :" << n << std::endl;