File: suffixdb.cpp

package info (click to toggle)
mothur 1.33.3%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 11,248 kB
  • ctags: 12,231
  • sloc: cpp: 152,046; fortran: 665; makefile: 74; sh: 34
file content (85 lines) | stat: -rw-r--r-- 3,150 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/*
 *  suffixdb.cpp
 *  
 *
 *  Created by Pat Schloss on 12/16/08.
 *  Copyright 2008 Patrick D. Schloss. All rights reserved.
 *
 *	This is a child class of the Database abstract datatype.  The class is basically a database of suffix trees and an
 *	encapsulation of the method for finding the most similar tree to an inputted sequence.  the suffixForest objecct
 *	is a vector of SuffixTrees, with each template sequence being represented by a different SuffixTree.  The class also
 *	provides a method to take an unaligned sequence and find the closest sequence in the suffixForest.  The search
 *	method is inspired by the article and Perl source code provided at http://www.ddj.com/web-development/184416093.  I 
 *	would estimate that the time complexity is O(LN) for each search, which is slower than the kmer searching, but 
 *	faster than blast
 *
 */

#include "database.hpp"
#include "sequence.hpp"
#include "suffixtree.hpp"
#include "suffixdb.hpp"

/**************************************************************************************************/

SuffixDB::SuffixDB(int numSeqs) : Database() {
	suffixForest.resize(numSeqs);
	count = 0;
}
/**************************************************************************************************/

SuffixDB::SuffixDB() : Database() {
	count = 0;
}

/**************************************************************************************************/
//assumes sequences have been added using addSequence
vector<int> SuffixDB::findClosestSequences(Sequence* candidateSeq, int num){
	try {
		vector<int> topMatches;
		string processedSeq = candidateSeq->convert2ints();		//	the candidate sequence needs to be a string of ints
		
		vector<seqMatch> seqMatches;
		for(int i=0;i<suffixForest.size();i++){					//	scan through the forest and see what the minimum
			int count = suffixForest[i].countSuffixes(processedSeq);	//	return score is and keep track of the
			seqMatch temp(i, count);
			seqMatches.push_back(temp);
		}
		
		//sorts putting smallest matches first
		sort(seqMatches.begin(), seqMatches.end(), compareSeqMatchesReverse);
		
		searchScore = seqMatches[0].match;
		searchScore = 100 * (1. - searchScore / (float)processedSeq.length());
		
		//save top matches
		for (int i = 0; i < num; i++) {
			topMatches.push_back(seqMatches[i].seq);
		}

		//	return the Sequence object that has the minimum score
		return topMatches;
	}
	catch(exception& e) {
		m->errorOut(e, "SuffixDB", "findClosestSequences");
		exit(1);
	}	
}
/**************************************************************************************************/
//adding the sequences generates the db
void SuffixDB::addSequence(Sequence seq) {
	try {
		suffixForest[count].loadSequence(seq);		
		count++;
	}
	catch(exception& e) {
		m->errorOut(e, "SuffixDB", "addSequence");
		exit(1);
	}
}
/**************************************************************************************************/

SuffixDB::~SuffixDB(){														
	for (int i = (suffixForest.size()-1); i >= 0; i--) {  suffixForest.pop_back();  }
}
/**************************************************************************************************/