File: kmerdb.cpp

package info (click to toggle)
mothur 1.48.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 13,692 kB
  • sloc: cpp: 161,866; makefile: 122; sh: 31
file content (283 lines) | stat: -rwxr-xr-x 9,559 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/*
 *  kmerdb.cpp
 *  
 *
 *  Created by Pat Schloss on 12/16/08.
 *  Copyright 2008 Patrick D. Schloss. All rights reserved.
 *
 *	This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
 *	a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
 *	kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
 *	different number of kmers and each column contains the index to sequences that use that kmer.
 *
 *	Construction of an object of this type will first look for an appropriately named database file and if it is found
 *	then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
 *	(generateKmerDB)
 *
 *	The search method used here is roughly the same as that used in the SimRank program that is found at the
 *	greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
 *	on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
 *	time would be on the order of O(0.1LN) -> fast.
 *

 */

#include "sequence.hpp"
#include "kmer.hpp"
#include "searchdatabase.hpp"
#include "kmerdb.hpp"

/**************************************************************************************************/

KmerDB::KmerDB(string fastaFileName, int kSize) : SearchDatabase(), kmerSize(kSize) {
	try { 
	
		kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
		
		int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
		count = 0;
		
		maxKmer = power4s[kmerSize];
		kmerLocations.resize(maxKmer+1);
        
        CurrentFile* current; current = CurrentFile::getInstance();
        version = current->getVersion();
		
	}
	catch(exception& e) {
		m->errorOut(e, "KmerDB", "KmerDB");
		exit(1);
	}	

}
/**************************************************************************************************/
KmerDB::KmerDB() : SearchDatabase() {
    CurrentFile* current; current = CurrentFile::getInstance();
    version = current->getVersion();
}
/**************************************************************************************************/

KmerDB::~KmerDB(){}

/**************************************************************************************************/

vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num, vector<float>& Scores) const{
	try {
		if (num > numSeqs) { m->mothurOut("[WARNING]: you requested " + toString(num) + " closest sequences, but the template only contains " + toString(numSeqs) + ", adjusting.\n");  num = numSeqs; }
		
		vector<int> topMatches;
		Kmer kmer(kmerSize);
		float searchScore = 0;
		Scores.clear();
		
		vector<int> matches(numSeqs, 0);						//	a record of the sequences with shared kmers
		vector<bool> timesKmerFound(kmerLocations.size()+1, false);	//	a record of the kmers that we have already found
		
		int numKmers = candidateSeq->getNumBases() - kmerSize + 1;	
	
		for(int i=0;i<numKmers;i++){
			int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);		//	go through the query sequence and get a kmer number
			if(!timesKmerFound[kmerNumber]){				//	if we haven't seen it before...
				for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
					matches[kmerLocations[kmerNumber][j]]++;	//	that kmer
				}
			}
			timesKmerFound[kmerNumber] = true;						//	ok, we've seen the kmer now
		}
		
		if (num != 1) {
			vector<seqMatch> seqMatches; seqMatches.resize(numSeqs);
			for(int i=0;i<numSeqs;i++){		
				seqMatches[i].seq = i;
				seqMatches[i].match = matches[i];
			}
			
			//sorts putting largest matches first
			sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
			
			searchScore = seqMatches[0].match;
			searchScore = 100 * searchScore / (float) numKmers;		//	return the Sequence object corresponding to the db
            Scores.push_back(searchScore);
            
			//save top matches
			for (int i = 0; i < num; i++) {
				topMatches.push_back(seqMatches[i].seq);
				float thisScore = 100 * seqMatches[i].match / (float) numKmers;
				Scores.push_back(thisScore);
			}
		}else{
			int bestIndex = 0;
			int bestMatch = -1;
            
			for(int i=0;i<numSeqs;i++){	
				
				if (matches[i] > bestMatch) {
					bestIndex = i;
					bestMatch = matches[i];
				}
			}
            
			searchScore = bestMatch;
			searchScore = 100 * searchScore / (float) numKmers;		//	return the Sequence object corresponding to the db
			topMatches.push_back(bestIndex);
			Scores.push_back(searchScore);
		}
		return topMatches;		
	}
	catch(exception& e) {
		m->errorOut(e, "KmerDB", "findClosestSequences");
		exit(1);
	}	
}

/**************************************************************************************************/
//print shortcut file
void KmerDB::generateDB(){
	try {
		
		ofstream kmerFile;										//	once we have the kmerLocations folder print it out
		util.openOutputFile(kmerDBName, kmerFile);					//	to a file
		
		//output version
		kmerFile << "#" << version << endl;
		
		for(int i=0;i<maxKmer;i++){								//	step through all of the possible kmer numbers
			kmerFile << i << ' ' << kmerLocations[i].size();	//	print the kmer number and the number of sequences with
			for(int j=0;j<kmerLocations[i].size();j++){			//	that kmer.  then print out the indices of the sequences
				kmerFile << ' ' << kmerLocations[i][j];			//	with that kmer.
			}
			kmerFile << endl;
		}
		kmerFile.close();
		
	}
	catch(exception& e) {
		m->errorOut(e, "KmerDB", "generateDB");
		exit(1);
	}	
	
}
/**************************************************************************************************/
void KmerDB::addSequence(Sequence seq) {
	try {
		Kmer kmer(kmerSize);
		
		string unaligned = seq.getUnaligned();	//	...take the unaligned sequence...
		int numKmers = unaligned.length() - kmerSize + 1;
			
		vector<int> seenBefore(maxKmer+1,0);
		for(int j=0;j<numKmers;j++){						//	...step though the sequence and get each kmer...
			int kmerNumber = kmer.getKmerNumber(unaligned, j);
			if(seenBefore[kmerNumber] == 0){
				kmerLocations[kmerNumber].push_back(count);		//	...insert the sequence index into kmerLocations for
			}												//	the appropriate kmer number
			seenBefore[kmerNumber] = 1;
		}													
	
		count++;
	}
	catch(exception& e) {
		m->errorOut(e, "KmerDB", "addSequence");
		exit(1);
	}	
}
/**************************************************************************************************/
//reads fasta file
void KmerDB::readSeqs(ifstream& fastaFile){
    try {
          
        while (!fastaFile.eof()) {
            
            if (m->getControl_pressed()) { break;  }
            
            Sequence seq(fastaFile); gobble(fastaFile);
            
            addSequence(seq);
        }
        
    }
    catch(exception& e) {
        m->errorOut(e, "KmerDB", "readSeqs");
        exit(1);
    }
}
/**************************************************************************************************/
//reads shortcut file
void KmerDB::readDB(ifstream& kmerDBFile){
	try {
					
		kmerDBFile.seekg(0);									//	start at the beginning of the file
		
		//read version
		string line = util.getline(kmerDBFile); gobble(kmerDBFile);
		
		string seqName;
		int seqNumber;

		for(int i=0;i<maxKmer;i++){
			int numValues = 0;	
			kmerDBFile >> seqName >> numValues;
			
			for(int j=0;j<numValues;j++){						//	for each kmer number get the...
				kmerDBFile >> seqNumber;						//		1. number of sequences with the kmer number
				kmerLocations[i].push_back(seqNumber);			//		2. sequence indices
			}
		}
		kmerDBFile.close();
		
	}
	catch(exception& e) {
		m->errorOut(e, "KmerDB", "readDB");
		exit(1);
	}	
}

/**************************************************************************************************/
int KmerDB::getCount(int kmer) {
	try {
		if (kmer < 0) { return 0; }  //if user gives negative number
		else if (kmer > maxKmer) {	return 0;	}  //or a kmer that is bigger than maxkmer
		else {	return kmerLocations[kmer].size();	}  // kmer is in vector range
	}
	catch(exception& e) {
		m->errorOut(e, "KmerDB", "getCount");
		exit(1);
	}	
}
/**************************************************************************************************/
int KmerDB::getReversed(int kmerNumber) {
	try {
		Kmer kmer(kmerSize);
		
		if (kmerNumber < 0) { return 0; }  //if user gives negative number
		else if (kmerNumber > maxKmer) {	return 0;	}  //or a kmer that is bigger than maxkmer
		else {	return kmer.getReverseKmerNumber(kmerNumber);	}  // kmer is in vector range
	}
	catch(exception& e) {
		m->errorOut(e, "KmerDB", "getReversed");
		exit(1);
	}	
}
/**************************************************************************************************/
vector<int> KmerDB::getSequencesWithKmer(int kmer) {
	try {
		
		vector<int> seqs;
	
		if (kmer < 0) { }  //if user gives negative number
		else if (kmer > maxKmer) {	}  //or a kmer that is bigger than maxkmer
		else {	seqs = kmerLocations[kmer];	}
		
		return seqs;
	}
	catch(exception& e) {
		m->errorOut(e, "KmerDB", "getSequencesWithKmer");
		exit(1);
	}	
}
/**************************************************************************************************/


/**************************************************************************************************/