File: knn.cpp

package info (click to toggle)
mothur 1.24.1-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 7,868 kB
  • sloc: cpp: 110,948; ansic: 2,037; fortran: 665; makefile: 74; sh: 59
file content (187 lines) | stat: -rw-r--r-- 5,648 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/*
 *  knn.cpp
 *  Mothur
 *
 *  Created by westcott on 11/4/09.
 *  Copyright 2009 Schloss Lab. All rights reserved.
 *
 */

#include "knn.h"

/**************************************************************************************************/
Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n, int tid) 
: Classify(), num(n), search(method) {
	try {
		threadID = tid;
		
		//create search database and names vector
		generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
	}
	catch(exception& e) {
		m->errorOut(e, "Knn", "Knn");
		exit(1);
	}
}
/**************************************************************************************************/
void Knn::setDistName(string s) {
	try {
		outDistName = s;
		ofstream outDistance;
		m->openOutputFile(outDistName, outDistance);
		outDistance << "Name\tBestMatch\tDistance" << endl;
		outDistance.close();
	}
	catch(exception& e) {
		m->errorOut(e, "Knn", "setDistName");
		exit(1);
	}
}
/**************************************************************************************************/
Knn::~Knn() {
	try {
		 delete phyloTree; 
		 if (database != NULL) {  delete database; }
	}
	catch(exception& e) {
		m->errorOut(e, "Knn", "~Knn");
		exit(1);
	}
}
/**************************************************************************************************/
string Knn::getTaxonomy(Sequence* seq) {
	try {
		string tax;
		
		//use database to find closest seq
		vector<int> closest = database->findClosestSequences(seq, num);
	
		if (search == "distance") { ofstream outDistance; m->openOutputFileAppend(outDistName, outDistance); outDistance << seq->getName() << '\t' << database->getName(closest[0]) << '\t' << database->getSearchScore() << endl; outDistance.close();  }
	
		if (m->control_pressed) { return tax; }

		vector<string> closestNames;
		for (int i = 0; i < closest.size(); i++) {
			//find that sequences taxonomy in map
			it = taxonomy.find(names[closest[i]]);
		
			//is this sequence in the taxonomy file
			if (it == taxonomy.end()) { //error not in file
				m->mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file.  It will be eliminated as a match to sequence " + seq->getName() + "."); m->mothurOutEndLine();
			}else{   closestNames.push_back(it->first);	}
		}
		
		if (closestNames.size() == 0) {
			m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. "); m->mothurOutEndLine();
			tax = "unknown;";
		}else{
			tax = findCommonTaxonomy(closestNames);
			if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". "); m->mothurOutEndLine(); tax = "unknown;"; }
		}
		
		simpleTax = tax;
		return tax;	
	}
	catch(exception& e) {
		m->errorOut(e, "Knn", "getTaxonomy");
		exit(1);
	}
}
/**************************************************************************************************/
string Knn::findCommonTaxonomy(vector<string> closest)  {
	try {
		/*vector< vector<string> > taxons;  //taxon[0] = vector of taxonomy info for closest[0].
										//so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
										//taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria....
										
		taxons.resize(closest.size());
		int smallest = 100;
		
		for (int i = 0; i < closest.size(); i++) {
			if (m->control_pressed) { return "control"; }
		
			string tax = taxonomy[closest[i]];  //we know its there since we checked in getTaxonomy
			cout << tax << endl;
		
			taxons[i] = parseTax(tax);
		
			//figure out who has the shortest taxonomy info. so you can start comparing there
			if (taxons[i].size() < smallest) {
				smallest = taxons[i].size();
			}
		}
	
		//start at the highest level all the closest seqs have
		string common = "";
		for (int i = (smallest-1); i >= 0; i--) {
			if (m->control_pressed) { return "control"; }

			string thistax = taxons[0][i];
			int num = 0;
			for (int j = 1; j < taxons.size(); j++) {
				if (taxons[j][i] != thistax) { break; }
				num = j;
			}
		
			if (num == (taxons.size()-1)) { //they all match at this level
				for (int k = 0; k <= i; k++) {
					common += taxons[0][k] + ';';
				}
				break;
			}
		}*/
		
		string conTax;
		
		//create a tree containing sequences from this bin
		PhyloTree* p = new PhyloTree();
		
		for (int i = 0; i < closest.size(); i++) {
			p->addSeqToTree(closest[i], taxonomy[closest[i]]);
		}
		
		//build tree
		p->assignHeirarchyIDs(0);
		
		TaxNode currentNode = p->get(0);
		
		//at each level
		while (currentNode.children.size() != 0) { //you still have more to explore
			
			TaxNode bestChild;
			int bestChildSize = 0;
			
			//go through children
			for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
				
				TaxNode temp = p->get(itChild->second);
				
				//select child with largest accessions - most seqs assigned to it
				if (temp.accessions.size() > bestChildSize) {
					bestChild = p->get(itChild->second);
					bestChildSize = temp.accessions.size();
				}
				
			}
			
			if (bestChildSize == closest.size()) { //if yes, add it
				conTax += bestChild.name + ";";
			}else{ //if no, quit
				break;
			}
			
			//move down a level
			currentNode = bestChild;
		}
		
		delete p;	
		
		return conTax;
	}
	catch(exception& e) {
		m->errorOut(e, "Knn", "findCommonTaxonomy");
		exit(1);
	}
}
/**************************************************************************************************/