1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
|
/*
* knn.cpp
* Mothur
*
* Created by westcott on 11/4/09.
* Copyright 2009 Schloss Lab. All rights reserved.
*
*/
#include "knn.h"
/**************************************************************************************************/
Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n, int tid)
: Classify(), num(n), search(method) {
try {
threadID = tid;
//create search database and names vector
generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
}
catch(exception& e) {
m->errorOut(e, "Knn", "Knn");
exit(1);
}
}
/**************************************************************************************************/
void Knn::setDistName(string s) {
try {
outDistName = s;
ofstream outDistance;
m->openOutputFile(outDistName, outDistance);
outDistance << "Name\tBestMatch\tDistance" << endl;
outDistance.close();
}
catch(exception& e) {
m->errorOut(e, "Knn", "setDistName");
exit(1);
}
}
/**************************************************************************************************/
Knn::~Knn() {
try {
delete phyloTree;
if (database != NULL) { delete database; }
}
catch(exception& e) {
m->errorOut(e, "Knn", "~Knn");
exit(1);
}
}
/**************************************************************************************************/
string Knn::getTaxonomy(Sequence* seq) {
try {
string tax;
//use database to find closest seq
vector<int> closest = database->findClosestSequences(seq, num);
if (search == "distance") { ofstream outDistance; m->openOutputFileAppend(outDistName, outDistance); outDistance << seq->getName() << '\t' << database->getName(closest[0]) << '\t' << database->getSearchScore() << endl; outDistance.close(); }
if (m->control_pressed) { return tax; }
vector<string> closestNames;
for (int i = 0; i < closest.size(); i++) {
//find that sequences taxonomy in map
it = taxonomy.find(names[closest[i]]);
//is this sequence in the taxonomy file
if (it == taxonomy.end()) { //error not in file
m->mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file. It will be eliminated as a match to sequence " + seq->getName() + "."); m->mothurOutEndLine();
}else{ closestNames.push_back(it->first); }
}
if (closestNames.size() == 0) {
m->mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. "); m->mothurOutEndLine();
tax = "unknown;";
}else{
tax = findCommonTaxonomy(closestNames);
if (tax == "") { m->mothurOut("There are no common levels for sequence " + seq->getName() + ". "); m->mothurOutEndLine(); tax = "unknown;"; }
}
simpleTax = tax;
return tax;
}
catch(exception& e) {
m->errorOut(e, "Knn", "getTaxonomy");
exit(1);
}
}
/**************************************************************************************************/
string Knn::findCommonTaxonomy(vector<string> closest) {
try {
/*vector< vector<string> > taxons; //taxon[0] = vector of taxonomy info for closest[0].
//so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
//taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria....
taxons.resize(closest.size());
int smallest = 100;
for (int i = 0; i < closest.size(); i++) {
if (m->control_pressed) { return "control"; }
string tax = taxonomy[closest[i]]; //we know its there since we checked in getTaxonomy
cout << tax << endl;
taxons[i] = parseTax(tax);
//figure out who has the shortest taxonomy info. so you can start comparing there
if (taxons[i].size() < smallest) {
smallest = taxons[i].size();
}
}
//start at the highest level all the closest seqs have
string common = "";
for (int i = (smallest-1); i >= 0; i--) {
if (m->control_pressed) { return "control"; }
string thistax = taxons[0][i];
int num = 0;
for (int j = 1; j < taxons.size(); j++) {
if (taxons[j][i] != thistax) { break; }
num = j;
}
if (num == (taxons.size()-1)) { //they all match at this level
for (int k = 0; k <= i; k++) {
common += taxons[0][k] + ';';
}
break;
}
}*/
string conTax;
//create a tree containing sequences from this bin
PhyloTree* p = new PhyloTree();
for (int i = 0; i < closest.size(); i++) {
p->addSeqToTree(closest[i], taxonomy[closest[i]]);
}
//build tree
p->assignHeirarchyIDs(0);
TaxNode currentNode = p->get(0);
//at each level
while (currentNode.children.size() != 0) { //you still have more to explore
TaxNode bestChild;
int bestChildSize = 0;
//go through children
for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
TaxNode temp = p->get(itChild->second);
//select child with largest accessions - most seqs assigned to it
if (temp.accessions.size() > bestChildSize) {
bestChild = p->get(itChild->second);
bestChildSize = temp.accessions.size();
}
}
if (bestChildSize == closest.size()) { //if yes, add it
conTax += bestChild.name + ";";
}else{ //if no, quit
break;
}
//move down a level
currentNode = bestChild;
}
delete p;
return conTax;
}
catch(exception& e) {
m->errorOut(e, "Knn", "findCommonTaxonomy");
exit(1);
}
}
/**************************************************************************************************/
|