File: nj.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (411 lines) | stat: -rw-r--r-- 13,238 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
// $Id: nj.cpp 9948 2011-10-23 15:53:03Z cohenofi $

// version 1.00
// last modified 3 Nov 2002

#include "nj.h"
#include "errorMsg.h"
#include "logFile.h"
#include "treeUtil.h"
#include <cassert>
#include <algorithm>
#include <map>
using namespace std;


//------------------------------------------
// general outline:
// we follow Swofford's book, "Molecular Systematics" pg489.
// currentNodes is the vector of the nodes that are "in process".
// in the beggining, these are all the leaves. Once, 2 leaves are separeted, 
// they are excluded from currentNodes, and their father is added to currentNodes.
// we (almost) finish the algorithm when currentNodes's size is 3. (i.e., we know the topology).
// thus when we start from an evolutionary tree, all we do, is to construct a star (start) tree
//------------------------------------------




//------------------------------------------
// constructor and start
//------------------------------------------
tree NJalg::computeTree(VVdouble distances,const vector<string>& names, const tree * const constriantTree /*= NULL*/){
	assert(distances.size() == names.size());
	tree resTree = startingTree(names);
	if (distances.size()<3) return resTree;
	vector<tree::nodeP> currentNodes;
	resTree.getAllLeaves(currentNodes,resTree.getRoot());
	if (constriantTree) {
		njConstraint njc(resTree, *constriantTree);
		while (currentNodes.size() >= 3) NJiterate(resTree,currentNodes,distances, njc);
	} else {
		while (currentNodes.size() >= 3) NJiterate(resTree,currentNodes,distances);
	}
	resTree.create_names_to_internal_nodes();
	resTree.makeSureAllBranchesArePositive();
	LOGDO(5,resTree.output(myLog::LogFile()));
	return resTree;
}

tree NJalg::startingTree(const vector<string>& names) {
	return starTree(names);
}

tree NJalg::startingTree(const tree& inTree) {
	tree et;
	et.createRootNode();
	vector<tree::nodeP> allLeaves;
	inTree.getAllLeaves(allLeaves,inTree.getRoot());
	
	vector<string> names(allLeaves.size());
	for (int k = 0 ; k < allLeaves.size(); ++k)
	  names[k]=allLeaves[k]->name();
	  
	return startingTree(names);
}

void NJalg::updateBranchDistance(const VVdouble& distanceTable,
								 const Vdouble& rValues,
								 tree::nodeP nodeNew,
								 tree::nodeP nodeI,
								 tree::nodeP nodeJ,
								 int Iplace,
								 int Jplace) {
	MDOUBLE dis= (Iplace<Jplace) ? distanceTable[Iplace][Jplace] : distanceTable[Jplace][Iplace];
	MDOUBLE DisI_new = dis/2.0;
	MDOUBLE tmp = rValues[Iplace] - rValues[Jplace];
	tmp/= (  2.0*(distanceTable.size()-2)  );
	DisI_new = DisI_new+ tmp;
	MDOUBLE DisJ_new = dis - DisI_new;
	if (DisI_new<tree::SHORT_LENGTH_VALUE) DisI_new=tree::SHORT_LENGTH_VALUE; // no negative..
	if (DisJ_new<tree::SHORT_LENGTH_VALUE) DisJ_new=tree::SHORT_LENGTH_VALUE; // no negative..
	nodeI->setDisToFather(DisI_new);
	nodeJ->setDisToFather(DisJ_new);
}

void NJalg::NJiterate(tree& et,
					  vector<tree::nodeP>& currentNodes,
							 VVdouble& distanceTable) {
	Vdouble	rVector = calc_r_values(currentNodes,distanceTable);//CHECK2
	
	if (currentNodes.size() == 3) {
		update3taxaLevel(distanceTable,rVector,currentNodes);
		currentNodes.clear();
		return;
	}
	
	int minRaw,minCol;
	calc_M_matrix(currentNodes,distanceTable,rVector,minRaw,minCol);//CHECK3
	tree::nodeP nodeI = currentNodes[minRaw];
	tree::nodeP nodeJ = currentNodes[minCol];
	tree::nodeP theNewNode;
	theNewNode= SeparateNodes(et,nodeI,nodeJ);
	//CHECK4
	updateBranchDistance(distanceTable,rVector,theNewNode,nodeI,nodeJ,minRaw,minCol);
	//CHECK6
		et.create_names_to_internal_nodes();
	UpdateDistanceTableAndCurrentNodes(currentNodes,distanceTable,nodeI,nodeJ,theNewNode,minRaw,minCol);
}

void NJalg::NJiterate(tree& et,
		      vector<tree::nodeP>& currentNodes,
		      VVdouble& distanceTable,
		      njConstraint& njc) {
	Vdouble	rMatrix = calc_r_values(currentNodes,distanceTable);//CHECK2
	
	if (currentNodes.size() == 3) {
		update3taxaLevel(distanceTable,rMatrix,currentNodes);
		currentNodes.clear();
		return;
	}
	
	int minRaw,minCol;
	calc_M_matrix(currentNodes,distanceTable,rMatrix,minRaw,minCol, njc);//CHECK3
	tree::nodeP nodeI = currentNodes[minRaw];
	tree::nodeP nodeJ = currentNodes[minCol];
	tree::nodeP theNewNode;
	theNewNode= SeparateNodes(et,nodeI,nodeJ);
	njc.join(nodeI, nodeJ, theNewNode);
	//CHECK4
	updateBranchDistance(distanceTable,rMatrix,theNewNode,nodeI,nodeJ,minRaw,minCol);
	//CHECK6
		et.create_names_to_internal_nodes();
	UpdateDistanceTableAndCurrentNodes(currentNodes,distanceTable,nodeI,nodeJ,theNewNode,minRaw,minCol);
	LOGDO(15,et.output(myLog::LogFile(),tree::ANCESTORID));

}



Vdouble NJalg::calc_r_values(vector<tree::nodeP>& currentNodes,
							 const VVdouble& distanceTable) {
	Vdouble r_values(currentNodes.size(),0.0);
	for (int i=0; i <r_values.size();++i) {
		for (int j =0; j < r_values.size();++j) {
			MDOUBLE dis= (i<j) ? distanceTable[i][j] : distanceTable[j][i];
			r_values[i] += dis;
		}
	}
	return r_values;
}

void NJalg::calc_M_matrix(vector<tree::nodeP>& currentNodes,
						  const VVdouble& distanceTable,
						  const Vdouble & r_values,
						  int& minRaw,int& minCol){
	MDOUBLE min = VERYBIG;
	for (int i=0; i < currentNodes.size();++i){
		for (int j =i+1; j < currentNodes.size();++j) {
			MDOUBLE dis= (i<j) ? distanceTable[i][j] : distanceTable[j][i];
			MDOUBLE tmp = dis-(r_values[i]+r_values[j])/(currentNodes.size()-2);
			if (tmp<min) {minRaw = i;minCol=j;min=tmp;}
			
		}
	}
}

void NJalg::calc_M_matrix(vector<tree::nodeP>& currentNodes,
			  const VVdouble& distanceTable,
			  const Vdouble & r_values,
			  int& minRaw,int& minCol, 
			  const njConstraint& njc){
	MDOUBLE min = VERYBIG;
	MDOUBLE min_noc =  VERYBIG;
	int minRaw_noc=-1,minCol_noc=-1;
	for (int i=0; i < currentNodes.size();++i){
	  for (int j =i+1; j < currentNodes.size();++j) {
	    if (njc.isCompatible(currentNodes[i],currentNodes[j])) {
	      MDOUBLE dis= (i<j) ? distanceTable[i][j] : distanceTable[j][i];
	      MDOUBLE tmp = dis-(r_values[i]+r_values[j])/(currentNodes.size()-2);
	      if (tmp<min) {minRaw = i;minCol=j;min=tmp;}
	    }
	    LOGDO(10,{
		    MDOUBLE dis= (i<j) ? distanceTable[i][j] : distanceTable[j][i];
		    MDOUBLE tmp = dis-(r_values[i]+r_values[j])/(currentNodes.size()-2);
		    if (tmp<min_noc) {minRaw_noc = i;minCol_noc=j;min_noc=tmp;}
		  });
	      
	      }
	}
	LOGDO(10, {if (min_noc != min) {myLog::LogFile() 
		    << "NJ-constratin changes outcome " <<
		    currentNodes[minRaw_noc]->name()<<","<<currentNodes[minCol_noc]->name() <<"-> " <<
		    currentNodes[minRaw]    ->name()<<","<<currentNodes[minCol]    ->name()<< 
		    "  ("<<min-min_noc<<")"<<endl; 
		  njc.isCompatible(currentNodes[minRaw_noc], currentNodes[minCol_noc], true);
		  myLog::LogFile() << njc <<endl;
		}
	      });
}

tree::nodeP NJalg::SeparateNodes(tree& et, tree::nodeP node1,
											 tree::nodeP node2) {
	if (node1->father() != node2->father()) 
	 errorMsg::reportError(" error in function NJalg::SeparateNodes - nodes don't have the same father");

	tree::nodeP fatherNode = node1->father();

	tree::nodeP theNewNode = et.createNode(fatherNode,et.getNodesNum());
	node1->setFather(theNewNode);
	theNewNode->setSon(node1);
	node2->setFather(theNewNode);
	theNewNode->setSon(node2);

	// remove from son list of father node.
	fatherNode->removeSon(node1); 

	fatherNode->removeSon(node2); 
	return theNewNode;
}

void NJalg::update3taxaLevel(VVdouble& distanceTable,Vdouble & r_values,
							 vector<tree::nodeP>& currentNodes) {
	// update the distance of the 3 taxa that are left in the end, to the root.
	
	MDOUBLE dis0root = distanceTable[0][1]/2+0.5*(r_values[0]-r_values[1]);
	MDOUBLE dis1root = distanceTable[0][1]/2+0.5*(r_values[1]-r_values[0]);
	MDOUBLE dis2root = distanceTable[0][2]/2+0.5*(r_values[2]-r_values[0]);
	if (dis0root<tree::SHORT_LENGTH_VALUE) dis0root=tree::SHORT_LENGTH_VALUE; // no negative..
	if (dis1root<tree::SHORT_LENGTH_VALUE) dis1root=tree::SHORT_LENGTH_VALUE; // no negative..
	if (dis2root<tree::SHORT_LENGTH_VALUE) dis2root=tree::SHORT_LENGTH_VALUE; // no negative..
	currentNodes[0]->setDisToFather(dis0root);
	currentNodes[1]->setDisToFather(dis1root);
	currentNodes[2]->setDisToFather(dis2root);
}

void NJalg::UpdateDistanceTableAndCurrentNodes(vector<tree::nodeP>& currentNodes,
											   VVdouble& distanceTable,
											   tree::nodeP nodeI,
											   tree::nodeP nodeJ,
											   tree::nodeP theNewNode,
											   int Iplace,
											   int Jplace) {
	//	Iplace is the place of i in the "old" currentNodes vector
	int i,j;
	//	updating currentNodes
	vector<tree::nodeP> newCurrentNode= currentNodes;

	vector<tree::nodeP>::iterator vec_iter1=remove(
		newCurrentNode.begin(),newCurrentNode.end(),nodeI );
	newCurrentNode.erase(vec_iter1,newCurrentNode.end());

	vector<tree::nodeP>::iterator vec_iter2=remove(
	newCurrentNode.begin(),newCurrentNode.end(),nodeJ );
	newCurrentNode.erase(vec_iter2,newCurrentNode.end());
	
	newCurrentNode.push_back(theNewNode);

	map<tree::nodeP,int> nodeIntMap1;
	for (int z=0; z<currentNodes.size();++z) {
		nodeIntMap1.insert(map<tree::nodeP,int>::value_type(currentNodes[z],z));
	}

	VVdouble newDisTable;
	newDisTable.resize(newCurrentNode.size());
	for (int z1=0;z1<newDisTable.size();++z1) newDisTable[z1].resize(newCurrentNode.size(),0.0);

// updatine the table
	for (i=0; i < newCurrentNode.size(); i++) {
		for (j=i+1; j < newCurrentNode.size() ; j++) {
			if ((i!=newCurrentNode.size()-1) && (j!=newCurrentNode.size()-1)) {// both old nodes
				int oldI = nodeIntMap1[newCurrentNode[i]];
				int oldJ = nodeIntMap1[newCurrentNode[j]];
				MDOUBLE dis= (oldI<oldJ) ? distanceTable[oldI][oldJ] : distanceTable[oldJ][oldI];
				newDisTable[i][j] = dis;
			} //else if (i==newCurrentNode.size()-1) { // i is new
			//	newDisTable[i][j] = (dis(Iplace,NewOldPlaces[j])+dis(Jplace,NewOldPlaces[j])-dis(Iplace,Jplace))/2.0;
			//}
			else if (j==newCurrentNode.size()-1) { // j is new
				int oldI = Iplace;
				int oldJ = Jplace;
				int oldK = nodeIntMap1[newCurrentNode[i]];
				MDOUBLE disIK= (oldI<oldK) ? distanceTable[oldI][oldK] : distanceTable[oldK][oldI];
				MDOUBLE disIJ= (oldI<oldJ) ? distanceTable[oldI][oldJ] : distanceTable[oldJ][oldI];
				MDOUBLE disJK= (oldJ<oldK) ? distanceTable[oldJ][oldK] : distanceTable[oldK][oldJ];
				newDisTable[i][j] = 0.5*(disIK+disJK-disIJ); //EQ. 43 SWOFFORD PAGE 489.
			}
		}
	}

	currentNodes=newCurrentNode;
	distanceTable=newDisTable;
}

/*
NJalg::NJalg(){
	_myET = NULL;
}



//-----------------------------
// The algorithm
//-----------------------------

void NJalg::GetDisTable(const sequenceContainer& sd,const vector<MDOUBLE>  * weights) {
	
	VVresize(_startingDistanceTable,distanceTable.size(),distanceTable.size());// for printing stuff later.
	VVresize(LTable,distanceTable.size(),distanceTable.size());// for printing stuff later.

	int i,j;
	_nodeNames.resize(currentNodes.size());
	for ( i=0; i < currentNodes.size(); i++) {
		_nodeNames[i] =(currentNodes[i]->name()); 
		for ( j=i+1; j < currentNodes.size(); j++) {
			MDOUBLE tempDis = -2000.0;
			MDOUBLE resLikelihood;
			int seqnodeI_ID = sd.getId(currentNodes[i]->name());
			int seqnodeJ_ID = sd.getId(currentNodes[j]->name());
			const sequence& snodeI = *sd.getSeqPtr(seqnodeI_ID,true);
			const sequence& snodeJ = *sd.getSeqPtr(seqnodeJ_ID,true);
			tempDis = _cd->giveDistance(snodeI,snodeJ,weights,&resLikelihood);
			distanceTable[i][j] = tempDis;
			LTable[i][j] = resLikelihood;
		}
	}
	if (myLog::LogLevel()>4) {
		for (i=0; i < currentNodes.size(); i++) {
			for (j=i+1; j < currentNodes.size(); j++) {
				LOG(100,<<"nj distance ["<<i<<"]["<<j<<"] ="<<distanceTable[i][j]<<endl);
			}
		}
	}
	//if (myLog::LogLevel()>4) {
	//	for (i=0; i < currentNodes.size(); i++) {
	//		for (j=i+1; j < currentNodes.size(); j++) {
	//			LOG(4,<<"nj likelihood for distance["<<i<<"]["<<j<<"] ="<<LTable[i][j]<<endl);
	//		}
	//	}
	//}
	// for printing stuff later.
	for (int tmp1=0; tmp1<distanceTable.size();++tmp1)
	for (int tmp2=0; tmp2<distanceTable.size();++tmp2) 
	_startingDistanceTable[tmp1][tmp2] = distanceTable[tmp1][tmp2];
}






void NJalg::NJiterate() {
	getMmatrixFromDistanceTable();
	int minRaw,minCol;
	findMinM(minRaw,minCol);
	
	tree::nodeP nodeI = currentNodes[minRaw];
	tree::nodeP nodeJ = currentNodes[minCol];
	tree::nodeP theNewNode;
	theNewNode= SeparateNodes(nodeI,nodeJ);

	//CHECK4

	updateBranchDistance(theNewNode,nodeI,nodeJ,minRaw,minCol);
	//CHECK6

	UpdateDistanceTableAndCurrentNodes(nodeI,nodeJ,theNewNode,minRaw,minCol);
}

		
		












//CHECK1
//cout<<"\n-----------------------------------------------"<<endl;
//for (int h=0; h < currentNodes.size(); h++) cout<<currentNodes[h]->name()<<" = "<<h<<endl;

//CHECK2
//	for (int i =0; i < r_values.size();++i) cout<<"r["<<i<<"] = "<<r_values[i]<<endl;

//CHECK3
//	for (i =0; i < currentNodes.size();++i) 
//		for (int j =i+1; j <currentNodes.size();++j) 
//			cout<<"M["<<i<<"]["<<j<<"] = "<<Mmatrix[i][j]<<endl;

//CHECK4
//	string htuname = "HTU";
//	char k = 'a'+currentNodes.size();
//	htuname+=k;
//	theNewNode->SetName(htuname);		
	
//CHECK5
//_myET->getRoot()->SetName("RootOfStar");		
	
//CHECK6
//	et.output(cout,et.getRoot(),tree::ANCESTOR);

	



*/