File: bblEMfixRoot.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (236 lines) | stat: -rw-r--r-- 9,549 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
// $Id: bblEM.cpp 4478 2008-07-17 17:09:55Z cohenofi $
#include "bblEMfixRoot.h"
#include "likelihoodComputation.h"
using namespace likelihoodComputation;
#include "computeUpAlg.h"
#include "computeDownAlg.h"
#include "computeCounts.h"
#include "treeIt.h"
#include "fromCountTableComponentToDistancefixRoot.h"
#include <ctime>

bblEMfixRoot::bblEMfixRoot(tree& et,
				const sequenceContainer& sc,
				const stochasticProcess& sp,
				const Vdouble * weights,
				const int maxIterations,
				const MDOUBLE epsilon,
				const MDOUBLE tollForPairwiseDist,
				unObservableData*  unObservableData_p,
				const MDOUBLE* likelihoodLast) :
_et(et),_sc(sc),_sp(sp),_weights (weights),_unObservableData_p(unObservableData_p) 
{
	//if(!plogLforMissingData){
	//	_plogLforMissingData = NULL;
	//}
	_treeLikelihood = compute_bblEM(maxIterations,epsilon,tollForPairwiseDist,likelihoodLast);
}

/********************************************************************************************
*********************************************************************************************/
MDOUBLE bblEMfixRoot::compute_bblEM(
			const int maxIterations,
			const MDOUBLE epsilon,
			const MDOUBLE tollForPairwiseDist,
			const MDOUBLE* likelihoodLast){
	allocatePlace();
	MDOUBLE oldL=VERYSMALL;
	MDOUBLE currL = VERYSMALL;
	tree oldT = _et;
	for (int i=0; i < maxIterations; ++i) {
		//if(_unObservableData_p)
		//	_unObservableData_p->setLforMissingData(_et,&_sp);

		computeUp();
		currL = likelihoodComputation::getTreeLikelihoodFromUp2(_et,_sc,_sp,_cup,_posLike,_weights,_unObservableData_p);
		LOGnOUT(4,<<"--- Iter="<<i<<" logL="<<currL<<endl);
		//if(_unObservableData_p){
			//if(!likelihoodLast)
			//	LOGnOUT(4,<<" WARNING!!! likelihoodLast was not sent to bblEM with unObservableData prog"<<endl);				
		if(oldL<=currL){	// make sure not to use tree with lower likelihood then last computed likelihood (before BBL-EM)
				if(likelihoodLast){
					if(*likelihoodLast<=currL)
						oldT = _et;	 // L didn't go down
					else
						LOGnOUT(4,<<"Likelihood went down compared pre-BBL oldL="<<*likelihoodLast<<" newL="<<currL<<" Do not update tree"<<endl);
				}
				else{
					oldT = _et;	 // L didn't go down
					LOGnOUT(7,<<"LikelihoodLast was not sent to bblEM"<<endl);
				}
		}
		else
			LOGnOUT(4,<<"Likelihood went down oldL="<<oldL<<" newL="<<currL<<" Do not update tree"<<endl);				
		//}
		//else
			//oldT = _et;	// all application that don't need to correct for unObservableData, update tree
		
		
		if (currL < oldL + epsilon) { // need to break
			if (currL<=oldL) {
				_et = oldT;
				if(_unObservableData_p)
					_unObservableData_p->setLforMissingData(_et,&_sp);
				return oldL; // keep the old tree, and old likelihood
			} else {
                //update the tree and likelihood and return
				return currL;
			}
		}
        bblEM_it(tollForPairwiseDist);
		oldL = currL;
	}
	// in the case were we reached max_iter, we have to recompute the likelihood of the new tree...
	computeUp();
	if(_unObservableData_p)
		_unObservableData_p->setLforMissingData(_et,&_sp);
	currL = likelihoodComputation::getTreeLikelihoodFromUp2(_et,_sc,_sp,_cup,_posLike,_weights, _unObservableData_p);

	if (currL<=oldL) 
	{
		_et = oldT;
		if(_unObservableData_p)
			_unObservableData_p->setLforMissingData(_et,&_sp);
		return oldL; // keep the old tree, and old likelihood
	} 
	else 
        return currL;
}

/********************************************************************************************
*********************************************************************************************/
void bblEMfixRoot::allocatePlace() {
	
	_computeCountsV.resize(_et.getNodesNum());//initiateTablesOfCounts
	for (int node=0; node < _computeCountsV.size(); ++node) {
	{
		_computeCountsV[node].resize(_sp.alphabetSize()); 
		for (int letterAtRoot = 0; letterAtRoot < _computeCountsV[node].size(); ++letterAtRoot)
			_computeCountsV[node][letterAtRoot].countTableComponentAllocatePlace(_sp.alphabetSize(),_sp.categories()); //_computeCountsV[node][letterAtRoot][rate][alph][alph]
			//_computeCountsV[i][letterAtRoot].zero();	// removed, a BUG, done later	
		}
	}
	
	_cup.allocatePlace(_sc.seqLen(),_sp.categories(), _et.getNodesNum(), _sc.alphabetSize());	
	_cdown.resize(_sp.categories());
	for (int categor = 0; categor < _sp.categories(); ++categor)
	{
		// stay with the convention of fillComputeDownNonReversible where the first index is for rate cat and the second is for letterAtRoot
		_cdown[categor].allocatePlace(_sp.alphabetSize(), _et.getNodesNum(), _sc.alphabetSize()); //_cdown[categ][letter@root][nodeid][letter][prob]
	}
}

/********************************************************************************************
*********************************************************************************************/
void bblEMfixRoot::bblEM_it(const MDOUBLE tollForPairwiseDist){
	string costTable =  "costTableBBLEMit.txt";	//DEBUG 
	ofstream costTableStream(costTable.c_str());	//DEBUG
	//cout<<"before zero\n";
	for (int node=0; node < _computeCountsV.size(); ++node) {
		for (int letAtRoot=0; letAtRoot < _computeCountsV[node].size(); ++letAtRoot) {
			_computeCountsV[node][letAtRoot].zero();
			_computeCountsV[node][letAtRoot].printTable(costTableStream);	//DEBUG
		}
	}
	//cout<<"after zero\n";

	for (int i=0; i < _sc.seqLen(); ++i) {
		computeDown(i);
		addCounts(i); // computes the counts and adds to the table.
	}
	//cout<<"after add counts\n";
	for (int node=0; node < _computeCountsV.size(); ++node) {
		for (int letAtRoot=0; letAtRoot < _computeCountsV[node].size(); ++letAtRoot) {
			_computeCountsV[node][letAtRoot].printTable(costTableStream);	//DEBUG
		}
	}


	optimizeBranches(tollForPairwiseDist);
	if(_unObservableData_p)
		_unObservableData_p->setLforMissingData(_et,&_sp);
}

/********************************************************************************************
*********************************************************************************************/
void bblEMfixRoot::optimizeBranches(const MDOUBLE tollForPairwiseDist){
	treeIterDownTopConst tIt(_et);
	for (tree::nodeP mynode = tIt.first(); mynode != tIt.end(); mynode = tIt.next()) {
		if (!tIt->isRoot()) {
			fromCountTableComponentToDistancefixRoot from1(_computeCountsV[mynode->id()],_sp,tollForPairwiseDist,mynode->dis2father(),_unObservableData_p);
			from1.computeDistance();
			mynode->setDisToFather(from1.getDistance());

			if(false){	//DEBUG
				if(_unObservableData_p)
					_unObservableData_p->setLforMissingData(_et,&_sp);
				computeUp();
				MDOUBLE bL = likelihoodComputation::getTreeLikelihoodFromUp2(_et,_sc,_sp,_cup,_posLike,_weights, _unObservableData_p);
				LOG(6,<<"  node "<<mynode->name()<<" L= "<<bL<<endl);
			}
		}
	}
}

/********************************************************************************************
*********************************************************************************************/
void bblEMfixRoot::computeUp(){
	_pij.fillPij(_et,_sp,0); // 0 is becaues we compute Pij(t) and not its derivations...
	computeUpAlg cupAlg;
	for (int pos=0; pos < _sc.seqLen(); ++pos) {
        for (int categor = 0; categor < _sp.categories(); ++categor) {
			cupAlg.fillComputeUp(_et,_sc,pos,_pij[categor],_cup[pos][categor]);
		}
	}
 }

/********************************************************************************************
*********************************************************************************************/
void bblEMfixRoot::computeDown(const int pos){
	computeDownAlg cdownAlg;
		for (int categor = 0; categor < _sp.categories(); ++categor) {
			cdownAlg.fillComputeDownNonReversible(_et,_sc,pos,_pij[categor],_cdown[categor],_cup[pos][categor]);
		}
 }

/********************************************************************************************
*********************************************************************************************/
void bblEMfixRoot::addCounts(const int pos){
	//MDOUBLE posProb = 
	//	likelihoodComputation::getProbOfPosWhenUpIsFilledGam(pos,_et,_sc,_sp,_cup);
						
	MDOUBLE weig = (_weights ? (*_weights)[pos] : 1.0);
	if (weig == 0) return;
	treeIterDownTopConst tIt(_et);
	for (tree::nodeP mynode = tIt.first(); mynode != tIt.end(); mynode = tIt.next()) {
		if (!tIt->isRoot()) {
			addCountsFixedRoot(pos,mynode,_posLike[pos],weig);
		}
	}
}

/********************************************************************************************
*********************************************************************************************/
// fill _computeCountsV: specific node, letterAtRoot and categor at a time
void bblEMfixRoot::addCountsFixedRoot(const int pos, tree::nodeP mynode, const doubleRep posProb, const MDOUBLE weig){

	computeCounts cc;
	for(int letterAtRoot = 0; letterAtRoot < _sp.alphabetSize(); letterAtRoot++)
	{
		for (int categor =0; categor< _sp.categories(); ++ categor) 
		{
				cc.computeCountsNodeFatherNodeSonHomPos(_sc,
											_pij[categor],
											_sp,
											_cup[pos][categor],
											_cdown[categor][letterAtRoot],
											weig,
											posProb,
											mynode,
											_computeCountsV[mynode->id()][letterAtRoot][categor],
											_sp.ratesProb(categor),
											letterAtRoot);	// letterInFather is used in freq? or already by _cdown?
		}
	}
}