File: siteSpecificGL.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (215 lines) | stat: -rw-r--r-- 8,415 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
/*
Copyright (C) 2011 Tal Pupko  TalP@tauex.tau.ac.il.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "siteSpecificGL.h"
#include "definitions.h"
#include "numRec.h"
#include "matrixUtils.h"
#include "seqContainerTreeMap.h"
#include "gainLossUtils.h"
#include "gainLossModel.h"
#include "gainLossOptions.h"



// THE BAYESIAN EB_EXP PART OF gain and loss ESTIMATION. //

/*************************************
This function computes the expectation of 
the posterior gain and loss distribution for a specific site
as well as the confidence interval 
*************************************/
// per all sites computation
void computeEB_EXP_siteSpecificGL(Vdouble & GainLossV,
									Vdouble & stdV,
									Vdouble & lowerBoundV,
									Vdouble & upperBoundV,
									VVdouble & posteriorsV,
									const sequenceContainer& sc,
									const vector<vector<stochasticProcess*> >& spVVec,
									const tree& tr,
									const distribution * gainDist,
									const distribution * lossDist,
									const distribution * distPrim,
									const MDOUBLE alphaConf,
									VVVdouble & postProbPerSpPerCatPerPos,	//2 fill (*postProbPerSpPerCatPerPos)[sp][pos]
									unObservableData* unObservableData_p)
{
	LOG(5,<<"Calculating posterior and expectation of posterior values for all sites"<<endl);
	int seqLen = sc.seqLen();
	GainLossV.resize(seqLen);
	stdV.resize(seqLen);
	lowerBoundV.resize(seqLen);
	upperBoundV.resize(seqLen);	
	int numOfSPs = gainDist->categories()*lossDist->categories();
	resizeMatrix(posteriorsV,seqLen,numOfSPs);
	//computePijGam cpg;
	//cpg._V.resize(numOfSPs);
	//for (int i=0; i < numOfSPs; ++i) {
	//	int gainIndex =fromIndex2gainIndex(i,gainDist->categories(),lossDist->categories());
	//	int lossIndex =fromIndex2lossIndex(i,gainDist->categories(),lossDist->categories());
	//	cpg._V[i].fillPij(tr,*spVVec[gainIndex][lossIndex]);
	//}
	for (int pos=0; pos < sc.seqLen(); ++pos) {
		computeEB_EXP_siteSpecificGL(pos, sc, spVVec, tr, gainDist,lossDist,distPrim,posteriorsV[pos],	//cpg
			GainLossV[pos], stdV[pos], lowerBoundV[pos], upperBoundV[pos], alphaConf, postProbPerSpPerCatPerPos,unObservableData_p);
	}
}


/********************************************************************************************
*********************************************************************************************/
void computeEB_EXP_siteSpecificGL(int pos,
									const sequenceContainer& sc,
									const vector<vector<stochasticProcess*> >& spVVec,
									//const computePijGam& cpg,
									const tree &tr,
									const distribution * gainDist,
									const distribution * lossDist,
									const distribution * distPrim,
									Vdouble & posteriorV,
									MDOUBLE& GainLossExpectation,
									MDOUBLE & stdGainLoss,
									MDOUBLE & lowerConf,
									MDOUBLE & upperConf,
									const MDOUBLE alphaConf,
									VVVdouble & postProbPerSpPerCatPerPos,	//2 fill (*postProbPerSpPerCatPerPos)[sp][pos]
									unObservableData* unObservableData_p) // alpha of 0.05 is considered 0.025 for each side.
{
	bool isLpostPerSpPerCatComputed =false;
	if(postProbPerSpPerCatPerPos[0][0][pos]>0)
		isLpostPerSpPerCatComputed =true;


	// here we compute the posterior P(r|data)
	int numOfRateCat = (*spVVec[0][0]).categories();	// ver2
	int numOfSPs = gainDist->categories()*lossDist->categories();

	posteriorV.resize(distPrim->categories(),0.0);
	// ver2
	VVdoubleRep PosteriorVVRateCat;
	resizeMatrix(PosteriorVVRateCat,numOfSPs,numOfRateCat);

	doubleRep dRepTotalLikelihood(0.0);// temporary dblRep for total likelihood

	for (int spIndex=0; spIndex < numOfSPs; ++spIndex) {
		int gainIndex =fromIndex2gainIndex(spIndex,gainDist->categories(),lossDist->categories());
		int lossIndex =fromIndex2lossIndex(spIndex,gainDist->categories(),lossDist->categories());
		
		//int primIndex;
		//if(distPrim == gainDist)
		//	primIndex = gainIndex;
		//else
		//	primIndex = lossIndex;
		
		computePijGam pi;
		pi.fillPij(tr,*spVVec[gainIndex][lossIndex]);
		
		// ver1 - no rate dist in rate computation
		//dblRepPosteriorV[primIndex] += likelihoodComputation::getLofPos(pos,tr,sc,pi,*spVVec[gainIndex][lossIndex])* gainDist->ratesProb(gainIndex)*lossDist->ratesProb(lossIndex);

		// ver2 - with rate dist
		for (int rateInd=0; rateInd < numOfRateCat; ++rateInd) {
			PosteriorVVRateCat[spIndex][rateInd] += likelihoodComputation::getLofPos(pos,tr,sc,pi[rateInd],*spVVec[gainIndex][lossIndex],unObservableData_p)
					* gainDist->ratesProb(gainIndex) * lossDist->ratesProb(lossIndex) * spVVec[gainIndex][lossIndex]->ratesProb(rateInd);
		}		
	}

	// here we compute sigma r * P(r | data)
	GainLossExpectation = 0.0;
	MDOUBLE sumOfSquares = 0.0; // this is the sum of squares. this will be used to compute the variance
	
	// ver1 - no rate dist in rate computation
	//for (int i=0; i < distPrim->categories(); ++i) {
	//	dblRepTotalLikelihood+=dblRepPosteriorV[i];
	//}
	//for (int j=0; j < distPrim->categories(); ++j) {
	//	dblRepPosteriorV[j]/=dblRepTotalLikelihood; // so that posteriorV is probability.
	//	if(unObservableData_p){
	//		dblRepPosteriorV[j] = dblRepPosteriorV[j]/(1- exp(unObservableData_p->getlogLforMissingData()));	// Note: each postProbCat corrected by unObs of all cat
	//	}
	//	posteriorV[j] = convert(dblRepPosteriorV[j]); // revert back to DOUBLE
	//	MDOUBLE tmp = posteriorV[j]*distPrim->rates(j);
	//	GainLossExpectation += tmp;
	//	sumOfSquares += (tmp*distPrim->rates(j));
	//}

	// ver2
	for (int spIndex=0; spIndex < numOfSPs; ++spIndex) {
		for (int i=0; i < numOfRateCat; ++i) {
			dRepTotalLikelihood+=PosteriorVVRateCat[spIndex][i];
		}
	}
	
	
	for (int spIndex=0; spIndex < numOfSPs; ++spIndex) {
		int gainIndex =fromIndex2gainIndex(spIndex,gainDist->categories(),lossDist->categories());
		int lossIndex =fromIndex2lossIndex(spIndex,gainDist->categories(),lossDist->categories());

		int primIndex;
		if(distPrim == gainDist)
			primIndex = gainIndex;
		else
			primIndex = lossIndex;

		for (int i=0; i < numOfRateCat; ++i) {
			PosteriorVVRateCat[spIndex][i]/=convert(dRepTotalLikelihood); // so that posteriorV is probability.
			posteriorV[primIndex] += convert(PosteriorVVRateCat[spIndex][i]);
			MDOUBLE tmp = convert(PosteriorVVRateCat[spIndex][i]) * distPrim->rates(primIndex) * spVVec[0][0]->rates(i); // the rateVal
			GainLossExpectation += tmp;
			sumOfSquares += (tmp * distPrim->rates(primIndex) * spVVec[0][0]->rates(i));	// ???
		}
	}
////////////////////////////////////////////////////////////////////////// ?
	if(!isLpostPerSpPerCatComputed){
		for (int spIndex=0; spIndex < numOfSPs; ++spIndex) {
			for (int rateInd=0; rateInd < numOfRateCat; ++rateInd) {
				postProbPerSpPerCatPerPos[spIndex][rateInd][pos] = convert(PosteriorVVRateCat[spIndex][rateInd]);
			}
		}
	}
	MDOUBLE variance = sumOfSquares - GainLossExpectation*GainLossExpectation; // variance
	//if (!(variance!=0))
	//	errorMsg::reportError("Error in computeEB_EXP_siteSpecificGainLoss, variance = 0");
	stdGainLoss = sqrt(variance); // standard deviation of inferred Ka/Ks

	// detecting the confidence intervals.
	MDOUBLE oneSideConfAlpha = alphaConf/2.0; // because we are computing the two tail.
	MDOUBLE cdf = 0.0; // cumulative density function.
	int k=0;
	while (k < distPrim->categories()){
		cdf += posteriorV[k];
		if (cdf >oneSideConfAlpha) {
			lowerConf = distPrim->rates(k);
			break;
		} 
		k++;
	}
	while (k < distPrim->categories()) {
		if (cdf >(1.0-oneSideConfAlpha)) {
			upperConf = distPrim->rates(k);
			break;
		}
		++k;
		cdf += posteriorV[k];
	}
	if (k==distPrim->categories()) 
		upperConf = distPrim->rates(k-1);
}