File: evaluateCharacterFreq.cpp

package info (click to toggle)
fastml 3.11-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,772 kB
  • sloc: cpp: 48,522; perl: 3,588; ansic: 819; makefile: 386; python: 83; sh: 55
file content (153 lines) | stat: -rw-r--r-- 4,322 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// $Id: evaluateCharacterFreq.cpp 10474 2012-03-18 07:54:07Z itaymay $

#include "evaluateCharacterFreq.h"
#include "someUtil.h"
#include <cassert>

vector<MDOUBLE> sumAlphabetCounts(const sequenceContainer & sc) {
	vector<MDOUBLE> charFreq(sc.alphabetSize(),0.0);
	sequenceContainer::constTaxaIterator tIt;
	sequenceContainer::constTaxaIterator tItEnd;
	tIt.begin(sc);
	tItEnd.end(sc);
	while (tIt!= tItEnd) {
		sequence::constIterator sIt;
		sequence::constIterator sItEnd;
		sIt.begin(*tIt);
		sItEnd.end(*tIt);
		while (sIt != sItEnd) {
			if ((*sIt >= 0) && (*sIt <charFreq.size())) ++charFreq[(*sIt)];
			++sIt;
		}
		++tIt;
	}
	return charFreq;
}

void changeCountsToFreqs(vector<MDOUBLE>& charFreq){
	MDOUBLE sumA = 0;
	int i=0;
	for (i=0; i < charFreq.size(); ++i) {
        sumA+=charFreq[i] ;
	}
	for (i=0; i < charFreq.size(); ++i) {
		charFreq[i] /= sumA;
	}
}

void makeSureNoZeroFreqs(vector<MDOUBLE> & charFreq){
	// CORRECT SO THAT THERE ARE NO ZERO FREQUENCIES.
	// ALL FREQS THAT WERE ZERO ARE CHANGED
	MDOUBLE ZERO_FREQ = 0.0000000001;
	MDOUBLE sumB=0;
	int charWithZeroFreq = 0;
	int i=0;
	for (i=0; i < charFreq.size(); ++i) {
		if (DSMALL_EQUAL(charFreq[i], ZERO_FREQ)) {
			charFreq[i] = ZERO_FREQ;
			++charWithZeroFreq;
		}
		else sumB +=charFreq[i];
	}
	if (!DEQUAL(sumB, 1.0, 0.01))
	{
		cerr.precision(10);	
		cerr<<"sumFreq = "<<sumB<<endl;
		//errorMsg::reportError("error in makeSureNoZeroFreqs(). Input frequencies must sum to 1.0");
	}
	scaleVec(charFreq, 1.0/charFreq.size());
	//MDOUBLE scaleFactor = sumB - (charWithZeroFreq * ZERO_FREQ);
	//for (i=0; i < charFreq.size(); ++i) {
	//	if (charFreq[i] != ZERO_FREQ)
	//		charFreq[i] *= scaleFactor;
	//}
}


vector<MDOUBLE> evaluateCharacterFreq(const sequenceContainer & sc) {
	vector<MDOUBLE> charFreq=sumAlphabetCounts(sc);
	changeCountsToFreqs(charFreq);
	makeSureNoZeroFreqs(charFreq);
	return charFreq;
}

VVdouble evaluateCharacterFreqOneForEachGene(const vector<sequenceContainer> & scVec){
	VVdouble charFreq;
	for (int k=0; k < scVec.size(); ++k) {
		charFreq.push_back(evaluateCharacterFreq(scVec[k]));
	}
	return charFreq;
}

		


vector<MDOUBLE> evaluateCharacterFreqBasedOnManyGenes(const vector<sequenceContainer> & scVec) {
	// note: all alphabets have to be the same!
	vector<MDOUBLE> charFreq(scVec[0].alphabetSize(),0.0);
	for (int i=0; i < scVec.size();++i) {
		assert(scVec[0].getAlphabet()->size()==scVec[i].getAlphabet()->size());
        vector<MDOUBLE> charFreqTmp=sumAlphabetCounts(scVec[i]);
		for (int z=0; z < charFreq.size();++z) charFreq[z]+=charFreqTmp[z];
	}
	changeCountsToFreqs(charFreq);
	makeSureNoZeroFreqs(charFreq);
	return charFreq;
}

//returns the number of each character in each position. 
//NOTE: returns also the number of unknown charecters in the last place in each vector, so that the actual vector size for each position is alphabetSize()+1
void getCharacterCounts(const sequenceContainer & sc, VVint& counts4pos)
{
	const alphabet* pAlph = sc.getAlphabet();
	int alphSize = sc.alphabetSize();
	int pos; 
	counts4pos.resize(sc.seqLen());
	for (pos = 0; pos < sc.seqLen(); ++pos)
		counts4pos[pos].resize(alphSize + 1, 0);

	for (int seq = 0; seq < sc.numberOfSeqs();++seq) 
	{
		int id = sc.placeToId(seq);
		for (pos = 0; pos < sc.seqLen(); ++pos)
		{
			int charType = sc[id][pos];
			if (pAlph->isSpecific(charType))
			{
				++counts4pos[pos][charType];
			}
			else
				++counts4pos[pos][alphSize];
		}
	}
}

//returns the number of different character types in each position
void getCharacterType4pos(const sequenceContainer & sc, Vint& charactersType4pos)
{
	VVint counts4Pos;
	getCharacterCounts(sc, counts4Pos);
	charactersType4pos.resize(sc.seqLen(), 0);
	for (int pos = 0; pos < sc.seqLen(); ++pos)
	{
		for (int c = 0; c < counts4Pos[pos].size()-1; ++c)
		{
			if (counts4Pos[pos][c] > 0)
				++charactersType4pos[pos];
		}
	}
}

//returns the distribution of the different character types in each position along the whole alignment
void getCharacterTypeDistribution(const sequenceContainer & sc, Vint& charactersTypeDist)
{
	Vint charactersType4pos;
	getCharacterType4pos(sc, charactersType4pos);
	charactersTypeDist.resize(sc.numberOfSeqs()+1, 0);
	for (int pos = 0; pos < sc.seqLen(); ++pos)
	{
		int count = charactersType4pos[pos];
		++charactersTypeDist[count];
	}

}