File: fastdistjones.cpp

package info (click to toggle)
muscle 3.60-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 1,384 kB
  • ctags: 2,079
  • sloc: cpp: 26,452; xml: 185; makefile: 101
file content (206 lines) | stat: -rw-r--r-- 6,011 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#include "muscle.h"
#include "distfunc.h"
#include "seqvect.h"
#include <math.h>

const unsigned TRIPLE_COUNT = 20*20*20;

struct TripleCount
	{
	unsigned m_uSeqCount;			// How many sequences have this triple?
	unsigned short *m_Counts;		// m_Counts[s] = nr of times triple found in seq s
	};
static TripleCount *TripleCounts;

// WARNING: Sequences MUST be stripped of gaps and upper case!
void DistKmer20_3(const SeqVect &v, DistFunc &DF)
	{
	const unsigned uSeqCount = v.Length();

	DF.SetCount(uSeqCount);
	if (0 == uSeqCount)
		return;
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			DF.SetDist(uSeq1, uSeq2, 0);
		}

	const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);
	TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);
	if (0 == TripleCounts)
		Quit("Not enough memory (TripleCounts)");
	memset(TripleCounts, 0, uTripleArrayBytes);

	for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
		{
		TripleCount &tc = *(TripleCounts + uWord);
		const unsigned uBytes = uSeqCount*sizeof(short);
		tc.m_Counts = (unsigned short *) malloc(uBytes);
		memset(tc.m_Counts, 0, uBytes);
		}

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq &s = *(v[uSeqIndex]);
		const unsigned uSeqLength = s.Length();
		for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos)
			{
			const unsigned uLetter1 = CharToLetterEx(s[uPos]);
			if (uLetter1 >= 20)
				continue;
			const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);
			if (uLetter2 >= 20)
				continue;
			const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);
			if (uLetter3 >= 20)
				continue;

			const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;
			assert(uWord < TRIPLE_COUNT);

			TripleCount &tc = *(TripleCounts + uWord);
			const unsigned uOldCount = tc.m_Counts[uSeqIndex];
			if (0 == uOldCount)
				++(tc.m_uSeqCount);

			++(tc.m_Counts[uSeqIndex]);
			}
		}

#if TRACE
	{
	Log("TripleCounts\n");
	unsigned uGrandTotal = 0;
	for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
		{
		const TripleCount &tc = *(TripleCounts + uWord);
		if (0 == tc.m_uSeqCount)
			continue;

		const unsigned uLetter3 = uWord/(20*20);
		const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;
		const unsigned uLetter1 = uWord%20;
		Log("Word %6u %c%c%c   %6u",
		  uWord,
		  LetterToCharAmino(uLetter1),
		  LetterToCharAmino(uLetter2),
		  LetterToCharAmino(uLetter3),
		  tc.m_uSeqCount);

		unsigned uSeqCountWithThisWord = 0;
		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
			{
			const unsigned uCount = tc.m_Counts[uSeqIndex];
			if (uCount > 0)
				{
				++uSeqCountWithThisWord;
				Log(" %u=%u", uSeqIndex, uCount);
				uGrandTotal += uCount;
				}
			}
		if (uSeqCountWithThisWord != tc.m_uSeqCount)
			Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);
		Log("\n");
		}
	
	unsigned uTotalBySeqLength = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq &s = *(v[uSeqIndex]);
		const unsigned uSeqLength = s.Length();
		uTotalBySeqLength += uSeqLength - 2;
		}
	if (uGrandTotal != uTotalBySeqLength)
		Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);
	}
#endif

	const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);
	unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes);

	for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
		{
		const TripleCount &tc = *(TripleCounts + uWord);
		if (0 == tc.m_uSeqCount)
			continue;

		unsigned uSeqCountFound = 0;
		memset(SeqList, 0, uSeqListBytes);

		for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
			{
			if (tc.m_Counts[uSeqIndex] > 0)
				{
				SeqList[uSeqCountFound] = uSeqIndex;
				++uSeqCountFound;
				if (uSeqCountFound == tc.m_uSeqCount)
					break;
				}
			}
		assert(uSeqCountFound == tc.m_uSeqCount);

		for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)
			{
			const unsigned uSeqIndex1 = SeqList[uSeq1];
			const unsigned uCount1 = tc.m_Counts[uSeqIndex1];
			for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
				{
				const unsigned uSeqIndex2 = SeqList[uSeq2];
				const unsigned uCount2 = tc.m_Counts[uSeqIndex2];
				const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;
				const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);
				DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));
				}
			}
		}
	delete[] SeqList;
	free(TripleCounts);

	unsigned uDone = 0;
	const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		DF.SetDist(uSeq1, uSeq1, 0.0);

		const Seq &s1 = *(v[uSeq1]);
		const unsigned uLength1 = s1.Length();

		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			{
			const Seq &s2 = *(v[uSeq2]);
			const unsigned uLength2 = s2.Length();
			unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;
			if (uMinLength < 3)
				{
				DF.SetDist(uSeq1, uSeq2, 1.0);
				continue;
				}

			const double dTripleCount = DF.GetDist(uSeq1, uSeq2);
			if (dTripleCount == 0)
				{
				DF.SetDist(uSeq1, uSeq2, 1.0);
				continue;
				}
			double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);
			//double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));
			//if (dEstimatedPairwiseIdentity > 1)
			//	dEstimatedPairwiseIdentity = 1;
//			DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));
			DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);

#if	TRACE
			{
			Log("%s - %s  Triplet count = %g  Lengths %u, %u Estimated pwid = %g\n",
			  s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,
			  dEstimatedPairwiseIdentity);
			}
#endif
			if (uDone%1000 == 0)
				Progress(uDone, uTotal);
			}
		}
	ProgressStepsDone();
	}