1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
|
#include "muscle.h"
#include "distfunc.h"
#include "seqvect.h"
#include <math.h>
const unsigned TRIPLE_COUNT = 20*20*20;
struct TripleCount
{
unsigned m_uSeqCount; // How many sequences have this triple?
unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s
};
static TripleCount *TripleCounts;
// WARNING: Sequences MUST be stripped of gaps and upper case!
void DistKmer20_3(const SeqVect &v, DistFunc &DF)
{
const unsigned uSeqCount = v.Length();
DF.SetCount(uSeqCount);
if (0 == uSeqCount)
return;
for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
{
DF.SetDist(uSeq1, uSeq1, 0);
for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
DF.SetDist(uSeq1, uSeq2, 0);
}
const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);
TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);
if (0 == TripleCounts)
Quit("Not enough memory (TripleCounts)");
memset(TripleCounts, 0, uTripleArrayBytes);
for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
{
TripleCount &tc = *(TripleCounts + uWord);
const unsigned uBytes = uSeqCount*sizeof(short);
tc.m_Counts = (unsigned short *) malloc(uBytes);
memset(tc.m_Counts, 0, uBytes);
}
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
Seq &s = *(v[uSeqIndex]);
const unsigned uSeqLength = s.Length();
for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos)
{
const unsigned uLetter1 = CharToLetterEx(s[uPos]);
if (uLetter1 >= 20)
continue;
const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);
if (uLetter2 >= 20)
continue;
const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);
if (uLetter3 >= 20)
continue;
const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;
assert(uWord < TRIPLE_COUNT);
TripleCount &tc = *(TripleCounts + uWord);
const unsigned uOldCount = tc.m_Counts[uSeqIndex];
if (0 == uOldCount)
++(tc.m_uSeqCount);
++(tc.m_Counts[uSeqIndex]);
}
}
#if TRACE
{
Log("TripleCounts\n");
unsigned uGrandTotal = 0;
for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
{
const TripleCount &tc = *(TripleCounts + uWord);
if (0 == tc.m_uSeqCount)
continue;
const unsigned uLetter3 = uWord/(20*20);
const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;
const unsigned uLetter1 = uWord%20;
Log("Word %6u %c%c%c %6u",
uWord,
LetterToCharAmino(uLetter1),
LetterToCharAmino(uLetter2),
LetterToCharAmino(uLetter3),
tc.m_uSeqCount);
unsigned uSeqCountWithThisWord = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
const unsigned uCount = tc.m_Counts[uSeqIndex];
if (uCount > 0)
{
++uSeqCountWithThisWord;
Log(" %u=%u", uSeqIndex, uCount);
uGrandTotal += uCount;
}
}
if (uSeqCountWithThisWord != tc.m_uSeqCount)
Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);
Log("\n");
}
unsigned uTotalBySeqLength = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
Seq &s = *(v[uSeqIndex]);
const unsigned uSeqLength = s.Length();
uTotalBySeqLength += uSeqLength - 2;
}
if (uGrandTotal != uTotalBySeqLength)
Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);
}
#endif
const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);
unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes);
for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
{
const TripleCount &tc = *(TripleCounts + uWord);
if (0 == tc.m_uSeqCount)
continue;
unsigned uSeqCountFound = 0;
memset(SeqList, 0, uSeqListBytes);
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
if (tc.m_Counts[uSeqIndex] > 0)
{
SeqList[uSeqCountFound] = uSeqIndex;
++uSeqCountFound;
if (uSeqCountFound == tc.m_uSeqCount)
break;
}
}
assert(uSeqCountFound == tc.m_uSeqCount);
for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)
{
const unsigned uSeqIndex1 = SeqList[uSeq1];
const unsigned uCount1 = tc.m_Counts[uSeqIndex1];
for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
{
const unsigned uSeqIndex2 = SeqList[uSeq2];
const unsigned uCount2 = tc.m_Counts[uSeqIndex2];
const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;
const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);
DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));
}
}
}
delete[] SeqList;
free(TripleCounts);
unsigned uDone = 0;
const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
{
DF.SetDist(uSeq1, uSeq1, 0.0);
const Seq &s1 = *(v[uSeq1]);
const unsigned uLength1 = s1.Length();
for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
{
const Seq &s2 = *(v[uSeq2]);
const unsigned uLength2 = s2.Length();
unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;
if (uMinLength < 3)
{
DF.SetDist(uSeq1, uSeq2, 1.0);
continue;
}
const double dTripleCount = DF.GetDist(uSeq1, uSeq2);
if (dTripleCount == 0)
{
DF.SetDist(uSeq1, uSeq2, 1.0);
continue;
}
double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);
//double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));
//if (dEstimatedPairwiseIdentity > 1)
// dEstimatedPairwiseIdentity = 1;
// DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));
DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);
#if TRACE
{
Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n",
s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,
dEstimatedPairwiseIdentity);
}
#endif
if (uDone%1000 == 0)
Progress(uDone, uTotal);
}
}
ProgressStepsDone();
}
|