1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
|
#include "muscle.h"
#include "msa.h"
/***
Compute Henikoff weights.
Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
J. Mol. Biol., 243(4):574-578.
Award each different residue an equal share of the weight, and then to divide up
that weight equally among the sequences sharing the same residue. So if in a
position of a multiple alignment, r different residues are represented, a residue
represented in only one sequence contributes a score of 1/r to that sequence, whereas a
residue represented in s sequences contributes a score of 1/rs to each of the s
sequences. For each sequence, the contributions from each position are summed to give
a sequence weight.
Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter,
and (b) ignores columns that are perfectly conserved.
>>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<<
***/
void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const
{
const unsigned uSeqCount = GetSeqCount();
// Compute letter counts in this column
unsigned uLetterCount[MAX_ALPHA+1];
memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned));
unsigned uLetter;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
uLetter = MAX_ALPHA;
else
uLetter = GetLetter(uSeqIndex, uColIndex);
++(uLetterCount[uLetter]);
}
// Check for special case of perfect conservation
for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter)
{
unsigned uCount = uLetterCount[uLetter];
if (uCount > 0)
{
// Perfectly conserved?
if (uCount == uSeqCount)
return;
else
// If count > 0 but less than nr. sequences, can't be conserved
break;
}
}
// Compute weight contributions
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
unsigned uLetter;
if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
uLetter = MAX_ALPHA;
else
uLetter = GetLetter(uSeqIndex, uColIndex);
const unsigned uCount = uLetterCount[uLetter];
m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount);
}
}
bool MSA::IsGapSeq(unsigned uSeqIndex) const
{
const unsigned uColCount = GetColCount();
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
if (!IsGap(uSeqIndex, uColIndex))
return false;
return true;
}
void MSA::SetUniformWeights() const
{
const unsigned uSeqCount = GetSeqCount();
if (0 == uSeqCount)
return;
const WEIGHT w = (WEIGHT) (1.0 / uSeqCount);
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
m_Weights[uSeqIndex] = w;
}
void MSA::SetHenikoffWeightsPB() const
{
const unsigned uColCount = GetColCount();
const unsigned uSeqCount = GetSeqCount();
if (0 == uSeqCount)
return;
else if (1 == uSeqCount)
{
m_Weights[0] = 1.0;
return;
}
else if (2 == uSeqCount)
{
m_Weights[0] = (WEIGHT) 0.5;
m_Weights[1] = (WEIGHT) 0.5;
return;
}
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
m_Weights[uSeqIndex] = 0.0;
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
CalcHenikoffWeightsColPB(uColIndex);
// Set all-gap seqs weight to 0
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
if (IsGapSeq(uSeqIndex))
m_Weights[uSeqIndex] = 0.0;
// Check for special case of identical sequences, which will cause all
// columns to be skipped becasue they're perfectly conserved.
if (VectorIsZero(m_Weights, uSeqCount))
VectorSet(m_Weights, uSeqCount, 1.0);
Normalize(m_Weights, uSeqCount);
}
|