File: henikoffweightpb.cpp

package info (click to toggle)
muscle 3.60-1
links: PTS
area: main
in suites: etch, etch-m68k
size: 1,384 kB
ctags: 2,079
sloc: cpp: 26,452; xml: 185; makefile: 101
file content (124 lines) | stat: -rw-r--r-- 3,681 bytes
parent folder | download | duplicates (14)
#include "muscle.h"
#include "msa.h"

/***
Compute Henikoff weights.
Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
J. Mol. Biol., 243(4):574-578.

Award each different residue an equal share of the weight, and then to divide up
that weight equally among the sequences sharing the same residue. So if in a
position of a multiple alignment, r different residues are represented, a residue
represented in only one sequence contributes a score of 1/r to that sequence, whereas a
residue represented in s sequences contributes a score of 1/rs to each of the s
sequences. For each sequence, the contributions from each position are summed to give
a sequence weight.

Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter,
and (b) ignores columns that are perfectly conserved.

>>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<<
***/

void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const
	{
	const unsigned uSeqCount = GetSeqCount();

// Compute letter counts in this column
	unsigned uLetterCount[MAX_ALPHA+1];
	memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned));
	unsigned uLetter;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
			uLetter = MAX_ALPHA;
		else
			uLetter = GetLetter(uSeqIndex, uColIndex);
		++(uLetterCount[uLetter]);
		}

// Check for special case of perfect conservation
	for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter)
		{
		unsigned uCount = uLetterCount[uLetter];
		if (uCount > 0)
			{
		// Perfectly conserved?
			if (uCount == uSeqCount)
				return;
			else
			// If count > 0 but less than nr. sequences, can't be conserved
				break;
			}
		}

// Compute weight contributions
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned uLetter;
		if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
			uLetter = MAX_ALPHA;
		else
			uLetter = GetLetter(uSeqIndex, uColIndex);
		const unsigned uCount = uLetterCount[uLetter];
		m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount);
		}
	}

bool MSA::IsGapSeq(unsigned uSeqIndex) const
	{
	const unsigned uColCount = GetColCount();
	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
		if (!IsGap(uSeqIndex, uColIndex))
			return false;
	return true;
	}

void MSA::SetUniformWeights() const
	{
	const unsigned uSeqCount = GetSeqCount();
	if (0 == uSeqCount)
		return;

	const WEIGHT w = (WEIGHT) (1.0 / uSeqCount);
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		m_Weights[uSeqIndex] = w;
	}

void MSA::SetHenikoffWeightsPB() const
	{
	const unsigned uColCount = GetColCount();
	const unsigned uSeqCount = GetSeqCount();

	if (0 == uSeqCount)
		return;
	else if (1 == uSeqCount)
		{
		m_Weights[0] = 1.0;
		return;
		}
	else if (2 == uSeqCount)
		{
		m_Weights[0] = (WEIGHT) 0.5;
		m_Weights[1] = (WEIGHT) 0.5;
		return;
		}

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		m_Weights[uSeqIndex] = 0.0;

	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
		CalcHenikoffWeightsColPB(uColIndex);

// Set all-gap seqs weight to 0
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		if (IsGapSeq(uSeqIndex))
			m_Weights[uSeqIndex] = 0.0;

// Check for special case of identical sequences, which will cause all
// columns to be skipped becasue they're perfectly conserved.
	if (VectorIsZero(m_Weights, uSeqCount))
		VectorSet(m_Weights, uSeqCount, 1.0);

	Normalize(m_Weights, uSeqCount);
	}