File: cons.cpp

package info (click to toggle)
muscle 3.60-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 1,384 kB
  • ctags: 2,079
  • sloc: cpp: 26,452; xml: 185; makefile: 101
file content (118 lines) | stat: -rw-r--r-- 3,146 bytes parent folder | download | duplicates (14)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/***
Conservation value for a column in an MSA is defined as the number
of times the most common letter appears divided by the number of
sequences.
***/

#include "muscle.h"
#include "msa.h"
#include <math.h>

double MSA::GetAvgCons() const
	{
	assert(GetSeqCount() > 0);
	double dSum = 0;
	unsigned uNonGapColCount = 0;
	for (unsigned uColIndex = 0; uColIndex < GetColCount(); ++uColIndex)
		{
		if (!IsGapColumn(uColIndex))
			{
			dSum += GetCons(uColIndex);
			++uNonGapColCount;
			}
		}
	assert(uNonGapColCount > 0);
	double dAvg = dSum / uNonGapColCount;
	assert(dAvg > 0 && dAvg <= 1);
	return dAvg;
	}

double MSA::GetCons(unsigned uColIndex) const
	{
	unsigned Counts[MAX_ALPHA];
	for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter)
		Counts[uLetter] = 0;

	unsigned uMaxCount = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex)
		{
		if (IsGap(uSeqIndex, uColIndex))
			continue;
		char c = GetChar(uSeqIndex, uColIndex);
		c = toupper(c);
		if ('X' == c || 'B' == c || 'Z' == c)
			continue;
		unsigned uLetter = GetLetter(uSeqIndex, uColIndex);
		unsigned uCount = Counts[uLetter] + 1;
		if (uCount > uMaxCount)
			uMaxCount = uCount;
		Counts[uLetter] = uCount;
		}

// Cons is undefined for all-gap column
	if (0 == uMaxCount)
		{
//		assert(false);
		return 1;
		}

	double dCons = (double) uMaxCount / (double) GetSeqCount();
	assert(dCons > 0 && dCons <= 1);
	return dCons;
	}

// Perecent identity of a pair of sequences.
// Positions with one or both gapped are ignored.
double MSA::GetPctIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const
	{
	const unsigned uColCount = GetColCount();
	unsigned uPosCount = 0;
	unsigned uSameCount = 0;
	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
		{
		const char c1 = GetChar(uSeqIndex1, uColIndex);
		const char c2 = GetChar(uSeqIndex2, uColIndex);
		if (IsGapChar(c1) || IsGapChar(c2))
			continue;
		if (c1 == c2)
			++uSameCount;
		++uPosCount;
		}
	if (0 == uPosCount)
		return 0;
	return (double) uSameCount / (double) uPosCount;
	}

// Perecent group identity of a pair of sequences.
// Positions with one or both gapped are ignored.
double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1,
  unsigned uSeqIndex2) const
	{
	extern unsigned ResidueGroup[];

	const unsigned uColCount = GetColCount();
	unsigned uPosCount = 0;
	unsigned uSameCount = 0;
	for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
		{
		if (IsGap(uSeqIndex1, uColIndex))
			continue;
		if (IsGap(uSeqIndex2, uColIndex))
			continue;
		if (IsWildcard(uSeqIndex1, uColIndex))
			continue;
		if (IsWildcard(uSeqIndex2, uColIndex))
			continue;

		const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex);
		const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex);
		const unsigned uGroup1 = ResidueGroup[uLetter1];
		const unsigned uGroup2 = ResidueGroup[uLetter2];
		if (uGroup1 == uGroup2)
			++uSameCount;
		++uPosCount;
		}
	if (0 == uPosCount)
		return 0;
	return (double) uSameCount / (double) uPosCount;
	}