File: scoredist.cpp

package info (click to toggle)
muscle 3.70%2Bfix1-2
  • links: PTS, VCS
  • area: main
  • in suites: lenny, squeeze
  • size: 1,424 kB
  • ctags: 2,116
  • sloc: cpp: 26,897; xml: 230; makefile: 25
file content (128 lines) | stat: -rw-r--r-- 3,498 bytes parent folder | download | duplicates (12)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#include <limits.h>
#include <math.h>
#include "muscle.h"
#include "msa.h"
#include "distfunc.h"
#include "msa.h"
#include "seqvect.h"
#include "pwpath.h"

// ScoreDist
// E. Sonnhammer & V. Hollich, Scoredist: A simple and robust protein sequence
// distance estimator, BMC Bioinformatics 2005, 6:108.

extern int BLOSUM62[20][20];
extern double BLOSUM62_Expected;

static const double Dayhoff_CalibrationFactor = 1.3370;
static const double JTT_CalibrationFactor = 1.2873;
static const double MV_CalibrationFactor = 1.1775;
static const double LARGE_D = 3.0;

static double CalibrationFactor = JTT_CalibrationFactor;


// Similarity score
static double Sigma(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2,
  unsigned *ptrLength)
	{
	unsigned Length = 0;
	double Score = 0;
	const unsigned ColCount = msa.GetColCount();
	for (unsigned ColIndex = 0; ColIndex < ColCount; ++ColIndex)
		{
		unsigned Letter1 = msa.GetLetterEx(SeqIndex1, ColIndex);
		unsigned Letter2 = msa.GetLetterEx(SeqIndex2, ColIndex);
		if (Letter1 >= 20 || Letter2 >= 20)
			continue;
		++Length;
		Score += BLOSUM62[Letter1][Letter2];
		}

	*ptrLength = Length;
	return Score;
	}

// Normalized score
static double Sigma_N(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
	{
	unsigned Length = UINT_MAX;
	double Score = Sigma(msa, SeqIndex1, SeqIndex2, &Length);
	double RandomScore = Length*BLOSUM62_Expected;
	return Score - RandomScore;
	}

// Upper limit
static double Sigma_U(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2,
  unsigned *ptrLength)
	{
	double Score11 = Sigma(msa, SeqIndex1, SeqIndex1, ptrLength);
	double Score22 = Sigma(msa, SeqIndex2, SeqIndex2, ptrLength);
	return (Score11 + Score22)/2;
	}

// Normalized upper limit
static double Sigma_UN(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
	{
	unsigned Length = UINT_MAX;
	double Score = Sigma_U(msa, SeqIndex1, SeqIndex2, &Length);
	double RandomScore = Length*BLOSUM62_Expected;
	return Score - RandomScore;
	}

double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
	{
	if (g_Alpha != ALPHA_Amino)
		Quit("Scoredist is only for amino acid sequences");

	double s_N = Sigma_N(msa, SeqIndex1, SeqIndex2);
	double s_UN = Sigma_UN(msa, SeqIndex1, SeqIndex2);
	double d = 0.0;
	if (s_UN != 0)
		{
		double Ratio = s_N/s_UN;
		if (Ratio < 0.001)
			d = LARGE_D;
		else
			d =	-log(Ratio);
		}
	return d*CalibrationFactor;
	}

void DistPWScoreDist(const SeqVect &v, DistFunc &DF)
	{
	SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();
	SetSeqWeightMethod(SEQWEIGHT_Henikoff);

	const unsigned uSeqCount = v.Length();
	DF.SetCount(uSeqCount);

	const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
	unsigned uCount = 0;
	SetProgressDesc("PW ScoreDist");
	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
		{
		const Seq &s1 = v.GetSeq(uSeqIndex1);
		MSA msa1;
		msa1.FromSeq(s1);
		for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
			{
			if (0 == uCount%20)
				Progress(uCount, uPairCount);
			++uCount;
			const Seq &s2 = v.GetSeq(uSeqIndex2);
			MSA msa2;
			msa2.FromSeq(s2);
		
			PWPath Path;
			MSA msaOut;
			AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);

			float d = (float) GetScoreDist(msaOut, 0, 1);
			DF.SetDist(uSeqIndex1, uSeqIndex2, d);
			}
		}
	ProgressStepsDone();

	SetSeqWeightMethod(SeqWeightSave);
	}