File: scoredist.cpp

package info (click to toggle)
libmuscle 3.7%2B4565-7
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 1,912 kB
  • sloc: cpp: 27,959; makefile: 58; sh: 26
file content (131 lines) | stat: -rw-r--r-- 3,590 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#include <limits.h>
#include <math.h>
#include "libMUSCLE/muscle.h"
#include "libMUSCLE/msa.h"
#include "libMUSCLE/distfunc.h"
#include "libMUSCLE/msa.h"
#include "libMUSCLE/seqvect.h"
#include "libMUSCLE/pwpath.h"

namespace muscle {

// ScoreDist
// E. Sonnhammer & V. Hollich, Scoredist: A simple and robust protein sequence
// distance estimator, BMC Bioinformatics 2005, 6:108.

extern int BLOSUM62[20][20];
extern double BLOSUM62_Expected;

static const double Dayhoff_CalibrationFactor = 1.3370;
static const double JTT_CalibrationFactor = 1.2873;
static const double MV_CalibrationFactor = 1.1775;
static const double LARGE_D = 3.0;

static double CalibrationFactor = JTT_CalibrationFactor;


// Similarity score
static double Sigma(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2,
  unsigned *ptrLength)
	{
	unsigned Length = 0;
	double Score = 0;
	const unsigned ColCount = msa.GetColCount();
	for (unsigned ColIndex = 0; ColIndex < ColCount; ++ColIndex)
		{
		unsigned Letter1 = msa.GetLetterEx(SeqIndex1, ColIndex);
		unsigned Letter2 = msa.GetLetterEx(SeqIndex2, ColIndex);
		if (Letter1 >= 20 || Letter2 >= 20)
			continue;
		++Length;
		Score += BLOSUM62[Letter1][Letter2];
		}

	*ptrLength = Length;
	return Score;
	}

// Normalized score
static double Sigma_N(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
	{
	unsigned Length = UINT_MAX;
	double Score = Sigma(msa, SeqIndex1, SeqIndex2, &Length);
	double RandomScore = Length*BLOSUM62_Expected;
	return Score - RandomScore;
	}

// Upper limit
static double Sigma_U(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2,
  unsigned *ptrLength)
	{
	double Score11 = Sigma(msa, SeqIndex1, SeqIndex1, ptrLength);
	double Score22 = Sigma(msa, SeqIndex2, SeqIndex2, ptrLength);
	return (Score11 + Score22)/2;
	}

// Normalized upper limit
static double Sigma_UN(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
	{
	unsigned Length = UINT_MAX;
	double Score = Sigma_U(msa, SeqIndex1, SeqIndex2, &Length);
	double RandomScore = Length*BLOSUM62_Expected;
	return Score - RandomScore;
	}

double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
	{
	if (g_Alpha.get() != ALPHA_Amino)
		Quit("Scoredist is only for amino acid sequences");

	double s_N = Sigma_N(msa, SeqIndex1, SeqIndex2);
	double s_UN = Sigma_UN(msa, SeqIndex1, SeqIndex2);
	double d = 0.0;
	if (s_UN != 0)
		{
		double Ratio = s_N/s_UN;
		if (Ratio < 0.001)
			d = LARGE_D;
		else
			d =	-log(Ratio);
		}
	return d*CalibrationFactor;
	}

void DistPWScoreDist(const SeqVect &v, DistFunc &DF)
	{
	SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();
	SetSeqWeightMethod(SEQWEIGHT_Henikoff);

	const unsigned uSeqCount = v.Length();
	DF.SetCount(uSeqCount);

	const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
	unsigned uCount = 0;
	SetProgressDesc("PW ScoreDist");
	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
		{
		const Seq &s1 = v.GetSeq(uSeqIndex1);
		MSA msa1;
		msa1.FromSeq(s1);
		for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
			{
			if (0 == uCount%20)
				Progress(uCount, uPairCount);
			++uCount;
			const Seq &s2 = v.GetSeq(uSeqIndex2);
			MSA msa2;
			msa2.FromSeq(s2);
		
			PWPath Path;
			MSA msaOut;
			AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);

			float d = (float) GetScoreDist(msaOut, 0, 1);
			DF.SetDist(uSeqIndex1, uSeqIndex2, d);
			}
		}
	ProgressStepsDone();

	SetSeqWeightMethod(SeqWeightSave);
	}
}