1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
|
#include <limits.h>
#include <math.h>
#include "libMUSCLE/muscle.h"
#include "libMUSCLE/msa.h"
#include "libMUSCLE/distfunc.h"
#include "libMUSCLE/msa.h"
#include "libMUSCLE/seqvect.h"
#include "libMUSCLE/pwpath.h"
namespace muscle {
// ScoreDist
// E. Sonnhammer & V. Hollich, Scoredist: A simple and robust protein sequence
// distance estimator, BMC Bioinformatics 2005, 6:108.
extern int BLOSUM62[20][20];
extern double BLOSUM62_Expected;
static const double Dayhoff_CalibrationFactor = 1.3370;
static const double JTT_CalibrationFactor = 1.2873;
static const double MV_CalibrationFactor = 1.1775;
static const double LARGE_D = 3.0;
static double CalibrationFactor = JTT_CalibrationFactor;
// Similarity score
static double Sigma(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2,
unsigned *ptrLength)
{
unsigned Length = 0;
double Score = 0;
const unsigned ColCount = msa.GetColCount();
for (unsigned ColIndex = 0; ColIndex < ColCount; ++ColIndex)
{
unsigned Letter1 = msa.GetLetterEx(SeqIndex1, ColIndex);
unsigned Letter2 = msa.GetLetterEx(SeqIndex2, ColIndex);
if (Letter1 >= 20 || Letter2 >= 20)
continue;
++Length;
Score += BLOSUM62[Letter1][Letter2];
}
*ptrLength = Length;
return Score;
}
// Normalized score
static double Sigma_N(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
{
unsigned Length = UINT_MAX;
double Score = Sigma(msa, SeqIndex1, SeqIndex2, &Length);
double RandomScore = Length*BLOSUM62_Expected;
return Score - RandomScore;
}
// Upper limit
static double Sigma_U(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2,
unsigned *ptrLength)
{
double Score11 = Sigma(msa, SeqIndex1, SeqIndex1, ptrLength);
double Score22 = Sigma(msa, SeqIndex2, SeqIndex2, ptrLength);
return (Score11 + Score22)/2;
}
// Normalized upper limit
static double Sigma_UN(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
{
unsigned Length = UINT_MAX;
double Score = Sigma_U(msa, SeqIndex1, SeqIndex2, &Length);
double RandomScore = Length*BLOSUM62_Expected;
return Score - RandomScore;
}
double GetScoreDist(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2)
{
if (g_Alpha.get() != ALPHA_Amino)
Quit("Scoredist is only for amino acid sequences");
double s_N = Sigma_N(msa, SeqIndex1, SeqIndex2);
double s_UN = Sigma_UN(msa, SeqIndex1, SeqIndex2);
double d = 0.0;
if (s_UN != 0)
{
double Ratio = s_N/s_UN;
if (Ratio < 0.001)
d = LARGE_D;
else
d = -log(Ratio);
}
return d*CalibrationFactor;
}
void DistPWScoreDist(const SeqVect &v, DistFunc &DF)
{
SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();
SetSeqWeightMethod(SEQWEIGHT_Henikoff);
const unsigned uSeqCount = v.Length();
DF.SetCount(uSeqCount);
const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
unsigned uCount = 0;
SetProgressDesc("PW ScoreDist");
for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
{
const Seq &s1 = v.GetSeq(uSeqIndex1);
MSA msa1;
msa1.FromSeq(s1);
for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
{
if (0 == uCount%20)
Progress(uCount, uPairCount);
++uCount;
const Seq &s2 = v.GetSeq(uSeqIndex2);
MSA msa2;
msa2.FromSeq(s2);
PWPath Path;
MSA msaOut;
AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);
float d = (float) GetScoreDist(msaOut, 0, 1);
DF.SetDist(uSeqIndex1, uSeqIndex2, d);
}
}
ProgressStepsDone();
SetSeqWeightMethod(SeqWeightSave);
}
}
|