1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
|
// Copyright 2008, 2009, 2011 Michiaki Hamada
// Copyright 2012, 2013 Toshiyuki Sato
#ifndef CENTROID_HH
#define CENTROID_HH
#include "GappedXdropAligner.hh"
#include "GeneralizedAffineGapCosts.hh"
#include "SegmentPair.hh"
#include "OneQualityScoreMatrix.hh"
#include <stddef.h> // size_t
#include <vector>
#include <iostream> // for debug
namespace cbrc{
struct ExpectedCount{
public:
double emit[scoreMatrixRowSize][scoreMatrixRowSize];
double MM, MD, MP, MI, MQ;
double DD, DM, DI;
double PP, PM, PD, PI;
double II, IM;
double SM, SD, SP, SI, SQ;
public:
ExpectedCount ();
std::ostream& write (std::ostream& os) const;
};
/**
* (1) Forward and backward algorithm on the DP region given by Xdrop algorithm
* (2) \gamma-centroid decoding
*/
class Centroid{
public:
GappedXdropAligner& aligner() { return xa; }
// Setters
void setScoreMatrix( const ScoreMatrixRow* sm, double T );
void setPssm ( const ScoreMatrixRow* pssm, size_t qsize, double T,
const OneQualityExpMatrix& oqem,
const uchar* sequenceBeg, const uchar* qualityBeg );
void setOutputType( int m ) { outputType = m; }
// For a sequence with quality data, store the probability that
// each position is each letter (possibly scaled by a constant per
// position). xxx I don't think this really belongs in Centroid.
void setLetterProbsPerPosition(unsigned alphabetSize,
size_t sequenceLength,
const uchar *sequence,
const uchar *qualityCodes,
bool isFastq,
const double *qualToProbCorrect,
const double *letterProbs,
const uchar *toUnmasked);
// start1 is the index of the first letter to look at in seq1
// start2 is the index of the first letter to look at in seq2
void doForwardBackwardAlgorithm(const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2,
bool isExtendFwd,
const GeneralizedAffineGapCosts& gap,
int globality) {
seq1 += start1;
seq2 += start2;
const ExpMatrixRow *pssm = isPssm ? pssmExp2 + start2 : 0;
numAntidiagonals = xa.numAntidiagonals();
scale.assign(numAntidiagonals + 2, 1.0);
forward(seq1, seq2, pssm, isExtendFwd, gap, globality);
mD.assign(numAntidiagonals + 2, 0.0);
mI.assign(numAntidiagonals + 2, 0.0);
backward(seq1, seq2, pssm, isExtendFwd, gap, globality);
}
double dp( double gamma );
void traceback(std::vector<SegmentPair> &chunks, double gamma) const {
if (outputType==5) traceback_centroid(chunks, gamma);
if (outputType==6) traceback_ama(chunks, gamma);
}
double dp_centroid( double gamma );
void traceback_centroid( std::vector< SegmentPair >& chunks, double gamma ) const;
double dp_ama( double gamma );
void traceback_ama( std::vector< SegmentPair >& chunks, double gamma ) const;
void getMatchAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq1end, size_t seq2end,
size_t length) const;
void getDeleteAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq1end, size_t seq1beg) const;
void getInsertAmbiguities(std::vector<char>& ambiguityCodes,
size_t seq2end, size_t seq2beg) const;
double logPartitionFunction() const; // a.k.a. full score, forward score
// Added by MH (2008/10/10) : compute expected counts for transitions and emissions
void computeExpectedCounts(const uchar* seq1, const uchar* seq2,
size_t start1, size_t start2, bool isExtendFwd,
const GeneralizedAffineGapCosts& gap,
unsigned alphabetSize,
ExpectedCount& count) const;
private:
typedef double ExpMatrixRow[scoreMatrixRowSize];
GappedXdropAligner xa;
double T; // temperature
size_t numAntidiagonals;
double match_score[scoreMatrixRowSize][scoreMatrixRowSize];
bool isPssm;
std::vector<double> pssmExp; //
ExpMatrixRow* pssmExp2; // pre-computed pssm for prob align
std::vector<double> letterProbsPerPosition; // for uncertain sequences
int outputType;
typedef std::vector< double > dvec_t;
dvec_t fM; // f^M(i,j)
dvec_t fD; // f^D(i,j) Ix
dvec_t fI; // f^I(i,j) Iy
dvec_t fP; // f^P(i,j)
dvec_t bM; // b^M(i,j)
dvec_t bD; // b^D(i,j)
dvec_t bI; // b^I(i,j)
dvec_t bP; // b^P(i,j)
dvec_t mD;
dvec_t mI;
dvec_t mX1;
dvec_t mX2;
dvec_t X; // DP tables for $gamma$-decoding
dvec_t scale; // scale[n] is a scaling factor for the n-th anti-diagonal
double bestScore;
size_t bestAntiDiagonal;
size_t bestPos1;
void forward(const uchar* seq1, const uchar* seq2,
const ExpMatrixRow* pssm, bool isExtendFwd,
const GeneralizedAffineGapCosts& gap, int globality);
void backward(const uchar* seq1, const uchar* seq2,
const ExpMatrixRow* pssm, bool isExtendFwd,
const GeneralizedAffineGapCosts& gap, int globality);
void initForwardMatrix();
void initBackwardMatrix();
void updateScore(double score, size_t antiDiagonal, size_t cur) {
if (bestScore < score) {
bestScore = score;
bestAntiDiagonal = antiDiagonal;
bestPos1 = cur;
}
}
// start of the x-drop region (i.e. number of skipped seq1 letters
// before the x-drop region) for this antidiagonal
size_t seq1start( size_t antidiagonal ) const {
return xa.scoreEndIndex( antidiagonal ) - xa.scoreOrigin( antidiagonal );
}
};
}
#endif
|