File: Centroid.hh

package info (click to toggle)
last-align 963-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 3,380 kB
  • sloc: cpp: 41,136; python: 2,744; ansic: 1,240; makefile: 383; sh: 255
file content (173 lines) | stat: -rw-r--r-- 5,522 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// Copyright 2008, 2009, 2011 Michiaki Hamada
// Copyright 2012, 2013 Toshiyuki Sato

#ifndef CENTROID_HH
#define CENTROID_HH
#include "GappedXdropAligner.hh"
#include "GeneralizedAffineGapCosts.hh"
#include "SegmentPair.hh"
#include "OneQualityScoreMatrix.hh"
#include <stddef.h>  // size_t
#include <vector>
#include <iostream> // for debug

namespace cbrc{

  struct ExpectedCount{
  public:
    double emit[scoreMatrixRowSize][scoreMatrixRowSize];
    double MM, MD, MP, MI, MQ;
    double DD, DM, DI;
    double PP, PM, PD, PI;
    double II, IM;
    double SM, SD, SP, SI, SQ;
  public:
    ExpectedCount ();
    std::ostream& write (std::ostream& os) const;
  };

  /**
   * (1) Forward and backward algorithm on the DP region given by Xdrop algorithm
   * (2) \gamma-centroid decoding
   */
  class Centroid{
  public:
    GappedXdropAligner& aligner() { return xa; }

    // Setters
    void setScoreMatrix( const ScoreMatrixRow* sm, double T );
    void setPssm ( const ScoreMatrixRow* pssm, size_t qsize, double T,
                   const OneQualityExpMatrix& oqem,
                   const uchar* sequenceBeg, const uchar* qualityBeg );
    void setOutputType( int m ) { outputType = m; }

    // For a sequence with quality data, store the probability that
    // each position is each letter (possibly scaled by a constant per
    // position).  xxx I don't think this really belongs in Centroid.
    void setLetterProbsPerPosition(unsigned alphabetSize,
				   size_t sequenceLength,
				   const uchar *sequence,
				   const uchar *qualityCodes,
				   bool isFastq,
				   const double *qualToProbCorrect,
				   const double *letterProbs,
				   const uchar *toUnmasked);

    // start1 is the index of the first letter to look at in seq1
    // start2 is the index of the first letter to look at in seq2

    void doForwardBackwardAlgorithm(const uchar* seq1, const uchar* seq2,
				    size_t start1, size_t start2,
				    bool isExtendFwd,
				    const GeneralizedAffineGapCosts& gap,
				    int globality) {
      seq1 += start1;
      seq2 += start2;
      const ExpMatrixRow *pssm = isPssm ? pssmExp2 + start2 : 0;
      numAntidiagonals = xa.numAntidiagonals();
      scale.assign(numAntidiagonals + 2, 1.0);
      forward(seq1, seq2, pssm, isExtendFwd, gap, globality);
      mD.assign(numAntidiagonals + 2, 0.0);
      mI.assign(numAntidiagonals + 2, 0.0);
      backward(seq1, seq2, pssm, isExtendFwd, gap, globality);
    }

    double dp( double gamma );

    void traceback(std::vector<SegmentPair> &chunks, double gamma) const {
      if (outputType==5) traceback_centroid(chunks, gamma);
      if (outputType==6) traceback_ama(chunks, gamma);
    }

    double dp_centroid( double gamma );
    void traceback_centroid( std::vector< SegmentPair >& chunks, double gamma ) const;

    double dp_ama( double gamma );
    void traceback_ama( std::vector< SegmentPair >& chunks, double gamma ) const;

    void getMatchAmbiguities(std::vector<char>& ambiguityCodes,
			     size_t seq1end, size_t seq2end,
			     size_t length) const;

    void getDeleteAmbiguities(std::vector<char>& ambiguityCodes,
			      size_t seq1end, size_t seq1beg) const;

    void getInsertAmbiguities(std::vector<char>& ambiguityCodes,
			      size_t seq2end, size_t seq2beg) const;

    double logPartitionFunction() const;  // a.k.a. full score, forward score

    // Added by MH (2008/10/10) : compute expected counts for transitions and emissions
    void computeExpectedCounts(const uchar* seq1, const uchar* seq2,
			       size_t start1, size_t start2, bool isExtendFwd,
			       const GeneralizedAffineGapCosts& gap,
			       unsigned alphabetSize,
			       ExpectedCount& count) const;

  private:
    typedef double ExpMatrixRow[scoreMatrixRowSize];

    GappedXdropAligner xa;
    double T; // temperature
    size_t numAntidiagonals;
    double match_score[scoreMatrixRowSize][scoreMatrixRowSize];
    bool isPssm;
    std::vector<double> pssmExp; //
    ExpMatrixRow* pssmExp2; // pre-computed pssm for prob align
    std::vector<double> letterProbsPerPosition;  // for uncertain sequences
    int outputType;

    typedef std::vector< double > dvec_t;

    dvec_t fM; // f^M(i,j)
    dvec_t fD; // f^D(i,j) Ix
    dvec_t fI; // f^I(i,j) Iy
    dvec_t fP; // f^P(i,j)

    dvec_t bM; // b^M(i,j)
    dvec_t bD; // b^D(i,j)
    dvec_t bI; // b^I(i,j)
    dvec_t bP; // b^P(i,j)

    dvec_t mD;
    dvec_t mI;
    dvec_t mX1;
    dvec_t mX2;

    dvec_t X; // DP tables for $gamma$-decoding

    dvec_t scale; // scale[n] is a scaling factor for the n-th anti-diagonal

    double bestScore;
    size_t bestAntiDiagonal;
    size_t bestPos1;

    void forward(const uchar* seq1, const uchar* seq2,
		 const ExpMatrixRow* pssm, bool isExtendFwd,
		 const GeneralizedAffineGapCosts& gap, int globality);

    void backward(const uchar* seq1, const uchar* seq2,
		  const ExpMatrixRow* pssm, bool isExtendFwd,
		  const GeneralizedAffineGapCosts& gap, int globality);

    void initForwardMatrix();
    void initBackwardMatrix();

    void updateScore(double score, size_t antiDiagonal, size_t cur) {
      if (bestScore < score) {
	bestScore = score;
	bestAntiDiagonal = antiDiagonal;
	bestPos1 = cur;
      }
    }

    // start of the x-drop region (i.e. number of skipped seq1 letters
    // before the x-drop region) for this antidiagonal
    size_t seq1start( size_t antidiagonal ) const {
      return xa.scoreEndIndex( antidiagonal ) - xa.scoreOrigin( antidiagonal );
    }
  };

}

#endif