1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
|
/***********************************************************************
* file: statemodel.hh
* licence: Artistic Licence, see file LICENCE.TXT or
* http://www.opensource.org/licenses/artistic-license.php
* descr.: base interface class for the state model classes
* authors: Emmanouil Stafilarakis, Mario Stanke (mario@gobics.de),
* Oliver Keller
*
**********************************************************************/
#ifndef _STATEMODEL_HH
#define _STATEMODEL_HH
// project includes
#include "matrix.hh"
#include "vitmatrix.hh"
#include "merkmal.hh"
/*
* constants used in splitting into state/substate pairs
* we use the lower byte for the state, and the upper 3 for the substate
* SHIFT_LEFT is used to build the full id from the substate id
* SHIFT_RIGHT is used to retrieve back the substate id
*/
#define SHIFT_SIZE (sizeof (signed char)*8)
#define SHIFT_LEFT(x) ((x) << SHIFT_SIZE)
#define SHIFT_RIGHT(x) ((x) >> SHIFT_SIZE)
#define MAX_STATECOUNT SHIFT_LEFT(1)
/*
* getFullStateId: merge state/substate pairs into single integers (used in calls of viterbi)
* precondition: state >= -1, substate >= -1
* if substate is -1 (none), state remains unchanged
* getStatePair: split integers back into state/substate pairs
*
*/
inline int getFullStateId(int state, SubstateId substate = SubstateId()) {
return SHIFT_LEFT (substate.fullId()) + state;
}
inline void getStatePair(int fullState, int& state, SubstateId& substate) {
state = (signed char)(fullState);
substate.set(SHIFT_RIGHT(fullState -state));
}
/*
* Predecessor in the state transition Graph
*/
struct Ancestor{
Ancestor(int newpos=0, Double newval=0.0) : pos(newpos), val(newval) {}
int pos;
Double val;
};
/*
* This is the base interface common to all state model classes
* (ExonModel, IntronModel, IGenicModel, UTRModel)
*
*/
class StateModel {
protected:
StateModel() {} // do not create StateModel object
public:
void initPredecessors(Matrix<Double>&, int self);
// virtual methods to be implemented by the specialised classes
virtual void registerPars(Parameters* parameters) {}
virtual void buildModel(const AnnoSequence* annoseq, int parIndex) =0;
virtual void printProbabilities(int zusNumber=1, BaseCount *bc = NULL, const char* suffix = NULL) =0;
virtual StateType getStateType() const = 0;
virtual void viterbiForwardAndSampling(ViterbiMatrixType&, ViterbiMatrixType&, int, int,
AlgorithmVariant, OptionListItem&) = 0;
virtual Double emiProbUnderModel(int , int) const = 0;
virtual void initAlgorithms(Matrix<Double>&, int) = 0;
virtual void updateToLocalGCEach(Matrix<Double>& trans, int cur) { };
virtual ~StateModel() {} // nothing to do here since class is purely abstract
// class functions
static void init();
static StateModel* newStateModelPtr (const char* path);
static void determineShortPatterns(const vector<Integer>& patcounts, int k, int minCount);
static void makeProbsFromCounts(vector<Double > &patprobs , const vector<Integer > &patcounts,
int k, Double pseudocount, Boolean shorten = false);
static void computeEmiFromPat(const vector<Double>& patprobs, vector<Double>& emiprobs, Integer k);
static void prepareViterbi(const char* dna, int len, const vector<StateType> &stateMap);
static void readProbabilities(int);
static void resetPars();
// this is done when gcIdx changes once per state class
static void updateToLocalGC(int idx, int from = -1, int to = -1);
static void readAllParameters();
static void storeGCPars(int);
static void resetModelCounts();
static bool isPossibleDSS(int pos) {
return pos >= 1 && pos <= dnalen-2 &&
(onGenDSS(sequence + pos) ||
seqFeatColl->isHintedDSS(pos, plusstrand));
}
static bool isPossibleRDSS(int pos) {
return pos >= 1 && pos <= dnalen-2 &&
(onGenRDSS(sequence + pos -1) ||
seqFeatColl->isHintedDSS(pos, minusstrand));
}
static bool isPossibleASS(int pos) {
return pos >= 1 && pos <= dnalen-2 &&
(onASS(sequence + pos -1) ||
seqFeatColl->isHintedASS(pos, plusstrand));
}
static bool isPossibleRASS(int pos) {
return pos >= 1 && pos <= dnalen-2 &&
(onRASS(sequence + pos) ||
seqFeatColl->isHintedASS(pos, minusstrand));
}
static void setSFC(SequenceFeatureCollection *psfc) {
seqFeatColl = psfc;
}
static void setPP(PP::SubstateModel* mdl) {
profileModel = mdl;
}
static void setCountRegion(int from, int to){countStart = from; countEnd = to;}
static int getActiveWindowStart(int);
static void setGCIdx(int idx) {gcIdx = idx;}
static void setContentStairs(ContentStairs *stairs) {cs = stairs;}
static int getGCIdx(int at){if (cs) return cs->idx[at]; else return -1;}
protected:
// variable unique to each model
vector<Ancestor> ancestor; // predecessor in the state transition graph
// class variables shared by all models
static const vector<StateType>* stateMap; // needed in exonmodel
static const char* sequence; // the sequence currently examined
static int dnalen;
static SequenceFeatureCollection* seqFeatColl;
static vector<Boolean>* shortpattern;
static PP::SubstateModel* profileModel;
static int countStart, countEnd; // this is the range of positions over which features are counted in CRF training
static int activeWinLen; // states ending before the active window will be deleted if they are not yet used
static int gcIdx; // GC content class index for all states
static ContentStairs *cs;
}; // class StateModel
/*
* classes Snippet*
* intelligently store and retrieve the sequence emission probabilities of the sequence from a to b
* for common pairs of a and b
*/
class SnippetListItem {
public:
SnippetListItem (Double p, int length) { prob = p; len = length; next = NULL;}
~SnippetListItem() {
if (next)
delete next;
}
Double prob;
int len;
SnippetListItem *next;
};
class SnippetList {
public:
Double getProb(int base, int &partlen);
SnippetList() {first = last = NULL;}
~SnippetList(){}
SnippetListItem *first, *last;
};
class SnippetProbs {
public:
SnippetProbs(const char* dna, int k, bool forwardStrand=true){
this->dna = dna;
this->k = k;
this->forwardStrand = forwardStrand;
if (dna)
n = strlen(dna);
else
n = 0;
snippetlist = new SnippetList*[n];
for (int i=0; i<n; i++)
snippetlist[i] = NULL;
}
Double getSeqProb(int base, int len);
~SnippetProbs () {
if (snippetlist) {
for (int i=0; i<n; i++) {
if (snippetlist[i]){
delete snippetlist[i]->first;
delete snippetlist[i];
}
}
delete [] snippetlist;
}
}
void setEmiProbs(vector<Double> *emiprobs) {
this->emiprobs = emiprobs;
}
void addProb(int base, int len, Double p);
protected:
const char *dna;
int n;
vector<Double> *emiprobs;
int k;
SnippetList **snippetlist;
bool forwardStrand;
private:
Double getElemSeqProb(int base, int len);
};
/*
* SegProbs - another class for caching probabilities of sequence segments
* idea: cumulative product => get segment prob in constant time via a ratio
*/
class SegProbs {
public:
SegProbs(const char* dna, int k, bool forwardStrand=true) : s2i(k+1) {
this->dna = dna;
this->k = k;
this->forwardStrand = forwardStrand;
if (dna)
n = strlen(dna);
else
n = 0;
cumProds = new Double[n+1];
}
~SegProbs () { delete [] cumProds; }
void setEmiProbs(vector<Double> *emiprobs, int from = -1, int to = -1);
Double getSeqProb(int from, int to);
int getN() { return n; }
protected:
const char *dna;
int n;
int k;
Seq2Int s2i;
vector<Double> *emiprobs;
Double *cumProds;
bool forwardStrand;
};
/*
* data structure to store possible endOfPred positions to iterate directly
* only over those endOfPred positions, that are possible in the inner loop of
* the Viterbi algorithm
*/
class EOPList {
public:
void clear() { possibleEndOfPreds.clear(); }
void init() {
eopit = possibleEndOfPreds.begin();
inCache = false;
}
void decrement(int &endOfPred);
void update(int endOfPred);
list<int> possibleEndOfPreds;
list<int>::iterator eopit;
bool inCache;
};
#endif // _STATEMODEL_HH
|