1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
|
/*****************************************************************************\
* Filename : exonmodel.hh
* Author : Mario Stanke
* Project : HMM
*
*
* Description: Training the exon model and implementing the algorithms.
*
*
* Date | Author | Changes
*------------|-----------------------|------------------------------------------
* ? | Stafilarakis | creations of the class
* 06.05.2002 | Stanke | debugging and testing
* 18.05.2002 | Stanke | added length distributions
* 24.03.2003 | Mario Stanke | introducing shadow states
* 31.08.2005 | Mario Stanke | sequence is no longer a member of gene
\******************************************************************************/
#ifndef _EXONMODEL_HH
#define _EXONMODEL_HH
#include "statemodel.hh"
/*
* The reading frame of an exon is the position of the nucleotide following the exon
* in its codon starting counting at 0:
* forward strand:
* *** | *** index 0
* *** * | ** *** index 1
* *** ** | * *** index 2
* reverse strand:
* *** | *** index 2
* *** * | ** *** index 1
* *** ** | * *** index 0
*/
class ExonModelError : public ProjectError {
public:
ExonModelError(string msg) : ProjectError(msg) {}
};
/*
* Open Reading Frame (ORF), part of the exon model
*/
class OpenReadingFrame{
public:
OpenReadingFrame(const char* dna, int _max_exon_length, int _n);
OpenReadingFrame() {}
~OpenReadingFrame() {}
int leftmostExonBegin(int frame, int base, bool forward);
private:
// used internally, these call GeneticCode
static bool isStopcodon(const char* dna);
static bool isRCStopcodon(const char* dna);
vector<Integer> nearestStopForward;
vector<Integer> nearestStopReverse;
int n;
int max_exon_length;
// static bool amber, ochre, opal; // now taken care of by GeneticCode
};
/*
* ExonModel
*/
class ExonModel : public StateModel{
public:
ExonModel();
~ExonModel();
StateType getStateType() const {
return etype;
}
void buildModel ( const AnnoSequence* annoseq, int parIndex );
void registerPars ( Parameters* parameters);
void printProbabilities ( int parIndex, BaseCount *bc, const char* suffix = NULL );
void viterbiForwardAndSampling(ViterbiMatrixType&, ViterbiMatrixType&, int, int,
AlgorithmVariant, OptionListItem&);
void processOvlpOption(ViterbiMatrixType& , ViterbiMatrixType&, AlgorithmVariant&,
int state, int endOfPred, int beginOfBioExon, Double &maxProb,
Double emiProb, Double &fwdsum, OptionsList *, OptionListItem &oli, int base) const;
Double emiProbUnderModel(int begin, int end) const;
Double endPartEmiProb(int end) const;
Double notEndPartEmiProb(int beginOfStart, int right, int frameOfRight, Feature *exonparts) const;
void initAlgorithms(Matrix<Double>&, int);
static void storeGCPars(int idx);
// class functions
static void init();
static void resetPars() {
initAlgorithmsCalled = false;
}
static void updateToLocalGC(int from = -1, int to = -1);
static void readProbabilities(int parIndex);
static void readAllParameters();
static double *getCodonUsage();
static void resetModelCount(){exoncount = 0;};
static int getMaxStateLen() { return Constant::max_exon_len + trans_init_window; }
static void setORF() {
if (orf)
delete orf;
orf = new OpenReadingFrame(sequence, Constant::max_exon_len, dnalen);
initAlgorithmsCalled = false;
}
static vector<Double> lenDistSingle; // Length distribution of Single exons (length of biol. exon)
static vector<Double> lenDistInitial; // Length distribution of Initial exons (length of biol. exon)
static vector<Double> lenDistInternal; // Length distribution of Internal exons (length of biol. exon)
static vector<Double> lenDistTerminal; // Length distribution of Terminal exons (length of biol. exon)
private:
void processExons(const Gene* gene);
void processSingleExon(const State* exon);
void processInitialExon(const State* exon);
void processInternalExon(const State* exon);
void processTerminalExon(const State* exon);
void processInnerSequence(const char* begin, const char* end, int modeltype = 0);
void buildProbabilities ( );
Double seqProb(int endOfStart, int right, int frameOfRight) const;
Double eTermSeqProb(int left, int right, int frameOfRight) const;
Double initialSeqProb(int left, int right, int frameOfRight) const;
void computeLowerOrderPats();
void computeCMFromBW ();
// internal class functions
static void computeLengthDistributions( );
static void fillTailsOfLengthDistributions( );
static int getBaseOffset(StateType type);
static int getInnerPartEndOffset(StateType type);
StateType etype;
Integer win, // reading frame of this state (fixed)
curwin; // current reading frame during training
int beginPartLen; // constant for every etype: endOfPred = beginOfStart - beginPartLen -1
int innerPartOffset; // beginOfStart = beginOfBioExon + innerPartOffset
int innerPartEndOffset; // right = endOfBioExon - innerPartEndOffset
int baseOffset; // base = endOfBioExon - baseOffset
Integer gweight;
// int prewin;
// class variables
static vector<Integer> patterncount[3]; // {0,1,2}x{acgt}^(k+1), the reading frame is
// the position of the emitted (last) nucleotide
static vector<Integer> initpatterncount[3]; // like above, just the first nucleotides of a gene
static vector<Integer> etpatterncount[3] ; // internal exon ternimal part
static Integer k; // order of the content MM
static Integer etorder; // order of the exon terminating motif
static Integer etpseudocount;// pseudocount for the exon terminating motif
static Integer min_exon_length;
static Integer trans_init_window;
static FramedPatMMGroup emiprobs;
static FramedPatMMGroup *GCemiprobs;
static vector<Double> initemiprobs[3];
static vector<Double> **GCinitemiprobs;
static vector<Double> etemiprobs[3];
static vector<Double> **GCetemiprobs;
static vector<Integer> numExonsOfType;
static vector<Integer> numHugeExonsOfType; // number of exons exceeding the maximal length
// modelled by the length distribution
static Integer numSingle, numInitial, numInternal, numTerminal;
static Integer numHugeSingle, numHugeInitial, numHugeInternal, numHugeTerminal;
static Matrix<vector<Double> > Pls;
static Matrix<vector<Double> >* GCPls; // array with one matrix per GC content class
static Integer exoncount;
static Boolean hasLenDist;
static Integer gesbasen[3];
static Double patpseudo; // pseudocount for patterns in sequence
static Integer exonLenD; // number of exons of length <= d
static Integer minPatSum; // for the decision to shorten the emission pattern
static vector<Integer> lenCountSingle; // Length count of Single exons (length of biol. exon)
static vector<Integer> lenCountInitial; // Length count of Initial exons (length of biol. exon)
static vector<Integer> lenCountInternal; // Length count of Internal exons (length of biol. exon)
static vector<Integer> lenCountTerminal; // Length count of Terminal exons (length of biol. exon)
static double slope_of_bandwidth;// for smoothing
static Integer minwindowcount; // see class Smooth in commontrain.hh
static Motif *transInitMotif; // weight matrix before the translation initiation
static Motif *GCtransInitMotif; // array for each GC content class
static BinnedMMGroup transInitBinProbs; // CRF-features based on transInitMotif
static BinnedMMGroup *GCtransInitBinProbs;// for all GC content classes
static Integer tis_motif_memory; // order of the trans init motif
static Integer tis_motif_radius; // radius for the smoothing of the trans init motif
static Motif **etMotif; // weight matrices before the donor splice site (3 frames)
static Motif ***GCetMotif; // array with motifs for each GC content class
static int numModels;
static Double *modelStartProbs;
static int ilend;
static OpenReadingFrame *orf;
static int ochrecount, ambercount, opalcount; // frequencies of the 3 stop codons
static bool initAlgorithmsCalled, haveORF;
static int lastParIndex; // GC-index of current parameter set
static int verbosity;
static list<string> *tiswins; // holds translation initiation windows (for CRF training)
static int startcounts[]; // how often did the different start codons appear during training?
static int lenboostL; // parameters for boosting length distribution of single and initial exons
static double lenboostE; // length above L are improved, the more the larger E is
};
#endif // _EXONMODEL_HH
|