File: baumwelch.hh

package info (click to toggle)
augustus 3.3.2%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 486,188 kB
  • sloc: cpp: 51,969; perl: 20,926; ansic: 1,251; makefile: 935; python: 120; sh: 118
file content (119 lines) | stat: -rw-r--r-- 3,289 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/*****************************************************************************\
 * Filename : baumwelch.hh
 * Author   : Mario Stanke
 * Project  : HMM
 * Version  : 0.1
 *
 *
 * Description: Implementation of a special case of Baum-Welch parameter estimation
 *
 *
 * Date       |   Author              |  Changes
 *------------|-----------------------|------------------------------------------
 * 15.08.2002 | Stanke                | creation of the class
\******************************************************************************/

#ifndef _BAUMWELCH_HH
#define _BAUMWELCH_HH

// project includes
#include "types.hh"
#include "gene.hh"
#include "contentmodel.hh"

// standard C/C++ includes
#include <iostream>


/*
 * class TrainingData
 * a sequence together with some information
 */
class TrainingData {
public:
    TrainingData(){
	type = -1;            // type usually not known
	next = NULL;
	name = NULL;
    }

    char *seq;
    int seqLen;
    int frame;   // reading Frame of the first nucleotide
    int type;
    int weight;
    char *name;
    TrainingData *next;
};


class BaumWelch {
public:

    BaumWelch(int k, int numFrames, int modelTypes = 2){
	this->k = k;
	this->numFrames = numFrames;
	this->modelTypes = modelTypes;
	models = NULL;
	modelTypeProbs = NULL;
	inputSeqs = NULL;

    }
    void initialTypeParameters(Double patpseudocount = 0.0);
    void initialRandomParameters();
    void initialExonTrainingData(AnnoSequence *seqList);
    void initialIntronTrainingData(const Gene* geneList); 
    void classifyTrainingData();  
    void setK(int k){
	if (k != this->k) {
	    delete models;
	    models = NULL;
	    delete modelTypeProbs;
	    modelTypeProbs = NULL;
	    this->k = k;
	}
    };
    void setTrainingData(TrainingData *inputSeqs){
	if (this->inputSeqs) {
	    // TODO
	}
	this->inputSeqs = inputSeqs;
    }
    Double reestimate(Double patpseudocount = 0.0);
    void printAllContentModels(ostream &out);
    void readAllContentModels(istream &in);
    void printInputSeqs(ostream &out);
    TrainingData* getInputSeqs(){return inputSeqs;};
    ContentModel getContentModel(int type);
    int getK() { return k;}
    int getModelTypes() { return modelTypes;}
    int getNumFrames() { return numFrames;}
    Double getModelTypeProb(int i) { return (*modelTypeProbs)[i];}
    void addTrainingData(TrainingData *td){
	td->next = inputSeqs;
	inputSeqs = td;
	numData++;
    }
private:
    //Double seqProbUnderModel(Matrix<Double> &patprob, char *s, int len, int frame); 

private:
    int k;                  // order of the HMM
    int modelTypes;         // number of different models
    int numFrames;          // number of different reading frames = periodicity
    int numData;            // number of input sequences

    /* For each model type we have a matrix which holds in row f (f=0,1,2, frame) 
     * and row p (p = 1..4^(k+1), number of patrern) the probability of that pattern
     * ending in reading frame f in the model of the respective type.
     */
    vector<ContentModel> *models;
    vector<Double> *modelTypeProbs;       // a priori probability of that model
    TrainingData *inputSeqs;
};


void printContentModels(ostream& out, vector<ContentModel> *models,
			vector<Double> *modelTypeProbs, int k);

#endif