File: icm.hh

package info (click to toggle)
tigr-glimmer 3.02b-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 13,948 kB
  • sloc: cpp: 24,416; awk: 232; csh: 220; makefile: 147; sh: 51
file content (303 lines) | stat: -rw-r--r-- 8,002 bytes parent folder | download | duplicates (12)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
//  A. L. Delcher
//
//  File:  icm.hh
//
//  Last Modified:  3 May 2004
//
//  Declarations for the Interpolated Context Model (ICM) used by Glimmer



#ifndef _ICM_H_INCLUDED
#define _ICM_H_INCLUDED


#include  "delcher.hh"
#include  "gene.hh"

using namespace std;


#define  STORE_MUT_INFO  1
  // If true, save best mutual information in training node and include it
  // in ascii dump of model


const int  ALPHABET_SIZE = 4;
  // Number of characters is strings being modelled
const int  ALPHA_SQUARED = (ALPHABET_SIZE * ALPHABET_SIZE);
  // Square of the preceding
const char  ALPHA_STRING [] = "acgt";
  // String of allowed characters in alphabet, used for converting
  // characters to subscripts

const int  NUM_CHI2_ENTRIES = 7;
  // Number of entries in following tables
const float  CHI2_VAL [NUM_CHI2_ENTRIES]
    = {2.37, 4.11, 6.25, 7.81, 9.35, 11.3, 12.8};
  // Table of chi-squared values
const float  CHI2_SIGNIFICANCE [NUM_CHI2_ENTRIES]
    = {0.50, 0.75, 0.90, 0.95, 0.975, 0.99, 0.995};
  // Corresponding significance values

const int  DEFAULT_MODEL_LEN = 12;
  // Number of bases in window used for scoring.  The last base
  // in this window is Markov dependent on the preceding bases
const int  ID_STRING_LEN = 150;
  // Length of string written at start of icm file
const int  DEFAULT_MODEL_DEPTH = 7;
  // The max number of bases to restrict (use) in the window
const unsigned  NUM_FIXED_LENGTH_PARAMS = 6;
  // The number of binary integer parameters at the start of the
  // fixed-length icm file
const double  MUT_INFO_BIAS = 0.03;
  // A mutual information position on the left (i.e., earlier in the
  // context window) must be at least this much better than one
  // to its right in order to be used.
const int  DEFAULT_PERIODICITY = 3;
  // Number of models to cycle through in each string.  Must be at least 1

const char  MAX_MI_CHAR = '*';
  // Indicates position in context of current node's max mutual info
  // Used for labels of ascii version of output
const char  SEPARATOR_CHAR = '|';
  // Used to separate groups of periodicity characters in ascii version
  // of output

const double  MAX_LOG_DIFF = -46.0;
  // Treat differences larger than this as essentially infinite
  // to avoid over/underflow problems.  This is approximately
  // 1e-20 difference in probs
const double  MUT_INFO_EPSILON = 1e-4;
  // Smaller than this is regarded as zero mutual information
const double  PSEUDO_COUNT = 0.001;
  // Part of a count inherited in proportion to the parent's probabilities
  // to prevent 0 counts.
const int  SAMPLE_SIZE_BOUND = 400;
  // If the counts for the bases sum to less than this value,
  // use chi^2 formula to weight the probability

const int  ICM_VERSION_ID = 200;
  // Integer version number stored in model prefix for compatibility check


#define  PARENT(x) ((int) ((x) - 1) / ALPHABET_SIZE) 
//  Macro that returns the parent of x in our tree



// Temporary so it will compile

enum  ICM_Model_t
    {UNKNOWN_TYPE};


struct  ICM_Training_Node_t
  {
   short int  mut_info_seq;
     // The base that is to be restricted in mut_info_pos
   int  (* count) [ALPHA_SQUARED];
     // count [x] [y] (where y is a pair of letters (e.g., aa, ac, ag, .. tt))
     // is the number of times the first base of y occurs at position x
     // and the last base of y occurs at position (model_len - 1)
  };


struct  ICM_Score_Node_t
  {
   short int  mut_info_pos;
#if  STORE_MUT_INFO
   float  mut_info;
#endif
   float  prob [ALPHABET_SIZE];
  };


class  ICM_t
  {
  protected:
   bool  empty;
   int  model_len;
     // Number of bases in window
   int  model_depth;
     // Most levels in model tree, i.e., most number of positions
     // to be dependent on
   int  periodicity;
     // Number of different models, alternating cyclicly
   int  num_nodes;
     // Number of nodes in tree
   ICM_Score_Node_t  * * score;

  public:
   ICM_t
       (int m = DEFAULT_MODEL_LEN,
        int d = DEFAULT_MODEL_DEPTH,
        int p = DEFAULT_PERIODICITY);
   ~ ICM_t
       ();

   int  Get_Model_Len
       (void)
     { return  model_len; }
   int  Get_Periodicity
       (void)
     { return  periodicity; }

   void  Build_Indep_WO_Stops
       (double gc_frac, const vector <const char *> & stop_codon);
   void  Build_Reverse_Codon_WO_Stops
    (double codon_prob [64], const vector <const char *> & stop_codon);
   void  Cumulative_Score
       (const string & s, vector <double> & score, int frame)  const;
   void  Cumulative_Score_String
       (char * string, int len, int frame, double * score);
   void  Display
       (FILE * fp);
   void  Full_Window_Distrib
       (char * string, int frame, float * dist);
   double  Full_Window_Prob
       (const char * string, int frame)  const;
   void  Input
       (FILE * fp);
   void  Output
       (FILE * fp, bool binary_form);
   void  Output_Node
       (FILE * fp, ICM_Score_Node_t * node, int id, int frame, bool binary_form);
   double  Partial_Window_Prob
       (int predict_pos, const char * string, int frame)  const;
   void  Read
       (char * path);
   double  Score_String
       (const char * string, int len, int frame)  const;
   void  Set_Label_String
       (char * label, int id, int frame);
   void  Write_Header
       (FILE * fp, bool binary_form);
  };


class  ICM_Training_t  :  public ICM_t
  {
  private:
   ICM_Training_Node_t  * * train;
   int  (* count_memory) [ALPHA_SQUARED];
     // Holds dynamically allocated block for all counts to avoid
     // loss at tail from separate callocs.  Saved here to allow
     // freeing it in the destructor.

   void  Complete_Tree
       (const vector <char *> & data);
   void  Count_Char_Pairs_Restricted
       (const char * string, int level);
   ICM_Training_Node_t *  Get_Training_Node
       (const char * w, int frame, int level);
   void  Interpolate_Probs
       (int frame, int sub, int ct []);
   void  Take_Logs
       (void);

  public:
   ICM_Training_t
       (int m = DEFAULT_MODEL_LEN,
        int d = DEFAULT_MODEL_DEPTH,
        int p = DEFAULT_PERIODICITY);
   ~ ICM_Training_t
       ();

   void  Train_Model
       (const vector <char *> & data);
  };


class  Fixed_Length_ICM_t
  {
  private:
   int  length;
   int  max_depth;
   int  special_position;
   ICM_Model_t  model_type;
   int  * permutation;
   vector <ICM_t *> sub_model;

  public:
   Fixed_Length_ICM_t
       (int len = 1, int sp = 0, int * perm = NULL, ICM_Model_t = UNKNOWN_TYPE);
   ~ Fixed_Length_ICM_t
       ();

   int  Get_Length
       (void)
     {
      return  length;
     }
   double  Score_Window
       (char * w);

   // Match MDD_t interface
   int  getModelLength
       (void)
     {
      return  length;
     }
   int  getModelType
       (void)
     {
      return  model_type;
     }
   int  getSpecialPosition
       (void)
     {
      return  special_position;
     }
   void  read
       (const char * path);
   double  subrange_score
       (char * w, int lo, int hi);
   double  score
       (char * w)
     {
      return  Score_Window (w);
     }
  };


class  Fixed_Length_ICM_Training_t
  {
  private:
   int  length;
   int  max_depth;
   int  special_position;
   ICM_Model_t  model_type;
   int  * permutation;
   vector <ICM_Training_t *> sub_model;

  public:
   Fixed_Length_ICM_Training_t
       (int len, int md, int sp, int * perm, ICM_Model_t mt = UNKNOWN_TYPE);
   ~ Fixed_Length_ICM_Training_t
       ();
   void  Output
       (FILE * fp, bool binary_form);
   void  Train_Model
       (vector <char *> & data);
   void  Write_Header
       (FILE * fp, bool binary_form);
  };



void  Count_Char_Pairs
    (int ct [] [ALPHA_SQUARED], char * string, int w, int period);
void  Count_Single_Chars
    (int ct [ALPHABET_SIZE], char * string, int w, int period);
double  Get_Mutual_Info
    (int ct [], int n, int sum);
void  Permute_Data
    (vector <char *> & data, int * perm);
void  Permute_String
    (char * s, int * perm, int n);
int  Subscript
    (char ch);

#endif