File: gene.hh

package info (click to toggle)
tigr-glimmer 3.02b-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 13,948 kB
  • sloc: cpp: 24,416; awk: 232; csh: 220; makefile: 147; sh: 51
file content (259 lines) | stat: -rw-r--r-- 6,069 bytes parent folder | download | duplicates (12)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
//  A. L. Delcher
//
//  File:  gene.hh
//
//  Last Modified:  23 October 2003
//
//  DNA- and gene-related routine declarations



#ifndef  __GENE_HH_INCLUDED
#define  __GENE_HH_INCLUDED

#include  <cstdio>
#include  <iostream>
#include  <string>
#include  <vector>

#include  "xlate_tables.hh"


//  Codon bit masks.  A codon is three 4-bit nibbles, one per base, with
//  the first base in the most-significant nibble.  Base bits are
//  a = 0x1, c = 0x2, g = 0x4, t = 0x8; an ambiguity code is the OR of
//  the bases it covers (r = a|g = 0x5, y = c|t = 0xa).  Each constant is
//  named after the codon it spells and is written below as
//  (first << 8) | (second << 4) | third.
const unsigned  ATG_MASK = (0x1u << 8) | (0x8u << 4) | 0x4u;
const unsigned  CAA_MASK = (0x2u << 8) | (0x1u << 4) | 0x1u;
const unsigned  CAC_MASK = (0x2u << 8) | (0x1u << 4) | 0x2u;
const unsigned  CAG_MASK = (0x2u << 8) | (0x1u << 4) | 0x4u;
const unsigned  CAT_MASK = (0x2u << 8) | (0x1u << 4) | 0x8u;
const unsigned  CAY_MASK = (0x2u << 8) | (0x1u << 4) | 0xau;
const unsigned  CTA_MASK = (0x2u << 8) | (0x8u << 4) | 0x1u;
const unsigned  CTG_MASK = (0x2u << 8) | (0x8u << 4) | 0x4u;
const unsigned  GTG_MASK = (0x4u << 8) | (0x8u << 4) | 0x4u;
const unsigned  RTG_MASK = (0x5u << 8) | (0x8u << 4) | 0x4u;
const unsigned  TAA_MASK = (0x8u << 8) | (0x1u << 4) | 0x1u;
const unsigned  TAG_MASK = (0x8u << 8) | (0x1u << 4) | 0x4u;
const unsigned  TAR_MASK = (0x8u << 8) | (0x1u << 4) | 0x5u;
const unsigned  TCA_MASK = (0x8u << 8) | (0x2u << 4) | 0x1u;
const unsigned  TGA_MASK = (0x8u << 8) | (0x4u << 4) | 0x1u;
const unsigned  TRA_MASK = (0x8u << 8) | (0x5u << 4) | 0x1u;
const unsigned  TTA_MASK = (0x8u << 8) | (0x8u << 4) | 0x1u;
const unsigned  TTG_MASK = (0x8u << 8) | (0x8u << 4) | 0x4u;
const unsigned  TYA_MASK = (0x8u << 8) | (0xau << 4) | 0x1u;
const unsigned  YTA_MASK = (0xau << 8) | (0x8u << 4) | 0x1u;

//  Low eight bits, i.e. the two least-significant base nibbles.
//  NOTE(review): presumably applied before shifting a new base in --
//  confirm against the users of this constant.
const unsigned  SHIFT_MASK = (0xfu << 4) | 0xfu;

//  Single-bit flags, one distinct bit each so they can be OR-ed together.
//  NOTE(review): presumably the bits kept in Gene_t's status word --
//  confirm at the call sites.
const unsigned  DELETE_FLAG = 1u << 0;
const unsigned  TRUNCATED_START_FLAG = 1u << 1;
const unsigned  TRUNCATED_STOP_FLAG = 1u << 2;

//  Sizing constants.
//  NOTE(review): the names suggest an initial buffer size, a growth
//  increment and a maximum text-line length; their users are outside
//  this header -- confirm before relying on that reading.
const long  INCR_SIZE = 10000L;
const long  INIT_SIZE = 10000L;
const int  MAX_LINE = 300;

//  Default entropy profiles, each a brace-enclosed initializer list of
//  20 values (the same length as the  ep [20]  parameter of
//  Counts_To_Entropy_Profile).  Each list sums to 1 within rounding.
//  NOTE(review): presumably one entry per amino acid, with POS/NEG
//  meaning profiles of coding vs. non-coding sequence -- confirm.
#define  DEFAULT_POS_ENTROPY_PROF  { \
    0.08468, 0.01606, 0.05739, 0.05752, 0.04328, \
    0.07042, 0.02942, 0.05624, 0.04442, 0.05620, \
    0.03029, 0.03975, 0.05116, 0.04098, 0.05989, \
    0.08224, 0.05660, 0.06991, 0.02044, 0.03310 }
#define  DEFAULT_NEG_ENTROPY_PROF  { \
    0.07434, 0.03035, 0.05936, 0.04729, 0.05662, \
    0.07704, 0.05777, 0.05328, 0.03360, 0.05581, \
    0.01457, 0.03718, 0.04594, 0.05977, 0.08489, \
    0.05990, 0.04978, 0.07227, 0.01050, 0.01974 }
//  Default start- and stop-codon spellings: three lowercase,
//  NUL-terminated strings each.  Both the pointers and the characters
//  they point at are const.
const char * const  DEFAULT_START_CODON [] = { "atg", "gtg", "ttg" };
const char * const  DEFAULT_STOP_CODON [] = { "taa", "tag", "tga" };



//  A codon stored as a 12-bit pattern, which allows ambiguous (IUPAC)
//  bases.  Only the constructor,  Clear  and  Print  are defined here;
//  the remaining members are defined outside this header.
class  Codon_t
  {
  private:
   static const unsigned  shift_mask = 0xff;
     // low two nibbles of  data
   static const unsigned  reverse_shift_mask = 0xff0;
     // high two nibbles of  data
     // NOTE(review): presumably the bits  Shift_In  and
     // Reverse_Shift_In  keep before adding a base -- confirm in
     // their definitions.

   unsigned int  data;
     // Represent the codon as a 12-bit string.  Each character
     // is 4 bits, representing whether it can be a, c, g or t.
     // a is 1, c is 2, g is 4 and t is 8.
     // E.g., 'a' is 0001; IUPAC character 's' (which is 'c' or 'g')
     // is 0110.


  public:
   // Start with the all-zero pattern.
   Codon_t ()
     : data (0x0)
     {}

   // Reset to the all-zero pattern.
   void  Clear
       ()
     { data = 0x0; }

   // Test this codon against the entries of  a ;  which  is an
   // output parameter.
   // NOTE(review): presumably  which  receives the subscript of the
   // matching entry -- confirm in the definitions.
   bool  Can_Be
       (const std::vector <Codon_t> & a, int & which);
   bool  Must_Be
       (const std::vector <Codon_t> & a, int & which);

   // Write the pattern to  fp  as at least three zero-padded
   // lowercase hex digits.  Reads state only, hence const.
   void  Print
       (std::FILE * fp)  const
     { std::fprintf (fp, "%03x", data); }

   void  Reverse_Complement
       (void);
   void  Reverse_Shift_In
       (char ch);
   void  Set_From
       (const char * s);
   void  Shift_In
       (char ch);
  };


//  Location and lengths of an open reading frame.  A plain value
//  holder: every member has a trivial getter and setter.
class  Orf_t
  {
  protected:
   int  stop_position;
     // first base (i.e., lowest subscript) counting positions
     // starting at 1
   int  frame;
     // is determined by the leftmost position of the stop codon,
     // positions starting at 1, positive for forward, negative for
     // reverse
   int  orf_len;
   int  gene_len;
     // NOTE(review): units of the two lengths (presumably bases) are
     // not stated anywhere in this header -- confirm with the callers.

  public:
   // Zero every member.  orf_len  and  gene_len  used to be left
   // uninitialized, so calling their getters on a fresh object read
   // indeterminate values.
   Orf_t  ()
     : stop_position (0), frame (0), orf_len (0), gene_len (0)
     {}

   int  Get_Frame  (void)  const
     { return  frame; }
   int  Get_Gene_Len  (void)  const
     { return  gene_len; }
   int  Get_Orf_Len  (void)  const
     { return  orf_len; }
   int  Get_Stop_Position  (void)  const
     { return  stop_position; }

   void  Set_Frame  (int i)
     { frame = i; }
   void  Set_Gene_Len  (int i)
     { gene_len = i; }
   void  Set_Orf_Len  (int i)
     { orf_len = i; }
   void  Set_Stop_Position  (int i)
     { stop_position = i; }

  };


//  Four doubles; the element type of  PWM_t 's column vector.
//  NOTE(review): presumably one value per nucleotide, subscripted as
//  Nucleotide_To_Subscript  returns -- confirm.
struct  DNA_vect_t
  {
   double  p [4];   // one slot per subscript 0 .. 3
  };


class  PWM_t
  {
  private:
   vector <DNA_vect_t>  col;

  public:
   PWM_t  ()
     {}

   void  Check  (void)
     { cerr << "PWM_t Check:  size = " << col . size () << endl; }
   void  Counts_To_Prob
       (void);
   double  Column_Score
       (char ch, int col)  const;
   bool  Is_Empty  (void)  const
     { return  col . empty (); }
   void  Make_Log_Odds_WRT_GC
       (double gc_frac);
   void  Print
       (FILE * fp);
   void  Probs_To_Logs
    (void);
   bool  Read
       (FILE * fp);
   int  Width  (void)  const
     { return   int (col . size ()); }

   PWM_t &  operator =
       (const PWM_t & src);
  };


//  An  Orf_t  extended with a status word, an id and a score.
class  Gene_t  :  public Orf_t
  {
  private:
   unsigned int  status;
     // bit set; individual bits are OR-ed in by  Set_Status_Bit
     // NOTE(review): presumably the *_FLAG constants -- confirm.
   int  id;
   double  score;

  public:
   // Zero every member.  id  and  score  used to be left
   // uninitialized, so  Get_ID / Get_Score  on a fresh object read
   // indeterminate values.
   Gene_t  ()
     : status (0), id (0), score (0.0)
     {}
   // Copy the  Orf_t  part from  orf  and zero the members added here.
   Gene_t  (const Orf_t & orf)
     : Orf_t (orf), status (0), id (0), score (0.0)
     {}

   int  Get_ID  (void)  const
     { return  id; }
   double  Get_Score  (void)  const
     { return  score; }
   unsigned int  Get_Status  (void)  const
     { return  status; }
   // Defined outside this header.
   unsigned int  Get_Status_Bit
       (unsigned int u)  const;

   void  Set_ID  (int i)
     { id = i; }
   void  Set_Score  (double d)
     { score = d; }
   // Replace the whole status word.
   void  Set_Status  (unsigned int u)
     { status = u; }
   // Turn on the bits of  u , leaving the others unchanged.
   void  Set_Status_Bit  (unsigned int u)
     { status |= u; }

   // Turn every status bit off.
   void  Clear_Status  (void)
     { status = 0; }
  };



//  Free-function declarations; every definition lives outside this
//  header, so the notes below state only what the signatures show.
//  Standard-library types are spelled with  std::  so the declarations
//  do not depend on a using-directive in the including file.

// Binary predicate over two genes.
// NOTE(review): presumably an ordering by id for sorting -- confirm.
bool  By_ID
    (const Gene_t & a, const Gene_t & b);
unsigned  Ch_Mask
    (char);
int  Char_Sub
    (char ch);
// transl_tabl  defaults to 1 when omitted.
char  Codon_Translation
    (const char * c, int transl_tabl = 1);
char  Complement
    (char ch);
// The array bounds are documentation only; both parameters are
// passed as pointers.
void  Counts_To_Entropy_Profile
    (int count [26], double ep [20]);
int  Filter
    (char ch);
void  Find_Stop_Codons
    (const char * s, int t, int stop []);
int  First_In_Frame_Stop
    (char * s, int frame);
// t  is written through a non-const reference;  s  is read-only.
void  Forward_Strand_Transfer
    (std::string & t, const std::string & s, int start, int len);
// The next four take a codon as an unsigned value.
// NOTE(review): presumably in the *_MASK bit encoding -- confirm.
int  Is_Forward_Start
    (unsigned codon);
int  Is_Forward_Stop
    (unsigned codon);
int  Is_Reverse_Start
    (unsigned codon);
int  Is_Reverse_Stop
    (unsigned codon);
int  Is_Start
    (const char * s);
int  Is_Stop
    (const char * s);
int  Nucleotide_To_Subscript
    (char ch);
// t  and  size  are non-const references, so the callee can replace
// the buffer pointer and update its recorded size.
int  Read_String
    (std::FILE * fp, char * & t, long int & size, char name [], int partial);
// Overloaded for a mutable C string and for a  std::string .
void  Reverse_Complement
    (char * s);
void  Reverse_Complement
    (std::string & s);
// t  is written through a non-const reference;  s  is read-only.
void  Reverse_Strand_Transfer
    (std::string & t, const std::string & s, int start, int len);
// stop_codon  and  errflg  are output parameters.
void  Set_Stop_Codons_By_Code
    (std::vector <const char *> & stop_codon, int code, bool & errflg);


#endif