File: alphabet.c

package info (click to toggle)
wise 2.4.1-10
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 25,060 kB
  • sloc: ansic: 276,350; makefile: 996; perl: 886; lex: 93; sh: 82; yacc: 81; csh: 16
file content (393 lines) | stat: -rw-r--r-- 11,693 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
/************************************************************
 * HMMER - Biological sequence analysis with profile-HMMs
 * Copyright (C) 1992-1997 Sean R. Eddy
 *
 *   This source code is distributed under the terms of the
 *   GNU General Public License. See the files COPYING and
 *   GNULICENSE for details.
 *
 ************************************************************/

/* alphabet.c
 * Configuration of the global symbol alphabet information.
 */

#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "config.h"
#include "structs.h"
#include "funcs.h"
#include "squid.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

static void set_degenerate(char iupac, char *syms);


/* Function: DetermineAlphabet()
 * 
 * Purpose:  From a set of sequences (raw or aligned), make a good
 *           guess whether they're Nucleic, Amino, or something
 *           else, and set alphabet accordingly. 
 *           
 *           If Alphabet_type is already set, that means our
 *           autodetection was overridden from the command line, 
 *           and we just set the other globals accordingly.  
 */
void
DetermineAlphabet(char **rseqs, int  nseq)
{
  int idx;
  int other, nucleic, amino;
  
  /* Autodetection of alphabet type.
   */
  other = nucleic = amino = 0;
  for (idx = 0; idx < nseq; idx++) {
    switch (Seqtype(rseqs[idx])) {
    case kRNA:      nucleic++;   break;
    case kDNA:      nucleic++;   break;
    case kAmino:    amino++;     break;
    case kOtherSeq: other++;     break;
    default: Die("No such alphabet type");
    }
  }

  if      (nucleic == nseq) Alphabet_type = hmmNUCLEIC;
  else if (amino   == nseq) Alphabet_type = hmmAMINO;
  else if (nucleic > amino && nucleic > other) {
    Warn("Looks like nucleic acid sequence, hope that's right");
    Alphabet_type = hmmNUCLEIC;
  }
  else if (amino > nucleic && amino > other) {
    Warn("Looks like amino acid sequence, hope that's right");
    Alphabet_type = hmmAMINO;
  }
  else Die("Sorry, I can't tell if that's protein or DNA"); 

  /* Now set up the alphabet.
   */
  SetAlphabet(Alphabet_type);
}


/* Function: SetAlphabet()
 * 
 * Purpose:  Set the alphabet globals, given an alphabet type
 *           of either hmmAMINO or hmmNUCLEIC.
 */
void
SetAlphabet(int type)
{
  int x;

  switch(type) { /* Alphabet is not a string - careful! */
  case hmmAMINO: 
    Alphabet_type     = type;
    /* Bug fix folded in from sean folded in from HMMER 2.3.1 */
    strcpy(Alphabet, "ACDEFGHIKLMNPQRSTVWYBZX");
    Alphabet_size     = 20; 
    Alphabet_iupac    = 23;
    for (x = 0; x < Alphabet_iupac; x++) {
      memset(Degenerate[x], 0, Alphabet_size);
    }
    for (x = 0; x < Alphabet_size; x++) {
      Degenerate[x][x] = 1;
      DegenCount[x] = 1;
    }
    set_degenerate('B', "ND");
    set_degenerate('Z', "QE");
    set_degenerate('X', Alphabet);
    break;
  case hmmNUCLEIC:
    Alphabet_type     = type;
    strncpy(Alphabet, "ACGTUNRYMKSWHBVDX", 17); 
    Alphabet_size     = 4; 
    Alphabet_iupac    = 17;
    for (x = 0; x < Alphabet_iupac; x++) {
      memset(Degenerate[x], 0, Alphabet_size);
    }
    for (x = 0; x < Alphabet_size; x++) {
      Degenerate[x][x] = 1;
      DegenCount[x] = 1;
    }
    set_degenerate('U', "T");
    set_degenerate('N', Alphabet);
    set_degenerate('X', Alphabet);
    set_degenerate('R', "AG");
    set_degenerate('Y', "CT");
    set_degenerate('M', "AC");
    set_degenerate('K', "GT");
    set_degenerate('S', "CG");
    set_degenerate('W', "AT");
    set_degenerate('H', "ACT");
    set_degenerate('B', "CGT");
    set_degenerate('V', "ACG");
    set_degenerate('D', "AGT");
    break;
  default: Die("No support for non-nucleic or protein alphabets");  
  }
}

/* Function: SymbolIndex()
 * 
 * Purpose:  Convert a symbol to its index in Alphabet[].
 *           Bogus characters are converted to 'X'.
 *           More robust than the SYMIDX() macro but
 *           presumably slower.
 */ 
int
SymbolIndex(char sym)
{
  char *s;
  return ((s = strchr(Alphabet, toupper(sym))) == NULL) ?
	  Alphabet_iupac-1 : s - Alphabet;
} 


/* Function: DigitizeSequence()
 * 
 * Purpose:  Internal representation of a sequence in HMMER is
 *           as a char array. 1..L are the indices
 *           of seq symbols in Alphabet[]. 0,L+1 are sentinel
 *           bytes, set to be Alphabet_iupac -- i.e. one more
 *           than the maximum allowed index.  
 *           
 *           Assumes that 'X', the fully degenerate character,
 *           is the last character in the allowed alphabet.
 *           
 * Args:     seq - sequence to be digitized (0..L-1)
 *           L   - length of sequence      
 *           
 * Return:   digitized sequence, dsq.
 *           dsq is allocated here and must be free'd by caller.
 */
char *
DigitizeSequence(char *seq, int L)
{
  char *dsq;
  int i;

  dsq = MallocOrDie (sizeof(char) * (L+2));
  dsq[0] = dsq[L+1] = (char) Alphabet_iupac;
  for (i = 1; i <= L; i++) 
    dsq[i] = SymbolIndex(seq[i-1]);
  return dsq;
}


/* Function: DedigitizeSequence()
 * Date:     SRE, Tue Dec 16 10:39:19 1997 [StL]
 * 
 * Purpose:  Returns a 0..L-1 character string, converting the
 *           dsq back to the real alphabet.
 */
char *
DedigitizeSequence(char *dsq, int L)
{
  char *seq;
  int i;

  seq = MallocOrDie(sizeof(char) * (L+1));
  for (i = 0; i < L; i++)
    seq[i] = Alphabet[(int) dsq[i+1]];
  seq[L] = '\0';
  return seq;
}


/* Function: DigitizeAlignment() 
 * 
 * Purpose:  Given an alignment, return digitized unaligned
 *           sequence array. (Tracebacks are always relative
 *           to digitized unaligned seqs, even if they are
 *           faked from an existing alignment in modelmakers.c.)
 *           
 * Args:     aseqs    - alignment to digitize
 *           ainfo    - optional info on alignment                  
 *           ret_dsqs - RETURN: array of digitized unaligned sequences
 *           
 * Return:   (void)
 *           dsqs is alloced here. Free2DArray(dseqs, nseq).
 */ 
void
DigitizeAlignment(char **aseqs, AINFO *ainfo, char ***ret_dsqs)
{
  char **dsq;
  int    idx;			/* counter for sequences     */
  int    dpos;			/* position in digitized seq */
  int    apos;			/* position in aligned seq   */

  dsq = (char **) MallocOrDie (sizeof(char *) * ainfo->nseq);
  for (idx = 0; idx < ainfo->nseq; idx++) {
    dsq[idx] = (char *) MallocOrDie (sizeof(char) * (ainfo->alen+2));

    dsq[idx][0] = (char) Alphabet_iupac; /* sentinel byte at start */

    for (apos = 0, dpos = 1; apos < ainfo->alen; apos++) {
      if (! isgap(aseqs[idx][apos]))  /* skip gaps */
	dsq[idx][dpos++] = SymbolIndex(aseqs[idx][apos]);
    }
    dsq[idx][dpos] = (char) Alphabet_iupac; /* sentinel byte at end */

    if (! ainfo->sqinfo[idx].flags & SQINFO_LEN) {
      ainfo->sqinfo[idx].len = dpos - 1;
      ainfo->sqinfo[idx].flags |= SQINFO_LEN;
    }
  }
  *ret_dsqs = dsq;
}


/* Function: P7CountSymbol()
 * 
 * Purpose:  Given a possibly degenerate symbol code, increment
 *           a symbol counter array (generally an emission
 *           probability vector in counts form) appropriately.
 *           
 * Args:     counters:  vector to count into. [0..Alphabet_size-1]
 *           symidx:    symbol index to count: [0..Alphabet_iupac-1]
 *           wt:        weight to use for the count; often 1.0
 *           
 * Return:   (void)                
 */
void
P7CountSymbol(float *counters, char symidx, float wt)
{
  int x;

  if (symidx < Alphabet_size) 
    counters[symidx] += wt;
  else
    for (x = 0; x < Alphabet_size; x++) {
      if (Degenerate[(int) symidx][x])
	counters[x] += wt / (float) DegenCount[(int) symidx];
    }
}


/* Function: DefaultGeneticCode()
 * 
 * Purpose:  Configure aacode, mapping triplets to amino acids.
 *           Triplet index: AAA = 0, AAC = 1, ... UUU = 63.
 *           AA index: alphabetical: A=0,C=1... Y=19
 *           Stop codon: -1. 
 *           Uses the stdcode1[] global translation table from SQUID.
 *           
 * Args:     aacode  - preallocated 0.63 array for genetic code
 *                     
 * Return:   (void)
 */
void
DefaultGeneticCode(int *aacode)
{
  int x;

  for (x = 0; x < 64; x++) {
    if (*(stdcode1[x]) == '*') aacode[x] = -1;
    else                       aacode[x] = SYMIDX(*(stdcode1[x]));
  }
}


/* Function: DefaultCodonBias()
 * 
 * Purpose:  Configure a codonbias table, mapping triplets to
 *           probability of using the triplet for the amino acid
 *           it represents: P(triplet | aa).
 *           The default is to assume codons are used equiprobably.
 *           
 * Args:     codebias:  0..63 array of P(triplet|aa), preallocated.
 * 
 * Return:   (void)
 */
void
DefaultCodonBias(float *codebias)
{
  codebias[0]  = 1./2.;	/* AAA Lys 2 */
  codebias[1]  = 1./2.;	/* AAC Asn 2 */
  codebias[2]  = 1./2.;	/* AAG Lys 2 */
  codebias[3]  = 1./2.;	/* AAU Asn 2 */
  codebias[4]  = 1./4.;	/* ACA Thr 4 */
  codebias[5]  = 1./4.;	/* ACC Thr 4 */
  codebias[6]  = 1./4.;	/* ACG Thr 4 */
  codebias[7]  = 1./4.;	/* ACU Thr 4 */
  codebias[8]  = 1./6.;	/* AGA Ser 6 */
  codebias[9]  = 1./6.;	/* AGC Arg 6 */
  codebias[10] = 1./6.;	/* AGG Ser 6 */
  codebias[11] = 1./6.;	/* AGU Arg 6 */
  codebias[12] = 1./3.;	/* AUA Ile 3 */
  codebias[13] = 1./3.;	/* AUC Ile 3 */
  codebias[14] = 1.;	/* AUG Met 1 */
  codebias[15] = 1./3.;	/* AUU Ile 3 */
  codebias[16] = 1./2.;	/* CAA Gln 2 */
  codebias[17] = 1./2.;	/* CAC His 2 */
  codebias[18] = 1./2.;	/* CAG Gln 2 */
  codebias[19] = 1./2.;	/* CAU His 2 */
  codebias[20] = 1./4.;	/* CCA Pro 4 */
  codebias[21] = 1./4.;	/* CCC Pro 4 */
  codebias[22] = 1./4.;	/* CCG Pro 4 */
  codebias[23] = 1./4.;	/* CCU Pro 4 */
  codebias[24] = 1./6.;	/* CGA Arg 6 */
  codebias[25] = 1./6.;	/* CGC Arg 6 */
  codebias[26] = 1./6.;	/* CGG Arg 6 */
  codebias[27] = 1./6.;	/* CGU Arg 6 */
  codebias[28] = 1./6.;	/* CUA Leu 6 */
  codebias[29] = 1./6.;	/* CUC Leu 6 */
  codebias[30] = 1./6.;	/* CUG Leu 6 */
  codebias[31] = 1./6.;	/* CUU Leu 6 */
  codebias[32] = 1./2.;	/* GAA Glu 2 */
  codebias[33] = 1./2.;	/* GAC Asp 2 */
  codebias[34] = 1./2.;	/* GAG Glu 2 */
  codebias[35] = 1./2.;	/* GAU Asp 2 */
  codebias[36] = 1./4.;	/* GCA Ala 4 */
  codebias[37] = 1./4.;	/* GCC Ala 4 */
  codebias[38] = 1./4.;	/* GCG Ala 4 */
  codebias[39] = 1./4.;	/* GCU Ala 4 */
  codebias[40] = 1./4.;	/* GGA Gly 4 */
  codebias[41] = 1./4.;	/* GGC Gly 4 */
  codebias[42] = 1./4.;	/* GGG Gly 4 */
  codebias[43] = 1./4.;	/* GGU Gly 4 */
  codebias[44] = 1./4.;	/* GUA Val 4 */
  codebias[45] = 1./4.;	/* GUC Val 4 */
  codebias[46] = 1./4.;	/* GUG Val 4 */
  codebias[47] = 1./4.;	/* GUU Val 4 */
  codebias[48] = 0.;	/* UAA och - */
  codebias[49] = 1./2.;	/* UAC Tyr 2 */
  codebias[50] = 0.;	/* UAG amb - */
  codebias[51] = 1./2.;	/* UAU Tyr 2 */
  codebias[52] = 1./6.;	/* UCA Ser 6 */
  codebias[53] = 1./6.;	/* UCC Ser 6 */
  codebias[54] = 1./6.;	/* UCG Ser 6 */
  codebias[55] = 1./6.;	/* UCU Ser 6 */
  codebias[56] = 0.;	/* UGA opa - */
  codebias[57] = 1./2.;	/* UGC Cys 2 */
  codebias[58] = 1.;	/* UGG Trp 1 */
  codebias[59] = 1./2.;	/* UGU Cys 2 */
  codebias[60] = 1./6.;	/* UUA Leu 6 */
  codebias[61] = 1./2.;	/* UUC Phe 2 */
  codebias[62] = 1./6.; /* UUG Leu 6 */
  codebias[63] = 1./2.;	/* UUU Phe 2 */
}



/* Function: set_degenerate()
 * 
 * Purpose:  convenience function for setting up 
 *           Degenerate[][] global for the alphabet.
 */
static void 
set_degenerate(char iupac, char *syms)
{
  DegenCount[strchr(Alphabet,iupac)-Alphabet] = strlen(syms);
  while (*syms) {
    Degenerate[strchr(Alphabet,iupac)-Alphabet]
              [strchr(Alphabet,*syms)-Alphabet] = 1;
    syms++;
  }
}