File: esl_paml.c

package info (click to toggle)
hmmer 3.2.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 23,380 kB
  • sloc: ansic: 119,305; perl: 8,791; sh: 3,266; makefile: 1,871; python: 598
file content (174 lines) | stat: -rw-r--r-- 5,694 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
/* PAML interface.
 * 
 *   Ziheng Yang, "Phylogenetic Analysis by Maximum Likelihood"  [Yang97]
 *   http://abacus.gene.ucl.ac.uk/software/paml.html
 */
#include "esl_config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "easel.h"
#include "esl_alphabet.h"
#include "esl_vectorops.h"
#include "esl_dmatrix.h"
#include "esl_fileparser.h"
#include "esl_paml.h"

/* Function:  esl_paml_ReadE()
 * Incept:    SRE, Fri Jul  9 09:27:24 2004 [St. Louis]
 *
 * Purpose:   Read an amino acid rate matrix in PAML format from stream
 *            <fp>. Return it in two pieces: the symmetric E
 *            exchangeability matrix in <E>, and the stationary
 *            probability vector $\pi$ in <pi>.
 *            Caller provides the memory for both <E> and <pi>.  <E>
 *            is a $20 \times 20$ matrix allocated as
 *            <esl_dmatrix_Create(20, 20)>. <pi> is an array with
 *            space for at least 20 doubles.
 *            
 *            The <E> matrix is symmetric for off-diagonal elements:
 *            $E_{ij} = E_{ij}$ for $i \neq j$.  The on-diagonal
 *            elements $E_{ii}$ are not valid and should not be
 *            accessed.  (They are set to zero.)

 *            The rate matrix will later be obtained from <E>
 *            and <pi> as 
 *                $Q_{ij} = E_{ij} \pi_j$ for $i \neq j$ 
 *            and
 *                $Q_{ii} = -\sum_{j \neq i} Q_{ij}$ 
 *            then scaled to units of one
 *            substitution/site; see <esl_ratemx_E2Q()> and
 *            <esl_ratemx_ScaleTo()>.
 *
 *            Data file format: First 190 numbers are a
 *            lower-triangular matrix E of amino acid
 *            exchangeabilities $E_{ij}$. Next 20 numbers are the
 *            amino acid frequencies $\pi_i$. Remainder of the
 *            datafile is ignored.
 *            
 *            The alphabet order in the matrix and the frequency
 *            vector is assumed to be "ARNDCQEGHILKMFPSTWYV"
 *            (alphabetical by three-letter code), which appears to be
 *            PAML's default order. This is transformed to Easel's
 *            "ACDEFGHIKLMNPQRSTVWY" (alphabetical by one-letter code)
 *            in the $E_{ij}$ and $\pi_i$ that are returned.
 *            
 * Args:      fp   - open datafile for reading.
 *            E    - RETURN: E matrix of amino acid exchangeabilities e_ij,
 *                     symmetric (E_ij = E_ji),
 *                     in Easel amino acid alphabet order A..Y.
 *                     Caller provides appropriately allocated space.
 *            pi   - RETURN: \pi_i vector of amino acid frequencies,
 *                    in Easel amino acid alphabet order A..Y.
 *                    Caller provides appropriately allocated space.
 *
 * Returns:   <eslOK> on success.
 *            Returns <eslEOF> on premature end of file (parse failed), in which
 *            case the contents of <E> and <pi> are undefined.
 *            
 * Throws:    <eslEMEM> on internal allocation failure,
 *            and the contents of <E> and <pi> are undefined.
 *
 * Xref:      STL8/p.56.
 */
int
esl_paml_ReadE(FILE *fp, ESL_DMATRIX *E, double *pi)
{
  int             status;
  ESL_FILEPARSER *efp = NULL;
  char           *tok;
  int             i,j;
  char           *pamlorder = "ARNDCQEGHILKMFPSTWYV";
  char           *eslorder  = "ACDEFGHIKLMNPQRSTVWY";
  int             perm[20];

  if ((status =  esl_dmatrix_SetZero(E))                 != eslOK) goto ERROR;
  esl_vec_DSet(pi, 20, 0.);

  if ((efp =    esl_fileparser_Create(fp))               == NULL)  goto ERROR;
  if ((status = esl_fileparser_SetCommentChar(efp, '#')) != eslOK) goto ERROR;

  /* Construct the alphabet permutation we need.
   * perm[i] -> original row/column i goes to row/column perm[i]
   */
   for (i = 0; i < 20; i++)
     perm[i] = (int) (strchr(eslorder, pamlorder[i]) - eslorder);

   /* Read the s_ij matrix data in, permuting as we go. */

   for (i = 1; i < 20; i++)
    for (j = 0; j < i; j++)
      {
	if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
	E->mx[perm[i]][perm[j]] = atof(tok);
	E->mx[perm[j]][perm[i]] = E->mx[perm[i]][perm[j]];
      }

   /* Read the pi_i vector in, permuting as we read. */
  for (i = 0; i < 20; i++)
    {
      if ((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslOK) goto ERROR;
      pi[perm[i]] = atof(tok);
    }

  esl_fileparser_Destroy(efp);
  return eslOK;

 ERROR:
  if (efp != NULL) esl_fileparser_Destroy(efp);
  return status;
}


/*****************************************************************
 * Utility: reformat a PAML file to a static vector
 *****************************************************************/
#ifdef eslPAML_UTILITY1

/* gcc -g -Wall -o utility -I. -L. -DeslPAML_UTILITY1 esl_paml.c -leasel -lm
 */
#include "easel.h"
#include "esl_dmatrix.h"
#include "esl_paml.h"

int 
main(int argc, char **argv)
{
  char        *filename = argv[1];
  FILE        *fp       = NULL;
  ESL_DMATRIX *E        = NULL;
  double      *pi       = NULL;
  int          i,j,n;

  E = esl_dmatrix_Create(20, 20);
  pi = malloc(20 * sizeof(double));
  if ((fp = fopen(filename, "r")) == NULL) esl_fatal("open failed");
  if (esl_paml_ReadE(fp, E, pi) != eslOK)  esl_fatal("parse failed");

  n = 1;
  for (i = 1; i < 20; i++)
    for (j = 0; j < i; j++)
      {
	printf("%8.6f, ", E->mx[i][j]);
	if (n++ == 10) { puts(""); n=1; }
      }
  
  puts("");

  n = 1;
  for (i = 0; i < 20; i++)
    {
      printf("%8.6f, ", pi[i]);
      if (n++ == 10) { puts(""); n=1; }
    }
  
  fclose(fp);
  free(pi);
  esl_dmatrix_Destroy(E);
  return 0;
}

#endif /*eslPAML_UTILITY1*/