1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
|
/*****************************************************************
* @LICENSE@
*****************************************************************/
/* dayhoff.c
*
* Routines for dealing with PAM matrices.
*
* Includes:
* ParsePAMFile() -- read a PAM matrix from disk.
*
*
* SRE - Fri Apr 2 11:23:45 1993
* CVS $Id: dayhoff.c,v 1.7 2003/05/26 16:21:50 eddy Exp $
*/
#include "squidconf.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <ctype.h>
#include "squid.h"
/* Function: ParsePAMFile()
*
* Purpose: Given a pointer to an open file containing a PAM matrix,
* parse the file and allocate and fill a 2D array of
* floats containing the matrix. The PAM file is
* assumed to be in the format that NCBI distributes
* with BLAST. BLOSUM matrices also work fine, as
* produced by Henikoff's program "MATBLAS".
*
* Parses both old format and new format BLAST matrices.
* Old format just had rows of integers.
* New format includes a leading character on each row.
*
* The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*.
* Note that it's not a 20x20 matrix as you might expect;
* this is for speed of indexing as well as the ability
* to deal with ambiguous characters.
*
* Args: fp - open PAM file
* ret_pam - RETURN: pam matrix, integers
* ret_scale - RETURN: scale factor for converting
* to real Sij. For instance, PAM120 is
* given in units of ln(2)/2. This may
* be passed as NULL if the caller
* doesn't care.
*
* Returns: 1 on success; 0 on failure and sets squid_errno to
* indicate the cause. ret_pam is allocated here and
* must be freed by the caller (use FreePAM).
*/
int
ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale)
{
int **pam;
char buffer[512]; /* input buffer from fp */
int order[27]; /* order of fields, obtained from header */
int nsymbols; /* total number of symbols in matrix */
char *sptr;
int idx;
int row, col;
float scale;
int gotscale = FALSE;
scale = 0.0; /* just to silence gcc uninit warnings */
if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; }
/* Look at the first non-blank, non-comment line in the file.
* It gives single-letter codes in the order the PAM matrix
* is arrayed in the file.
*/
do {
if (fgets(buffer, 512, fp) == NULL)
{ squid_errno = SQERR_NODATA; return 0; }
/* Get the scale factor from the header.
* For BLOSUM files, we assume the line looks like:
* BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
* and we assume that the fraction is always 1/x;
*
* For PAM files, we assume the line looks like:
* PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
* and we assume that the number following the final '=' is our scale
*/
if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL &&
(sptr = strchr(buffer, '/')) != NULL)
{
sptr++;
if (! isdigit((int) (*sptr))) { squid_errno = SQERR_FORMAT; return 0; }
scale = (float) ( (log(2.0)) / (atof(sptr)));
gotscale = TRUE;
}
else if (strstr(buffer, "substitution matrix,") != NULL)
{
while ((sptr = strrchr(buffer, '=')) != NULL) {
sptr += 2;
if (IsReal(sptr)) {
scale = atof(sptr);
gotscale = TRUE;
break;
}
}
}
} while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#');
idx = 0;
do {
order[idx] = (int) *sptr - (int) 'A';
if (order[idx] < 0 || order[idx] > 25) order[idx] = 26;
idx++;
} while ((sptr = strtok(NULL, " \t\n")) != NULL);
nsymbols = idx;
/* Allocate a pam matrix. For speed of indexing, we use
* a 27x27 matrix so we can do lookups using the ASCII codes
* of amino acid single-letter representations, plus one
* extra field to deal with the "*" (terminators).
*/
if ((pam = (int **) calloc (27, sizeof(int *))) == NULL)
Die("calloc failed");
for (idx = 0; idx < 27; idx++)
if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL)
Die("calloc failed");
/* Parse the rest of the file.
*/
for (row = 0; row < nsymbols; row++)
{
if (fgets(buffer, 512, fp) == NULL)
{ squid_errno = SQERR_NODATA; return 0; }
if ((sptr = strtok(buffer, " \t\n")) == NULL)
{ squid_errno = SQERR_NODATA; return 0; }
for (col = 0; col < nsymbols; col++)
{
if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; }
/* Watch out for new BLAST format, with leading characters
*/
if (*sptr == '*' || isalpha((int) *sptr))
col--; /* hack hack */
else
pam [order[row]] [order[col]] = atoi(sptr);
sptr = strtok(NULL, " \t\n");
}
}
/* Return
*/
if (ret_scale != NULL)
{
if (gotscale) *ret_scale = scale;
else
{
Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
*ret_scale = log(2.0) / 2.0;
}
}
*ret_pam = pam;
return 1;
}
|