File: phylip.c

package info (click to toggle)
biosquid 1.9g%2Bcvs20050121-11
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,616 kB
  • sloc: ansic: 12,750; sh: 1,412; perl: 243; makefile: 230
file content (192 lines) | stat: -rw-r--r-- 5,085 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/*****************************************************************
 * @LICENSE@
 *****************************************************************/

/* phylip.c
 * SRE, Mon Jun 14 14:08:33 1999 [St. Louis]
 * 
 * Import/export of PHYLIP interleaved multiple sequence alignment
 * format files.
 * 
 * Follows documentation at:
 * http://evolution.genetics.washington.edu/phylip/doc/sequence.html
 * for rev 3.6 of PHYLIP's molecular sequence analysis programs.
 * 
 * CVS $Id: phylip.c,v 1.4 2004/07/29 00:23:27 eddy Exp $
 */

#include "squidconf.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "squid.h"
#include "msa.h"

#ifdef TESTDRIVE_PHYLIP
/*****************************************************************
 * phylip.c test driver:
 * 
 */
int 
main(int argc, char **argv)
{
  MSAFILE *afp;
  MSA     *msa;
  char    *file;
  
  file = argv[1];

  if ((afp = MSAFileOpen(file, MSAFILE_UNKNOWN, NULL)) == NULL)
    Die("Couldn't open %s\n", file);

  printf("format %d\n", afp->format);

  while ((msa = ReadPhylip(afp)) != NULL)
    {
      WritePhylip(stdout, msa);
      MSAFree(msa); 
    }
  
  MSAFileClose(afp);
  exit(0);
}
/******************************************************************/
#endif /* testdrive_phylip */



/* Function: ReadPhylip()
 * Date:     SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre]
 *
 * Purpose:  Parse an alignment from an open Phylip format
 *           alignment file. Phylip is a single-alignment format.
 *           Return the alignment, or NULL if we have no data.
 *
 * Args:     afp - open alignment file
 *
 * Returns:  MSA * - an alignment object
 *                   Caller responsible for an MSAFree()
 *           NULL if no more alignments        
 */
MSA *
ReadPhylip(MSAFILE *afp)
{
  MSA  *msa;
  char *s, *s1, *s2;
  char  name[11];		/* seq name max len = 10 char */
  int   nseq, alen;
  int   idx;			/* index of current sequence */
  int   nblock;
  int   i,j;
  
  if (feof(afp->f)) return NULL;

  /* Skip until we see a nonblank line; it's the header,
   * containing nseq/alen
   */
  nseq = 0; alen = 0;
  while ((s = MSAFileGetLine(afp)) != NULL)
    {
      if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue;
      if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
	Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname);
      if (! IsInt(s1) || ! IsInt(s2))
	Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname);
      nseq = atoi(s1);
      alen = atoi(s2);
      break;
    }

  msa = MSAAlloc(nseq, 0);
  idx    = 0;
  nblock = 0;
  while ((s = MSAFileGetLine(afp)) != NULL) 
    {
      /* ignore blank lines. nonblank lines start w/ nonblank char */
      if (isspace((int) *s)) continue;
				/* First block has seq names */
      if (nblock == 0) {
	strncpy(name, s, 10);	/* First ten chars are the name */
	name[10] = '\0';
	GKIStoreKey(msa->index, name);
	msa->sqname[idx] = sre_strdup(name, -1);
	s += 10;		
      }
      
      /* Run through the buffer and strip any chars we don't recognize as
       * legal PHYLIP: especially, spaces.
       */
      for (i = 0, j=0; s[i] != '\0'; i++)
	{
	  if (isalpha(s[i]) || strchr("-?*", s[i]) != NULL) { s[j] = s[i]; j++; }
	}
      s[j] = '\0';

      /* Concat the buffer onto our growing sequence.
       */
      msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s, j);

      idx++;
      if (idx == nseq) { idx = 0; nblock++; }
    }
  msa->nseq = nseq;
  MSAVerifyParse(msa);		/* verifies; sets alen, wgt; frees sqlen[] */
  if (msa->alen != alen) 
    Die("First line said alignment would be alen=%d; read %d\n", alen, msa->alen);
  return msa;
}



/* Function: WritePhylip()
 * Date:     SRE, Fri Jun 18 12:07:41 1999 [Sanger Centre]
 *
 * Purpose:  Write an alignment in Phylip format to an open file.
 *
 * Args:     fp    - file that's open for writing.
 *           msa   - alignment to write. 
 *
 * Returns:  (void)
 */
void
WritePhylip(FILE *fp, MSA *msa)
{
  int    idx;			/* counter for sequences         */
  int    cpl = 50;		/* 50 seq char per line          */
  char   buf[51];		/* buffer for writing seq        */
  int    pos;
  int    i;

  /* First line has nseq, alen
   */
  fprintf(fp, " %d  %d\n", msa->nseq, msa->alen);

  /* Alignment section.
   * PHYLIP is a multiblock format, blocks (optionally) separated
   * by blanks; names only attached to first block. Names are
   * restricted to ten char; we achieve this by simple truncation (!).
   * Gaps must be '-'. Seq chars should be upper case.
   */
  for (pos = 0; pos < msa->alen; pos += cpl)
    {
      if (pos > 0) fprintf(fp, "\n");

      for (idx = 0; idx < msa->nseq; idx++)
	{
	  strncpy(buf, msa->aseq[idx] + pos, cpl);
	  buf[cpl] = '\0';	      

	  for (i = 0; buf[i] != '\0'; i++)
	    {
	      if (islower(buf[i])) buf[i] = toupper(buf[i]);
	      if (isgap(buf[i]))   buf[i] = '-';
	    }

	  if (pos > 0) fprintf(fp, "%s\n", buf);
	  else         fprintf(fp, "%-10.10s%s\n", msa->sqname[idx], buf);
	}
    }
  return;
}