File: multiruns.c

package info (click to toggle)
paml 4.8%2Bdfsg-1
  • links: PTS, VCS
  • area: non-free
  • in suites: jessie, jessie-kfreebsd, stretch
  • size: 10,164 kB
  • ctags: 2,300
  • sloc: ansic: 25,770; xml: 4,486; makefile: 42
file content (159 lines) | stat: -rwxr-xr-x 5,451 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/* multiruns.c

     cl -O2 -Ot -W4 multiruns.c
     cc -o multiruns -O2 multiruns.c -lm

     multiruns <rstfile1> <rstfile2> ... <lnLColumn> 

  Examples (comparing three runs with lnL in column 19 in rst1):

     multiruns rst1.r1 rst1.r2 rst1.r3 19
     multiruns a/rst1 b/rst1 c/rst1 19

   March 2003, Ziheng Yang
   September 2005, changed tworuns into multiruns, ziheng yang

   This program compares outputs from multiple separate ML runs analyzing
   many data sets (using ndata) to assemble a result file.  Because of local 
   peaks and convergence problems, multiple runs for the same analysis may not 
   generate the same results.  Then we should use the results corresponding to 
   the highest lnL.  This program takes input files which have summary results 
   from multiple runs, one line for each data set.  The program takes one line 
   from each of the input files and compare the first field, which is an index 
   column and should be identical between the input files, and an lnL column.  
   The program decides which run generated the highest lnL, and copy the line 
   from that run into the output file: out.txt.

   This is useful when you analyze the same set of simulated replicate data 
   sets multiple times, using different starting values.  For example, codeml 
   may write a line of output in rst1 for each data set, including parameter 
   estimates and lnL.  You can then use this program to compare the rst1 output 
   files from multiple runs to generate one output file.  The program allows the 
   fields to be either numerical or text, but the first (index) and lnL columns
   should be numerical.

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAXNFIILES  20
#define MAXNFIELDS  1000
#define MAXLLINE    64000

void error2 (char * message);
FILE *gfopen(char *filename, char *mode);
int splitline (char line[], int fields[]);


int main(int argc, char* argv[])
{
   FILE *fout, *fin[MAXNFIILES];
   char infile[MAXNFIILES][96]={"rst1.r1", "rst1.r2"}, outfile[96]="out.txt";
   int index=0, nfile, nfileread, lnLcolumn=13, i, nrecords=0, lline=MAXLLINE;
   int nfields[MAXNFIILES],fields[MAXNFIELDS], minf, maxf, miss[MAXNFIILES];
   char *line[MAXNFIILES];
   double lnL[MAXNFIILES], lnLmin, lnLmax, indexfield[MAXNFIILES], y;

   puts("Usage: \n\tmultiruns <file1> <file2> ... <lnLcolumn>\n");
   nfile=argc-2;
   if(nfile<2) exit(1);

   for(i=0; i<nfile; i++) { 
      strcpy(infile[i], argv[1+i]);
      fin[i]=gfopen(infile[i],"r");
      if((line[i]=(char*)malloc(MAXLLINE*sizeof(char)))==NULL) error2("oom");
      printf("%s  ", infile[i]);
   }
   printf("  ==>  %s\n\n", outfile);

   sscanf(argv[argc-1], "%d", &lnLcolumn);
   lnLcolumn--;
   fout=(FILE*)gfopen(outfile,"w");

   for(nrecords=0; ; nrecords++) {
      for(i=0,nfileread=0; i<nfile; i++) { 
         nfields[i]=0; lnL[i]=0; line[i][0]='\0';  miss[i]=1;
         if(!fgets(line[i], lline, fin[i])) continue;
         nfields[i]=splitline(line[i],fields);

         if(nfields[i]>index) sscanf(line[i]+fields[index], "%lf", &indexfield[i]);
         if(nfields[i]>lnLcolumn) {
            sscanf(line[i]+fields[lnLcolumn], "%lf", &lnL[i]);
            miss[i]=0;
            nfileread++;
         }
      }
      if(nfileread==0) break;
      for(i=0,y=-1; i<nfile; i++) {
         if(!miss[i]) {
           if(y==-1) y=indexfield[i];
           else if(y!=indexfield[i]) error2("index field different");
         }
      }
      for(i=0,lnLmin=1e300,lnLmax=-1e300; i<nfile; i++) {
         if(miss[i]) continue;
         if(lnL[i]<lnLmin) { lnLmin=lnL[i];  minf=i; }
         if(lnL[i]>lnLmax) { lnLmax=lnL[i];  maxf=i; }
      }

      fprintf(fout, "%s", line[maxf]);

      printf("record %4d  (", nrecords+1);
      for(i=0; i<nfile; i++)  printf("%c", (miss[i]?'-':'+')); 
      printf(") %10.3f (%d) - %10.3f (%d) = %8.3f\n", 
         lnLmin, minf+1, lnLmax, maxf+1, lnLmin-lnLmax);
   }
   printf("\nwrote %d records into %s\n", nrecords, outfile);
   for(i=0; i<nfile; i++) { fclose(fin[i]);  free(line[i]); }
   fclose(fout);
}

void error2 (char * message)
{ printf("\nError: %s.\n", message); exit(-1); }

FILE *gfopen(char *filename, char *mode)
{
   FILE *fp;

   if(filename==NULL || filename[0]==0) 
      error2("file name empty.");

   fp=(FILE*)fopen(filename, mode);
   if(fp==NULL) {
      printf("\nerror when opening file %s\n", filename);
      if(!strchr(mode,'r')) exit(-1);
      printf("tell me the full path-name of the file? ");
      scanf("%s", filename);
      if((fp=(FILE*)fopen(filename, mode))!=NULL)  return(fp);
      puts("Can't find the file.  I give up.");
      exit(-1);
   }
   return(fp);
}

int splitline (char line[], int fields[])
{
/* This finds out how many fields there are in the line, and marks the starting 
   positions of the fields.
   Fields are separated by spaces, and texts are allowed as well.
*/
   int lline=64000, i, nfields=0, InSpace=1;
   char *p=line;

   for(i=0; i<lline && *p && *p!='\n'; i++,p++) {
      if (isspace(*p))
         InSpace=1;
      else  {
         if(InSpace) {
            InSpace=0;
            fields[nfields++]=i;
            /* if(nfields>MAXNFIELDS) puts("raise MAXNFIELDS?"); */
         }
      }
   }
   return(nfields);
}