File: rmark-rocplot.c

package info (click to toggle)
infernal 1.1.5-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 74,208 kB
  • sloc: ansic: 230,749; perl: 14,433; sh: 6,147; makefile: 3,071; python: 1,247
file content (735 lines) | stat: -rw-r--r-- 30,958 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
/* Summarizing results of a benchmark by plotting a ROC-like plot,
 * including confidence intervals derived by Bayesian bootstrapping.
 * 
 * Alternatively, with the --mer or --Ethresh options, ROC-like plots
 * are not output. With --mer, output mean MER (minimum error rate)
 * values for all bootstrap trials. MER is the minimal sum of false
 * negatives and false positives over all possible thresholds; a good
 * single summary statistic for benchmark performance. With --Ethresh
 * <x>, output mean number of true positives and false positives with
 * E-values <= <x> for all bootstrap trials.
 * 
 * 
 * The <.out file> from an rmark benchmark consists of lines:
 *     <E-value> <bitscore> <target_sequence> <query_model> <matching_model> <seq_idx_in_fam> <strand>
 *    
 * The <.out> file (or stream) must be sorted by E-value, with best
 * (lowest) E-values first.
 *   
 * Unlike profmark, target sequence names are not informative. 
 * 
 * Positive and negative hits are determined as follows:
 * - A hit is a positive if <matching_model> matches <query_model> and
 *   <strand> is "same", and there's no better scoring hits between
 *   <query_model> and <target_sequence>
 * - A hit is a negative if <matching_model> is "decoy" (strand will also
 *   be "decoy").
 * - A hit is ignored for three possible reasons: 
 *   : if <matching_model> is neither "decoy" nor <query_model> OR
 *   : if <matching_model> is <query_model> and <strand> is "opposite".
 *   : <matching_model> is <query_model> and <strand> is "same" but
 *      there is a better scoring hit between <query_model> and 
 *      <target_sequence>.
 * 
 * The program also needs to find the query and positive
 * tables for the benchmark that was run. It looks for these by
 * appending ".tbl" and ".pos" to the <rmark> argument;
 * that is:
 *   : <rmark>.tbl contains one line per query: 1st field is name of query
 *   : <rmark>.pos contains one line per positive: 1st field is the
 *     concatenation of the query family name a '/' and the sequence index
 *     in the family. For example: "tRNA/3" is the third tRNA in the 
 *     benchmark set. (Note these are not in order in the test sequences,
 *     tRNA/3 and tRNA/1 might be in the first and second negative sequences
 *     respectively.)
 *   
 * The program calculates a plot of fractional coverage of the positives
 * (on the Y-axis; range 0..1) versus errors per query (on the X-axis; 
 * ranging from 1/(# of models) to 10.0 by default). 
 * 
 * The default output is an XMGRACE xydydy file, with points
 * representing mean coverage in the bootstrap samples, and error bars
 * representing a 95% confidence interval.
 *
 * A typical command line, after having run a benchmark on "rmark" under
 * MPI with many output files:
 * 
 *    cat *.out | sort -g | ./rmark-rocplot rmark - > results.xy
 *    xmgrace -settype xydydy results.xy
 *   
 * SRE, Wed Jun 18 13:37:31 2008 [Janelia]
 */
#include <esl_config.h>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "easel.h"
#include "esl_dirichlet.h"
#include "esl_getopts.h"
#include "esl_keyhash.h"
#include "esl_random.h"
#include "esl_stats.h"
#include "esl_vectorops.h"


static char banner[] = "construct a ROC plot of rmark results, using Bayesian bootstrapping";
static char usage[]  = "[options] <rmark_basename> <.out file>\n";

static ESL_OPTIONS options[] = {
  /* name       type        default env   range togs  reqs  incomp      help                                          docgroup */
  { "-h",       eslARG_NONE,  FALSE, NULL, NULL, NULL,NULL, NULL, "help; show brief info on version and usage",           1 },
  { "-a",       eslARG_NONE,  FALSE, NULL, NULL, NULL,NULL, NULL, "all: plot all bootstrap samples individually",         1 },
  { "-n",       eslARG_NONE,  FALSE, NULL, NULL, NULL,NULL, NULL, "plot original data, without any bootstrapping",        1 },
  { "-s",       eslARG_NONE,  FALSE, NULL, NULL, NULL,NULL, NULL, "plot error bars by std. dev. not confidence interval", 1 },
  { "-N",       eslARG_INT,   "500", NULL, NULL, NULL,NULL, NULL, "number of bootstrap samples to take",                  1 },
  { "--min",    eslARG_REAL,   NULL, NULL, NULL, NULL,NULL, NULL, "set minimum x-axis value (FPs/query) to plot",         1 },
  { "--max",    eslARG_REAL,  "10.", NULL, NULL, NULL,NULL, NULL, "set maximum x-axis value (FPs/query) to plot",         1 },
  { "--steps",  eslARG_INT,    "10", NULL, NULL, NULL,NULL, NULL, "set number of steps to plot per 10x on x-axis",        1 },
  { "--seed",   eslARG_INT,   FALSE, NULL,"n>0", NULL,NULL, NULL, "set random number generator's seed to <n>",            1 },
  { "--nsd",    eslARG_REAL,   "3.", NULL,"x>0", NULL,"-s", NULL, "how many std.dev.'s big error bars should be",         1 },
  { "--interval", eslARG_REAL,"0.95",NULL,"0<=x<=1",NULL,NULL,"-s", "confidence interval width for error bars",           1 },
  { "--mer",    eslARG_NONE,  FALSE, NULL, NULL, NULL,NULL, NULL, "don't construct a plot, summarize MER instead",        1 },
  { "--Ethresh",eslARG_REAL,  "1.",  NULL,"x>0", NULL, NULL,"--mer", "don't construct a plot, print avg num FNs/FPs w/E < <x>", 1 },
  { 0,0,0,0,0,0,0,0,0,0 },
};


/* The FP/query x-axis is discretized into <nxpts> points, evenly spaced on a log scale.
 *
 * To calculate FP/query threshold for a point i: 10^{ (base+i) / nsteps }
 * To convert a FP/query value x to a bin i:      ceil(log10(x) * nsteps)
 */
struct oneplot_s {
  double *tp;			/* yaxis values, [0..nxpts-1]; # of FNs <= given FP/query on x-axis */
  int     base;                 /* scaled integer offset of bin #0 in tp */
  int     nsteps;		/* resolution of logarithmic x-axis: # of evenly spaced points per 10x */
  int     nxpts;		/* total # of points on axis */
  double  totalpos;		/* total # of positives possible in this bootstrap sample */
  double  mer;                  /* MER: minimum error rate, sum of FP and FN over choice of all possible thresholds */
  double  mer_thr;              /* MER threshold */
  double  mer_fn;               /* number of false negatives at MER threshold */
  double  mer_fp;               /* number of false positives at MER threshold */
  double  Ethresh;              /* E-value threshold for Ethresh_fn and Ethresh_fp */
  double  Ethresh_fn;           /* number of true  positives with E-values <= Ethresh */
  double  Ethresh_fp;           /* number of false positives with E-values <= Ethresh */
};

struct result_s {
  double E;			/* E-value */
  int    qidx;			/* index of query  */
  int    tidx; 			/* index of matching seq: 0..npos-1 for positives */
  int    class;			/* +1 = positive; -1 = negative; 0 = ignore   */
};



static int    parse_tblfile(char *tblfile, ESL_KEYHASH *kh);
static int    parse_results_rmark(char *resfile, int **pni, ESL_KEYHASH *modelkh, ESL_KEYHASH *poskh, struct result_s **ret_r, int *ret_nr, int *ret_nneg);
static int    classify_pair_by_names_and_strand(const char *query, const char *target, const char *strand);
static double weighted_total_positives(int **pni, double *queryp, int nq, double *seqp, int npos, int nseq);
static struct oneplot_s *create_plot(ESL_GETOPTS *go, int nq);
static void   destroy_plot(struct oneplot_s *plot);
static void   make_plot(struct result_s *rp, int nr, int **pni, double *queryp, int nq, double *seqp, int nseq, int npos, 
			struct oneplot_s *plot);
static void   write_plot    (FILE *fp, struct oneplot_s *plot);
static void   write_mer     (FILE *fp, struct oneplot_s *plot);
static void   write_Ethresh (FILE *fp, struct oneplot_s *plot);
static void   summary_graph(ESL_GETOPTS *go, FILE *fp, struct oneplot_s *plot, double **yv);
static void   summary_mer  (ESL_GETOPTS *go, FILE *fp, struct oneplot_s *plot, double *mervec, double *merthrvec, double *merfpvec, double *merfnvec);
static void   summary_Ethresh(ESL_GETOPTS *go, FILE *fp, struct oneplot_s *plot, double *Ethresh_fn_vec, double *Ethresh_fp_vec);

static void
cmdline_failure(char *argv0, char *format, ...)
{
  va_list argp;
  va_start(argp, format);
  vfprintf(stderr, format, argp);
  va_end(argp);
  esl_usage(stdout, argv0, usage);
  printf("\nTo see more help on available options, do %s -h\n\n", argv0);
  exit(1);
}

static void
cmdline_help(char *argv0, ESL_GETOPTS *go) 
{
  esl_banner(stdout, argv0, banner);
  esl_usage (stdout, argv0, usage);
  puts("\n where options are:");
  esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
  exit(0);
}

int 
main(int argc, char **argv)
{
  ESL_GETOPTS  *go        = NULL;
  ESL_KEYHASH  *qkh       = esl_keyhash_Create();
  ESL_KEYHASH  *poskh     = esl_keyhash_Create();
  ESL_RANDOMNESS *r       = NULL;
  char         *pmarkbase = NULL;
  char         *posfile   = NULL;
  char         *modelfile = NULL;
  char         *resfile   = NULL;
  struct oneplot_s *plot  = NULL;
  struct result_s  *rp    = NULL;
  int             **pni   = NULL;
  double          **yv    = NULL;	/* yv[0..nxpts-1][0..nboots-1]: vector of bootstrapped samples at each xaxis point */
  double          *mervec    = NULL;	/* [0..nboots-1]: MER of bootstrapped samples */
  double          *merthrvec = NULL;	/* [0..nboots-1]: MER threshold of bootstrapped samples */
  double          *merfpvec  = NULL;	/* [0..nboots-1]: MER false positives of bootstrapped samples */
  double          *merfnvec  = NULL;	/* [0..nboots-1]: MER false negatives of bootstrapped samples */
  double          *Ethresh_fn_vec = NULL;	/* [0..nboots-1]: num FNs <= Ethresh of bootstrapped samples */
  double          *Ethresh_fp_vec = NULL;	/* [0..nboots-1]: num FPs <= Ethresh of bootstrapped samples */
  int           nq, npos, nneg, nseq;
  int           nresults  = 0;
  int           nboots;
  double       *queryp;
  double       *seqp;
  int           i,j;
  int           xi;

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration:   %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h"))                    cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go)                  != 2)     cmdline_failure(argv[0], "Incorrect number of command line arguments\n");
  pmarkbase = esl_opt_GetArg(go, 1); 
  resfile   = esl_opt_GetArg(go, 2);

  nboots    = esl_opt_GetInteger(go, "-N");

  /* Set up the RNG */
  if (esl_opt_IsDefault(go, "--seed")) r = esl_randomness_CreateTimeseeded();
  else                                 r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));

  /* Read the queries and positives into hash tables, and count them. */
  esl_FileNewSuffix(pmarkbase, "tbl", &modelfile);  parse_tblfile(modelfile, qkh);   free(modelfile);
  esl_FileNewSuffix(pmarkbase, "pos", &posfile);    parse_tblfile(posfile,   poskh); free(posfile);  
  nq   = esl_keyhash_GetNumber(qkh);
  npos = esl_keyhash_GetNumber(poskh);

  /* Create a [0..nq-1]x[0..pos-1] matrix preclassifying each pair as +1 (positive), -1 (negative), or 0 (ignore) */
  if ((pni = malloc(sizeof(int *) * nq)) == NULL) esl_fatal("malloc failed");
  for (i = 0; i < nq; i++) 
    {
      if ((pni[i] = malloc(sizeof(int) * npos)) == NULL) esl_fatal("malloc failed");
      for (j = 0; j < npos; j++)
	pni[i][j] = classify_pair_by_names_and_strand(esl_keyhash_Get(qkh, i), esl_keyhash_Get(poskh, j), "same");
    }  

  /* Read and code the .pout file; assigning positives, negatives to the results */
  parse_results_rmark(resfile, pni, qkh, poskh, &rp, &nresults, &nneg);
  nseq = npos+nneg;

  /* Allocate for the bootstrap weights on queries, seqs */
  if ((queryp = malloc(sizeof(double) * nq))    == NULL) esl_fatal("malloc failed");
  if ((seqp   = malloc(sizeof(double) * nseq))  == NULL) esl_fatal("malloc failed");

  /* In seqp, 0..npos-1 are the positives; npos..nseq-1 are the negatives.
   */

  /* Figure out the coordinate system for the plot's xaxis; then
   * allocate for a single plot sample in <plot>, as well as for
   * storing all the bootstrap results in <yv>. The <plot> 
   * holds the information about the x coordinate system.
   */
  plot = create_plot(go, nq);

  if ((yv       = malloc(sizeof(double *) * plot->nxpts)) == NULL) esl_fatal("malloc failed");
  for (xi = 0; xi < plot->nxpts; xi++)
    if ((yv[xi]   = malloc(sizeof(double *) * nboots)) == NULL) esl_fatal("malloc failed");
  if ((mervec     = malloc(sizeof(double) * nboots)) == NULL) esl_fatal("malloc failed");
  if ((merthrvec  = malloc(sizeof(double) * nboots)) == NULL) esl_fatal("malloc failed");
  if ((merfpvec   = malloc(sizeof(double) * nboots)) == NULL) esl_fatal("malloc failed");
  if ((merfnvec   = malloc(sizeof(double) * nboots)) == NULL) esl_fatal("malloc failed");
  if ((Ethresh_fn_vec = malloc(sizeof(double) * nboots)) == NULL) esl_fatal("malloc failed");
  if ((Ethresh_fp_vec = malloc(sizeof(double) * nboots)) == NULL) esl_fatal("malloc failed");

  /* "Bayesian" bootstraps:  */
  if (! esl_opt_GetBoolean(go, "-n"))
    {
      for (i = 0; i < nboots; i++)
	{
	  esl_dirichlet_DSampleUniform(r, nq,    queryp);
	  esl_dirichlet_DSampleUniform(r, nseq,  seqp);

	  make_plot(rp, nresults, pni, queryp, nq, seqp, nseq, npos, plot);
      
	  /* Plot or store this bootstrap sample. */      
	  if (esl_opt_GetBoolean(go, "-a")) { 
	    if      (esl_opt_GetBoolean(go, "--mer"))     { write_mer     (stdout, plot); }
	    else if (esl_opt_IsUsed    (go, "--Ethresh")) { write_Ethresh (stdout, plot); }
	    else                                          { write_plot    (stdout, plot); }
	  }
	  else
	    {
	      for (xi = 0; xi < plot->nxpts; xi++) 
		yv[xi][i] = plot->tp[xi] / plot->totalpos;
	      mervec[i]    = plot->mer;
	      merthrvec[i] = plot->mer_thr;
	      merfpvec[i]  = plot->mer_fp;
	      merfnvec[i]  = plot->mer_fn;
	      Ethresh_fn_vec[i] = plot->Ethresh_fn;
	      Ethresh_fp_vec[i] = plot->Ethresh_fp;
	    }
	}
    }
  else /* just plot the original data with no bootstraps */
    {
      make_plot(rp, nresults, pni, NULL, nq, NULL, nseq, npos, plot);
      if      (esl_opt_GetBoolean (go, "--mer")){ write_mer (stdout, plot); }
      else if (esl_opt_IsUsed(go, "--Ethresh")) { write_Ethresh (stdout, plot); }
      else                                      { write_plot(stdout, plot); }
    }
      
      
  /* Summarize the bootstraps */
  if (! esl_opt_GetBoolean(go, "-a") && ! esl_opt_GetBoolean(go, "-n") ) { 
    if(esl_opt_GetBoolean(go, "--mer"))      { summary_mer    (go, stdout, plot, mervec, merthrvec, merfpvec, merfnvec); }
    else if(esl_opt_IsUsed(go, "--Ethresh")) { summary_Ethresh(go, stdout, plot, Ethresh_fn_vec, Ethresh_fp_vec); }
    else                                     { summary_graph  (go, stdout, plot, yv); }
  }

  for (i = 0; i < nq; i++) free(pni[i]);
  free(pni);
  for (xi = 0; xi < plot->nxpts; xi++) free(yv[xi]);
  free(yv);
  free(mervec);
  free(merthrvec);
  free(merfpvec);
  free(merfnvec);
  free(Ethresh_fn_vec);
  free(Ethresh_fp_vec);
  destroy_plot(plot);
  free(queryp);
  free(seqp);
  free(rp);
  esl_keyhash_Destroy(poskh);
  esl_keyhash_Destroy(qkh);
  esl_randomness_Destroy(r);
  esl_getopts_Destroy(go);
  return 0;
}



static int
parse_tblfile(char *tblfile, ESL_KEYHASH *kh)
{
  ESL_FILEPARSER *efp = NULL;
  char           *tok = NULL;
  int             toklen;

  if (esl_fileparser_Open(tblfile, NULL, &efp) != eslOK) esl_fatal("failed to open pmark table %s", tblfile);
  esl_fileparser_SetCommentChar(efp, '#');
  
  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, tblfile);
      if (esl_keyhash_Store(kh, tok, toklen, NULL)          != eslOK) esl_fatal("failed to add %s to seq index", tok);
    }      
  esl_fileparser_Close(efp);
  return eslOK;
}


static int
classify_pair_by_names_and_strand(const char *query, const char *target, const char *strand)
{
  int qlen = strlen(query);
  int tlen = strlen(target);
  
  if   (tlen > qlen && strncmp(query, target, qlen) == 0 && target[qlen] == '/' && strncmp(strand, "same", 4) == 0) 
    return 1;   /* this tests for <model> == <query_model> */
  else if (strncmp(target, "decoy", 5) == 0) 
    return -1;  /* matches a stretch of negative sequence, doesn't overlap any positive */
  else
    return 0;	/* ignore, could be target == query, but strand != "same", or target != "decoy" && target != query */
}

/* Given bootstrap sampled weights, calculate the maximum # of positives possible */
static double
weighted_total_positives(int **pni, double *queryp, int nq, double *seqp, int npos, int nseq)
{
  int    q, t;
  double total_pos = 0.0;

  for (q = 0; q < nq; q++)
    for (t = 0; t < npos; t++)
      if (pni[q][t] == 1) 
	total_pos += queryp[q] * seqp[t];

  return total_pos * nq * nseq;
}


/* The output files have format:
 *   <E-value> <bitscore> <target_from> <target_to> <target_name> <query_model> <matching_model>/<pos_idx_for_query> <strand>
 *
 * Example of <matching_model>/<pos_idx_for_query>: "tRNA/3":
 * "tRNA" is <matching_model>, "3" is <pos_idx_for_query>, this is the third tRNA.
 * 
 * A hit is a positive if <query_model> equals <matching_model> and <strand> is "same"
 * 
 * A hit is a negative if the <matching_model> is "decoy" and <pos_idx_for_query> is "0".
 * 
 * A hit is ignored if the <query_model> != <matching_model> OR 
 * <query_model> == <matching_model> but <strand> is "opposite".
 * 
 * This information is parsed here, such that each pairwise comparison
 * is stored as:
 *    qidx     : index of the query model
 *    tidx     : index of the target sequence
 *    E        : E-value of the comparison; results are already sorted on this
 */
static int
parse_results_rmark(char *resfile, int **pni, ESL_KEYHASH *qkh, ESL_KEYHASH *poskh, struct result_s **ret_r, int *ret_nr, int *ret_nneg)
{
  ESL_FILEPARSER  *efp    = NULL;
  char            *tok    = NULL;
  char            *match  = NULL;
  char            *query  = NULL;
  char            *strand = NULL;
  int              toklen;
  int              qlen, mlen, slen;
  struct result_s *rp     = NULL;
  int              ralloc = 0;
  int              nr     = 0;
  int              nneg   = 0;
  int              nq, npos;   /* number of querys, number of targets */
  int              i;          /* counter */
  int            **pos_exists; /* [0..q..nq-1][0..t..npos-1], 1 if we've seen a hit from query q to target t, 0 if not */

  if (esl_fileparser_Open(resfile, NULL, &efp) != eslOK) esl_fatal("failed to open pmark results file %s", resfile);
  esl_fileparser_SetCommentChar(efp, '#');

  if ((rp = malloc(sizeof(struct result_s) * 256)) == NULL) esl_fatal("malloc failed");
  ralloc = 256;

  /* set up pos_exists matrix, to keep track of whether we've seen 
   * a hit from each query to each target. This allows us to disallow
   * multiple hits from the same query to the same target.
   */
  nq   = esl_keyhash_GetNumber(qkh);
  npos = esl_keyhash_GetNumber(poskh);
  if ((pos_exists = malloc(sizeof(int *) * nq)) == NULL) esl_fatal("malloc failed");
  for(i = 0; i < nq; i++) { 
    if ((pos_exists[i] = malloc(sizeof(int) * npos)) == NULL) esl_fatal("malloc failed");
    esl_vec_ISet(pos_exists[i], npos, 0);
  }

  while (esl_fileparser_NextLine(efp) == eslOK)
    {
      if (nr == ralloc) {
	if ((rp = realloc(rp, sizeof(struct result_s) * ralloc * 2)) == NULL) esl_fatal("realloc failed");
	ralloc *= 2;
      }

      if (esl_fileparser_GetTokenOnLine(efp, &tok,    &toklen) != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, resfile); /* E-value => rp[nr].E */
      rp[nr].E = atof(tok);
      if (esl_fileparser_GetTokenOnLine(efp, &tok,    &toklen) != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, resfile); /* bit score ignored */
      if (esl_fileparser_GetTokenOnLine(efp, &tok,    &toklen) != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, resfile); /* target from ignored */
      if (esl_fileparser_GetTokenOnLine(efp, &tok,    &toklen) != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, resfile); /* target to ignored */
      if (esl_fileparser_GetTokenOnLine(efp, &tok,    &toklen) != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, resfile); /* target name (e.g. rmark3-2) is ignored */
      if (esl_fileparser_GetTokenOnLine(efp, &query,  &qlen)   != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, resfile); /* query name; will be converted to an index */
      if (esl_fileparser_GetTokenOnLine(efp, &match,  &mlen)   != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, resfile); /* match name; will be converted to an index */
      if (esl_fileparser_GetTokenOnLine(efp, &strand, &slen)   != eslOK) esl_fatal("failed to parse line %d of %s", efp->linenumber, resfile); /* strand; will be used to determine pos/neg */
      
      if (esl_keyhash_Lookup(qkh, query, qlen, &(rp[nr].qidx)) != eslOK) esl_fatal("failed to find query model %s in hash", query);  /* query index */
      rp[nr].class = classify_pair_by_names_and_strand(query, match, strand);
      if (rp[nr].class == -1)		/* negatives: increment nneg and offset the index by npos */
	{
	  rp[nr].tidx = nneg + esl_keyhash_GetNumber(poskh);
	  nneg++;
	}
      else			/* positives/ignores: look up in poskh */
	{
	  if (esl_keyhash_Lookup(poskh, match, mlen, &(rp[nr].tidx)) != eslOK) esl_fatal("failed to find match seq  %s in hash", match);	/* target index */
	  if (rp[nr].class == 1) { 
	    if(pos_exists[rp[nr].qidx][rp[nr].tidx]) { 
	      /* we've already seen a better scoring hit for this query/target pair,
	       * make it an ignore */
	      rp[nr].class = 0;
	    }
	    pos_exists[rp[nr].qidx][rp[nr].tidx] = 1;
	  }
	}
      nr++;
    }

  *ret_r    = rp;
  *ret_nr   = nr;
  *ret_nneg = nneg;
  for(i = 0; i < nq; i++) free(pos_exists[i]);
  free(pos_exists);
    
  esl_fileparser_Close(efp);
  return eslOK;
}


static struct oneplot_s *
create_plot(ESL_GETOPTS *go, int nq)
{
  struct oneplot_s *plot;
  double minfp, maxfp;
  int    status;

  ESL_ALLOC(plot, sizeof(struct oneplot_s));

  if (esl_opt_IsDefault(go, "--min")) minfp = 1.0 / (double) nq;
  else                                minfp = esl_opt_GetReal(go, "--min");

  maxfp    = esl_opt_GetReal(go, "--max");
  plot->nsteps = esl_opt_GetInteger(go, "--steps");
  plot->base   = (int) floor(log10(minfp) * plot->nsteps);
  plot->nxpts  = (int) ceil(log10(maxfp) * plot->nsteps) - plot->base + 1;

  ESL_ALLOC(plot->tp, sizeof(double) * plot->nxpts);

  /* initialize */
  plot->Ethresh = esl_opt_GetReal(go, "--Ethresh"); 
  plot->Ethresh_fn = 0;
  plot->Ethresh_fp = 0;

  return plot;

 ERROR:
  destroy_plot(plot);
  return NULL;
}

static void
destroy_plot(struct oneplot_s *plot)
{
  if (plot == NULL) return;
  if (plot->tp != NULL) free(plot->tp);
  free(plot);
}


/* Given the results <rp> [0..nr-1], 
 *       positive/ignore classifications pni[0..nq-1][0..npos-1]
 *       and bootstrap-sampled usage probabilities <queryp>, <seqp>,
 *       and a plot axis <plot> to store results in;
 * calculate a new ROC plot for this bootstrap sample,
 * and store it in <plot->tp>.
 * 
 * As a special case, if <queryp> and/or <seqp> are NULL, calculate
 * the ROC plot for the original data without bootstrapping.
 */
static void
make_plot(struct result_s *rp, int nresults, int **pni, double *queryp, int nq, double *seqp, int nseq, int npos, 
	  struct oneplot_s *plot)
{
  double weight;
  int    xi, curr_xi;
  double true_pos, false_pos, false_neg;
  double false_pos_per_q;
  int    j;

  if (queryp != NULL && seqp != NULL) 
    plot->totalpos = weighted_total_positives(pni, queryp, nq,  seqp, npos, nseq);
  else
    plot->totalpos = npos;

  curr_xi  = 0;
  true_pos = false_pos = false_pos_per_q = 0.0;
  false_neg = plot->totalpos;

  for (j = 0; j < nresults; j++)
    {
      if (queryp != NULL && seqp != NULL) 
	weight = queryp[rp[j].qidx] * seqp[rp[j].tidx] * nseq * nq;
      else
	weight = 1.0;

      if (rp[j].class == 1) 
	{
	  true_pos  += weight;
	  false_neg -= weight;
	  plot->tp[curr_xi] = true_pos;
	  /*printf("pos %5d/%5d  %3d  %3d  %g\n", (int)  true_pos, j, rp[j].qidx, rp[j].tidx, rp[j].E);*/
	}
      else if (rp[j].class == -1) 
	{
	  false_pos       += weight;
	  false_pos_per_q += weight / (double) nq;   /* FP/query */
	  
	  xi = (int) ceil(log10(false_pos_per_q) * plot->nsteps) - plot->base;

	  if (xi > curr_xi) {
	    for (curr_xi = curr_xi+1; curr_xi < xi && curr_xi < plot->nxpts; curr_xi++)
	      plot->tp[curr_xi] = true_pos;
	    
	    if (curr_xi < plot->nxpts) plot->tp[curr_xi] = true_pos;
	    /*printf("neg %5d/%5d  %3d  %3d  %g\n", (int) true_pos, j, rp[j].qidx, rp[j].tidx, rp[j].E);*/
	  }
	}
      else { 
	;/*printf("ign %5s/%5d  %3d  %3d  %g\n", "?", j, rp[j].qidx, rp[j].tidx, rp[j].E);*/
      }
      /* update MER (initialize it if first result) */
      if (j == 0 || (false_pos + false_neg) < plot->mer) { 
	plot->mer = false_pos + false_neg;
	plot->mer_fp = false_pos;
	plot->mer_fn = false_neg;
	plot->mer_thr = rp[j].E;
      }
      /* update number of FNs and FPs at or below <x> from --Ethresh <x> */
      if (rp[j].E <= plot->Ethresh) { 
	plot->Ethresh_fn = false_neg;
	plot->Ethresh_fp = false_pos;
      }
      if (curr_xi >= plot->nxpts) break;
    }

  /* Rarely, the plot won't have enough false positives to extend all the way to 
   * the left extreme of the x-axis; make sure we propagate the last true_pos */
  for (curr_xi++; curr_xi < plot->nxpts; curr_xi++)
    plot->tp[curr_xi] = true_pos;
}
  

static void
write_plot(FILE *fp, struct oneplot_s *plot)
{
  int    xi;
  double false_pos;

  for (xi = 0; xi < plot->nxpts; xi++) 
    {
      false_pos = exp(log(10) * ((double) plot->base + (double) xi) / (double) plot->nsteps);
      fprintf(fp, "%.5f %.5f\n", false_pos, plot->tp[xi] / plot->totalpos );
    }
  fprintf(fp, "&\n"); 
}


static void
write_mer(FILE *fp, struct oneplot_s *plot)
{
  fprintf(fp, "MER: %5.1f  ", plot->mer);
  fprintf(fp, "FN: %5.1f  ", plot->mer_fn);
  fprintf(fp, "FP: %5.1f  ", plot->mer_fp);
  fprintf(fp, "THR: %5.3f\n", plot->mer_thr);
}

static void
write_Ethresh(FILE *fp, struct oneplot_s *plot)
{
  fprintf(fp, "E-value-threshold: %g  ", plot->Ethresh);
  fprintf(fp, "FN<=E: %5.1f  ", plot->Ethresh_fn);
  fprintf(fp, "FP<=E: %5.1f\n", plot->Ethresh_fp);
}

static void
summary_graph(ESL_GETOPTS *go, FILE *fp, struct oneplot_s *plot, double **yv)
{
  int    nboots              = esl_opt_GetInteger(go, "-N");
  int    by_stddev           = esl_opt_GetBoolean(go, "-s");
  double confidence_interval = esl_opt_GetReal   (go, "--interval");
  double nsd                 = esl_opt_GetReal   (go, "--nsd");
  int    xi;
  double false_pos;
  double mean, var;
  int    ntail;

  for (xi = 0; xi < plot->nxpts; xi++)
    {
      false_pos = exp(log(10) * ((double) plot->base + (double) xi) / (double) plot->nsteps);
      esl_stats_DMean(yv[xi], nboots, &mean, &var);

      /* the dy's in xmgrace xydydy format are dy's, and in order upper, lower  */
      if (by_stddev)
	{
	  fprintf(fp, "%.5f %.5f %.5f %.5f\n", false_pos, mean, nsd*sqrt(var), nsd*sqrt(var)); 
	}
      else
	{
	  esl_vec_DSortIncreasing(yv[xi], nboots);
	  ntail = (int) ((double) nboots * (1.0 - confidence_interval) / 2.0);

	  fprintf(fp, "%.5f %.5f %.5f %.5f\n", 
		  false_pos, mean, 
		  yv[xi][nboots-ntail] - mean,
		  mean - yv[xi][ntail]);
	}
    }
  fprintf(fp, "&\n");
}


static void
summary_mer(ESL_GETOPTS *go, FILE *fp, struct oneplot_s *plot, double *mervec, double *merthrvec, double *merfpvec, double *merfnvec)
{
  int    nboots              = esl_opt_GetInteger(go, "-N");
  int    by_stddev           = esl_opt_GetBoolean(go, "-s");
  double confidence_interval = esl_opt_GetReal   (go, "--interval");
  double nsd                 = esl_opt_GetReal   (go, "--nsd");
  double mermean, mervar;
  double merthrmean, merthrvar;
  double merfpmean, merfpvar;
  double merfnmean, merfnvar;
  int    ntail;

  esl_stats_DMean(mervec,    nboots, &mermean,    &mervar);
  esl_stats_DMean(merthrvec, nboots, &merthrmean, &merthrvar);
  esl_stats_DMean(merfpvec,  nboots, &merfpmean,  &merfpvar);
  esl_stats_DMean(merfnvec,  nboots, &merfnmean,  &merfnvar);

  esl_vec_DSortIncreasing(mervec,    nboots);
  esl_vec_DSortIncreasing(merthrvec, nboots);
  esl_vec_DSortIncreasing(merfpvec,  nboots);
  esl_vec_DSortIncreasing(merfnvec,  nboots);

  ntail = (int) ((double) nboots * (1.0 - confidence_interval) / 2.0);

  if (by_stddev) { 
    fprintf(fp, "[MER: %5.1f %5.1f %5.1f]  ", mermean - nsd*sqrt(mervar), mermean, mermean + nsd*sqrt(mervar));
    fprintf(fp, "[FP: %5.1f %5.1f %5.1f]  ",  merfpmean - nsd*sqrt(merfpvar), merfpmean, merfpmean + nsd*sqrt(merfpvar));
    fprintf(fp, "[FN: %5.1f %5.1f %5.1f]  ",  merfnmean - nsd*sqrt(merfnvar), merfnmean, merfnmean + nsd*sqrt(merfnvar));
    fprintf(fp, "[THR: %5.3f %5.3f %5.3f]\n", merthrmean - nsd*sqrt(merthrvar), merthrmean, merthrmean + nsd*sqrt(merthrvar));
  }
  else { 
    fprintf(fp, "[MER: %5.1f %5.1f %5.1f]  ", mervec[ntail], mermean, mervec[nboots-ntail]);
    fprintf(fp, "[FN: %5.1f %5.1f %5.1f]  ", merfnvec[ntail], merfnmean, merfnvec[nboots-ntail]);
    fprintf(fp, "[FP: %5.1f %5.1f %5.1f]  ", merfpvec[ntail], merfpmean, merfpvec[nboots-ntail]);
    fprintf(fp, "[THR: %5.3f %5.3f %5.3f]\n", merthrvec[ntail], merthrmean, merthrvec[nboots-ntail]);
  }
}


static void
summary_Ethresh(ESL_GETOPTS *go, FILE *fp, struct oneplot_s *plot, double *Ethresh_fn_vec, double *Ethresh_fp_vec)
{
  int    nboots              = esl_opt_GetInteger(go, "-N");
  int    by_stddev           = esl_opt_GetBoolean(go, "-s");
  double confidence_interval = esl_opt_GetReal   (go, "--interval");
  double nsd                 = esl_opt_GetReal   (go, "--nsd");
  double fnmean, fpmean, fnvar, fpvar;
  int    ntail;

  esl_stats_DMean(Ethresh_fn_vec,  nboots, &fnmean,    &fnvar);
  esl_stats_DMean(Ethresh_fp_vec,  nboots, &fpmean,    &fpvar);

  esl_vec_DSortIncreasing(Ethresh_fn_vec,    nboots);
  esl_vec_DSortIncreasing(Ethresh_fp_vec,    nboots);

  ntail = (int) ((double) nboots * (1.0 - confidence_interval) / 2.0);

  fprintf(fp, "[ETHRESH: %g]  ", esl_opt_GetReal(go, "--Ethresh"));
  if (by_stddev) { 
    fprintf(fp, "[E-TP: %5.1f %5.1f %5.1f]  ", fnmean - nsd*sqrt(fnvar), fnmean, fnmean + nsd*sqrt(fnvar));
    fprintf(fp, "[E-FP: %5.1f %5.1f %5.1f]\n", fpmean - nsd*sqrt(fpvar), fpmean, fpmean + nsd*sqrt(fpvar));
  }
  else { 
    fprintf(fp, "[E-TP: %5.1f %5.1f %5.1f]  ", Ethresh_fn_vec[ntail], fnmean, Ethresh_fn_vec[nboots-ntail]);
    fprintf(fp, "[E-FP: %5.1f %5.1f %5.1f]\n", Ethresh_fp_vec[ntail], fpmean, Ethresh_fp_vec[nboots-ntail]);
  }
}