File: esl_histogram.h

package info (click to toggle)
hmmer 3.2.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 23,380 kB
  • sloc: ansic: 119,305; perl: 8,791; sh: 3,266; makefile: 1,871; python: 598
file content (126 lines) | stat: -rw-r--r-- 5,378 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* Collection and display of score histograms.
 * 
 * SRE, Fri Jul  1 13:22:45 2005 [St. Louis]
 */
#ifndef eslHISTOGRAM_INCLUDED
#define eslHISTOGRAM_INCLUDED
#include "esl_config.h"

#include <math.h>   /* floor() is in one of the macros */


/* Structure: ESL_HISTOGRAM
 * 
 * Keeps a score histogram, in which scores are counted into bins of
 * size (width) w. 
 *   histogram starts at bmin <  floor(xmin/w) * w
 *   histogram ends at   bmax >= ceil(xmax/w)*w
 *   nb = (bmax-bmin)/w
 *   each score x is counted into bin b = nb - (int) (bmax-x)/w
 *   each bin b contains scores bw+bmin < x <= (b+1)w + bmin
 * 
 * Anything having to do with the counts themselves (obs, n, etc)
 * is a uint64_t, with range 0..2^64-1  (up to 2e19).
 */  
typedef struct {
  /* The histogram is kept as counts in fixed-width bins.
   */
  uint64_t *obs;	/* observed counts in bin b, 0..nb-1 (dynamic)      */
  int       nb;         /* number of bins                                   */
  double    w;		/* fixed width of each bin                          */
  double    bmin, bmax;	/* histogram bounds: all x satisfy bmin < x <= bmax */
  int       imin, imax;	/* smallest, largest bin that contain obs[i] > 0    */

  /* Optionally, in a "full" h, we can also keep all the raw samples in x.
   */
  double    xmin, xmax;	/* smallest, largest sample value x observed        */
  uint64_t  n;          /* total number of raw data samples                 */
  double   *x;		/* optional: raw sample values x[0..n-1]            */
  uint64_t  nalloc;	/* current allocated size of x                      */

  /* The binned data might be censored (either truly, or virtually).
   * This information has to be made available to a binned/censored
   * parameter fitting function, and to goodness-of-fit tests.
   */
  double   phi;		/* censoring value; all x_i > phi                   */
  int      cmin;	/* smallest bin index that contains uncensored data */
  uint64_t z;		/* # of censored values <= phi                      */
  uint64_t Nc;	        /* # samples in complete data (including unobs)     */
  uint64_t No;		/* # of samples in observed data                    */

  /* Expected binned counts are set by SetExpect() or SetExpectedTail().
   */
  double *expect;	/* expected counts in bin b, 0..nb-1 (not resized)  */
  int     emin;		/* smallest bin index that contains expected counts */
  double  tailbase;	/* for tail fits: fitted x > tailbase               */
  double  tailmass;	/* for tail fits: fractional prob in the tail       */

  /* Some status flags
   */
  int is_full;		/* TRUE when we're keeping raw data in x           */
  int is_done;		/* TRUE if we prevent more Add()'s                 */
  int is_sorted;	/* TRUE if x is sorted smallest-to-largest         */
  int is_tailfit;	/* TRUE if expected dist only describes tail       */
  int is_rounded;	/* TRUE if values aren't more accurate than bins   */
  enum { COMPLETE, VIRTUAL_CENSORED, TRUE_CENSORED } dataset_is; 

} ESL_HISTOGRAM;

#define esl_histogram_Bin2LBound(h,b)  ((h)->w*(b) + (h)->bmin)
#define esl_histogram_Bin2UBound(h,b)  ((h)->w*((b)+1) + (h)->bmin)

/* Creating/destroying histograms and collecting data:
 */
extern ESL_HISTOGRAM *esl_histogram_Create    (double bmin, double bmax, double w);
extern ESL_HISTOGRAM *esl_histogram_CreateFull(double bmin, double bmax, double w);
extern void           esl_histogram_Destroy  (ESL_HISTOGRAM *h);
extern int            esl_histogram_Score2Bin(ESL_HISTOGRAM *h, double x, int *ret_b);
extern int            esl_histogram_Add      (ESL_HISTOGRAM *h, double x);

/* Declarations about the binned data before parameter fitting:
 */
extern int esl_histogram_DeclareCensoring(ESL_HISTOGRAM *h, int z, double phi);
extern int esl_histogram_DeclareRounding (ESL_HISTOGRAM *h);
extern int esl_histogram_SetTail         (ESL_HISTOGRAM *h, double phi,   
					  double *ret_newmass);
extern int esl_histogram_SetTailByMass   (ESL_HISTOGRAM *h, double pmass,
					  double *ret_newmass);

/* Accessing data samples in a full histogram:
 */
extern int esl_histogram_GetRank(ESL_HISTOGRAM *h, int rank, double *ret_x);
extern int esl_histogram_GetData(ESL_HISTOGRAM *h, double **ret_x, int *ret_n);
extern int esl_histogram_GetTail(ESL_HISTOGRAM *h, double phi, double **ret_x,
				 int *ret_n, int *ret_z);
extern int esl_histogram_GetTailByMass(ESL_HISTOGRAM *h, double pmass, 
				       double **ret_x, int *ret_n, int *ret_z);


/* Setting expected binned counts:
 */
extern int esl_histogram_SetExpect(ESL_HISTOGRAM *h, 
				   double (*cdf)(double x, void *params),
				   void *params);
extern int esl_histogram_SetExpectedTail(ESL_HISTOGRAM *h, double base_val,
					 double pmass,
					 double (*cdf)(double x, void *params), 
					 void *params);

/* Output/display of binned data:
 */
extern int esl_histogram_Write       (FILE *fp, ESL_HISTOGRAM *h);
extern int esl_histogram_Plot        (FILE *fp, ESL_HISTOGRAM *h);
extern int esl_histogram_PlotSurvival(FILE *fp, ESL_HISTOGRAM *h);
extern int esl_histogram_PlotQQ      (FILE *fp, ESL_HISTOGRAM *h, 
				      double (*invcdf)(double, void *), void *params);

/* Goodness of fit testing
 */
extern int esl_histogram_Goodness(ESL_HISTOGRAM *h, int nfitted, 
				  int *ret_nbins,
				  double *ret_G,  double *ret_Gp,
				  double *ret_X2, double *ret_X2p);



#endif /*eslHISTOGRAM_INCLUDED*/