File: apop_normal.c

package info (click to toggle)
apophenia 1.0%2Bds-10
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,152 kB
  • sloc: ansic: 19,483; makefile: 378; awk: 124; sh: 105; javascript: 35; sed: 32
file content (252 lines) | stat: -rw-r--r-- 10,476 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
/* The Normal and Lognormal distributions.
 Copyright (c) 2005--2009 by Ben Klemens.  Licensed under the GPLv2; see COPYING.  

\amodel apop_normal

You know it, it's your attractor in the limit, it's the Gaussian distribution.

\f$N(\mu,\sigma^2) = {1 \over \sqrt{2 \pi \sigma^2}} \exp (-x^2 / 2\sigma^2)\f$

\f$\ln N(\mu,\sigma^2) = (-(x-\mu)^2 / 2\sigma^2) - \ln (2 \pi \sigma^2)/2 \f$

\f$d\ln N(\mu,\sigma^2)/d\mu = (x-\mu) / \sigma^2 \f$

\f$d\ln N(\mu,\sigma^2)/d\sigma^2 = ((x-\mu)^2 / 2(\sigma^2)^2) - 1/2\sigma^2 \f$

See also the \ref apop_multivariate_normal.

\adoc    Input_format A scalar, in the \c vector or \c matrix elements of the input \ref apop_data set.
\adoc    Settings   None.
\adoc    Parameter_format  Parameter zero (in the vector) is the mean, parmeter one is the standard deviation (i.e., the square root of the variance). 
After estimation, a page is added named <tt>\<Covariance\></tt> with the 2 \f$\times\f$ 2 covariance matrix for these two parameters.

\adoc    Predict  <tt>apop_predict(NULL, estimated_normal_model)</tt> returns the expected value. The <tt>->more</tt>
                 element holds an \ref apop_data set with the title <tt>\<Covariance\></tt>, whose 
                 matrix holds the covariance of the mean.
*/

#include "apop_internal.h"

static long double positive_sigma_constraint(apop_data *data, apop_model *v){
    //constraint is 0 < beta_2
    Staticdef(apop_data *, constraint, apop_data_falloc((1,1,2), 0, 0, 1));
    return apop_linear_constraint(v->parameters->vector, constraint, 1e-5);
}

//This just takes the sum of (x-mu)^2. Using gsl_ran_gaussian_pdf
//would be to calculate log(exp((x-mu)^2)) == slow.
static double apply_me(double x, void *mu){ return x - *(double *)mu; }

static double apply_me2(double x, void *mu){ return gsl_pow_2(x - *(double *)mu); }

static long double normal_log_likelihood(apop_data *d, apop_model *params){
    Nullcheck_mpd(d, params, GSL_NAN);
    Get_vmsizes(d)
    double mu = gsl_vector_get(params->parameters->vector,0);
    double sd = gsl_vector_get(params->parameters->vector,1);
    long double ll  = -apop_map_sum(d, .fn_dp = apply_me2, .param = &mu)/(2*gsl_pow_2(sd));
    ll -= tsize*((M_LNPI+M_LN2)/2+log(sd));
	return ll;
}

void get_mu_var(apop_data *data, double *mu_out, double *var_out){
    Get_vmsizes(data)
    double mmean=0, mvar=0, vmean=0, vvar=0;
    if (vsize){
        vmean = apop_mean(data->vector);
        vvar = apop_var(data->vector);
    }
    if (msize1) {
        if (!vsize) apop_matrix_mean_and_var(data->matrix, &mmean, &mvar);	
        else        mmean = apop_matrix_mean(data->matrix);	
    }
    *mu_out = mmean *(msize1*msize2/(tsize+0.0)) + vmean *(vsize/(tsize+0.0));
    *var_out = 0;
    if      (!vsize && !msize1) *var_out = 0;
    else if (vsize && !msize1)  *var_out = vvar;
    else if (!vsize && msize1)  *var_out = mvar;
    else {
        long double vv=0;
        for (int i=-1; i< msize2; i++)
            vv += gsl_pow_2(apop_data_get(data, i) - *mu_out);
        *var_out = vv/tsize;
    }
}

/* \adoc estimated_info Reports the log likelihood.*/
static void normal_estimate(apop_data * data, apop_model *est){
    Nullcheck_mpd(data, est, );
    Get_vmsizes(data); //tsize
    double mean, var;
    get_mu_var(data, &mean, &var);
    est->parameters->vector->data[0] = mean;
    est->parameters->vector->data[1] = sqrt(var);
	apop_name_add(est->parameters->names, "μ", 'r');
	apop_name_add(est->parameters->names, "σ",'r');

    apop_lm_settings *p = apop_settings_get_group(est, apop_lm);
    if (!p) p = Apop_model_add_group(est, apop_lm);
	if (!p || p->want_cov=='y'){
        apop_data *cov = apop_data_get_page(est->parameters, "<Covariance>");
        if (!cov) cov = apop_data_add_page(est->parameters, apop_data_calloc(2, 2), "<Covariance>");
        apop_data_set(cov, 0, 0, mean/tsize);
        apop_data_set(cov, 1, 1, 2*gsl_pow_2(var)/(tsize-1));
    }
    est->data = data;
    apop_data_add_named_elmt(est->info, "log likelihood", normal_log_likelihood(data, est));
}

static long double normal_cdf(apop_data *d, apop_model *params){
    Nullcheck_mpd(d, params, GSL_NAN)
    Get_vmsizes(d)  //vsize
    double val = apop_data_get(d, 0, vsize ? -1 : 0);
    double mu = gsl_vector_get(params->parameters->vector, 0);
    double sd = gsl_vector_get(params->parameters->vector, 1);
    return gsl_cdf_gaussian_P(val-mu, sd);
}

static void normal_dlog_likelihood(apop_data *d, gsl_vector *gradient, apop_model *params){    
    Nullcheck_mpd(d, params, )
    Get_vmsizes(d)
    double mu = gsl_vector_get(params->parameters->vector,0),
           sd = gsl_vector_get(params->parameters->vector,1),
           dll, sll;
    dll = apop_map_sum(d, .fn_dp = apply_me, .param=&mu);
    sll = apop_map_sum(d, .fn_dp = apply_me2, .param=&mu);
    gsl_vector_set(gradient, 0, dll/gsl_pow_2(sd));
    gsl_vector_set(gradient, 1, sll/gsl_pow_3(sd)- tsize /sd);
}

/* \adoc predict Returns the mean, regardless of the input data you give (including
\c NULL). The second page is <tt>\<Covariance\></tt> of the mean.*/ 
apop_data * normal_predict(apop_data *dummy, apop_model *m){
    apop_data *out = apop_data_alloc(1,1);
    out->matrix->data[0] = m->parameters->vector->data[0];

    apop_data *cov = apop_data_get_page(out, "<Covariance>");
    if (!cov) cov = apop_data_add_page(out, apop_data_alloc(1,1), "<Covariance>");
    if (m->data){
           Get_vmsizes(m->data) //tsize
           cov->matrix->data[0] = m->parameters->vector->data[1]/ sqrt(tsize);
    } else cov->matrix->data[0] = 0;
    return out;
}

/*\adoc RNG A wrapper for the GSL's Normal RNG. */
static int normal_rng(double *out, gsl_rng *r, apop_model *p){
	*out = gsl_ran_gaussian(r, p->parameters->vector->data[1]) + p->parameters->vector->data[0];
    return 0;
}

static void normal_prep(apop_data *data, apop_model *params){
    apop_score_vtable_add(normal_dlog_likelihood, apop_normal);
    apop_predict_vtable_add(normal_predict, apop_normal);
    apop_model_clear(data, params);
}

apop_model *apop_normal = &(apop_model){"Normal distribution", 2, 0, 0, .dsize=1,
 .estimate = normal_estimate, .log_likelihood = normal_log_likelihood, 
 .prep = normal_prep, .constraint = positive_sigma_constraint, 
 .draw = normal_rng, .cdf = normal_cdf};


/*\amodel apop_lognormal

The log likelihood function for lognormal distributions:

\f$f = exp(-(ln(x)-\mu)^2/(2\sigma^2))/ (x\sigma\sqrt{2\pi})\f$

\f$ln f = -(ln(x)-\mu)^2/(2\sigma^2) - ln(x) - ln(\sigma\sqrt{2\pi})\f$

\adoc    Input_format     A scalar in the the matrix or vector element of the input \ref apop_data set.
\adoc    Parameter_format  Zeroth vector element is the mean of the logged data set; first is the standard deviation of the logged data set.
\adoc    Estimate_results  Parameters are set. Log likelihood is calculated.
\adoc    settings   None.    
*/

static double lnx_minus_mu_squared(double x, void *mu_in){
	return gsl_pow_2(log(x) - *(double *)mu_in);
}

static long double lognormal_log_likelihood(apop_data *d, apop_model *params){
    Nullcheck_mpd(d, params, GSL_NAN)
    Get_vmsizes(d) //tsize
    double mu = gsl_vector_get(params->parameters->vector, 0);
    double sd = gsl_vector_get(params->parameters->vector, 1);
    long double ll = -apop_map_sum(d, .fn_dp=lnx_minus_mu_squared, .param=&mu);
      ll /= (2*gsl_pow_2(sd));
      ll -= apop_map_sum(d, log);
      ll -= tsize*((M_LNPI+M_LN2)/2+log(sd));
	return ll;
}

/* \adoc estimated_info   Reports <tt>log likelihood</tt>. */
static void lognormal_estimate(apop_data * data, apop_model *est){
    apop_data *cp = apop_data_copy(data);
    Apop_stopif(!cp->matrix && !cp->vector, est->error='d'; return, 
            0, "Neither matrix nor vector in the input data.");
    Get_vmsizes(cp); //vsize, msize1

    if (vsize){
        apop_vector_log(cp->vector);
    }
    if (msize2){
        for (int i=0; i< msize2; i++)
            apop_vector_log(Apop_cv(cp, i));
    }
    double mean, var;
    get_mu_var(cp, &mean, &var);
    apop_data_free(cp);
    est->parameters->vector->data[0] = mean;
    est->parameters->vector->data[1] = var < 0 ? 0 : sqrt(var); // -ε sometimes happens

    apop_name_add(est->parameters->names, "μ", 'r');
    apop_name_add(est->parameters->names, "σ", 'r');
    apop_data_add_named_elmt(est->info, "log likelihood", lognormal_log_likelihood(data, est));
}

static long double lognormal_cdf(apop_data *d, apop_model *params){
    Nullcheck_mpd(d, params, GSL_NAN)
    Get_vmsizes(d)  //vsize
    double val = apop_data_get(d, 0, vsize ? -1 : 0);
    double mu = gsl_vector_get(params->parameters->vector, 0);
    double sd = gsl_vector_get(params->parameters->vector, 1);
    return gsl_cdf_lognormal_P(val, mu, sd);
}

/* \adoc predict Returns the expeced value, \f$E(x) = e^(mu + sigma^2/2)\f$
  in the (0, 0)th element of the matrix, regardless of the input data you give (including \c NULL). */ 
apop_data * lognormal_predict(apop_data *dummy, apop_model *m){
    apop_data *out = apop_data_alloc(1,1);
    out->matrix->data[0] = exp(m->parameters->vector->data[0] 
                                + gsl_pow_2(m->parameters->vector->data[1])/2);
    return out;
}

double diff_sq(double x, void *mu){ return gsl_pow_2(log(x) - *(double*)mu); }

static void lognormal_dlog_likelihood(apop_data *d, gsl_vector *gradient, apop_model *params){    
    double mu = gsl_vector_get(params->parameters->vector,0),
           sd = gsl_vector_get(params->parameters->vector,1);
    Get_vmsizes(d); //tsize
    double dll = apop_map_sum(d, log) - mu*tsize;
    double sll = apop_map_sum(d, .fn_dp=diff_sq, .param=&mu);
    gsl_vector_set(gradient, 0, dll/gsl_pow_2(sd));
    gsl_vector_set(gradient, 1, sll/gsl_pow_3(sd)- tsize/sd);
}

/* \adoc RNG An Apophenia wrapper for the GSL's Normal RNG, exponentiated.  */
static int lognormal_rng(double *out, gsl_rng *r, apop_model *p){
	*out = exp(gsl_ran_gaussian(r, p->parameters->vector->data[1]) + p->parameters->vector->data[0]);
    return 0;
}

static void lognormal_prep(apop_data *data, apop_model *params){
    apop_score_vtable_add(lognormal_dlog_likelihood, apop_lognormal);
    apop_model_clear(data, params);
}

apop_model *apop_lognormal = &(apop_model){"Lognormal distribution", 2, 0, 0, .dsize=1,
 .estimate = lognormal_estimate, .log_likelihood = lognormal_log_likelihood,
 .prep = lognormal_prep, .constraint = positive_sigma_constraint, 
 .draw = lognormal_rng, .cdf= lognormal_cdf};