1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
|
/* The binomial distribution as an \c apop_model.
Copyright (c) 2006--2007, 2010--11 by Ben Klemens. Licensed under the GPLv2; see COPYING.
\amodel apop_binomial The multi-draw generalization of the Bernoulli, or the two-bin special case of the \ref apop_multinomial "Multinomial distribution".
It is implemented as an alias of the \ref apop_multinomial model, except that
it has an explicit CDF, we know it has two parameters, and its draw method returns a scalar. I.e.,
<tt>.vsize==2</tt> and <tt>.dsize==1</tt>.
\adoc Parameter_format a vector, v[0]=\f$n\f$; v[1]=\f$p_1\f$. Thus, \f$p_0\f$
isn't written down; see \ref apop_multinomial for further discussion.
If you input \f$v[1]>1\f$ and <tt>apop_opts.verbose >=1</tt>, the log likelihood
function will throw a warning.
Post-estimate, will have a <tt>\<Covariance\></tt> page with the covariance
matrix for the \f$p\f$s (\f$n\f$ effectively has no variance).
\adoc Input_format Each row of the matrix is one observation, consisting of two elements.
The number of draws of type zero (sometimes read as `misses' or `failures') are in column zero,
the number of draws of type one (`hits', `successes') in column one.
\adoc RNG The RNG returns a single number representing the success count, not a
vector of length two giving both the failure bin and success bin. This is notable
because it differs from the input data format, but it tends to be what people expect
from a Binomial RNG. For draws with both dimensions (or situations where draws are fed back into the model), use an \ref apop_multinomial model
with <tt>.vsize =2</tt>.
*/
#include "apop_internal.h"
/* \adoc cdf At the moment, only implemented for the Binomial.
Let the first element of the data set (top of the vector or point (0,0) in the
matrix, your pick) be $L$; then I return the sum of the odds of a draw from the given
Binomial distribution returning $0, 1, \dots, L$ hits. */
static long double binomial_cdf(apop_data *d, apop_model *est){
Nullcheck_mpd(d, est, GSL_NAN)
Get_vmsizes(d); //firstcol
double hitcount = apop_data_get(d, .col=firstcol);
double n = gsl_vector_get(est->parameters->vector, 0);
double p = gsl_vector_get(est->parameters->vector, 1);
return gsl_cdf_binomial_P(hitcount, p, n);
}
static void make_covar(apop_model *est){
int size = est->parameters->vector->size;
//the trick where we turn the params into a p-vector
double * pv = est->parameters->vector->data;
int n = pv[0];
pv[0] = 1 - (apop_sum(est->parameters->vector)-n);
apop_data *cov = apop_data_add_page(est->parameters,
apop_data_alloc(size, size), "<Covariance>");
for (int i=0; i < size; i++){
double p = apop_data_get(est->parameters, i, -1);
apop_data_set(cov, i, i, n * p *(1-p));
for (int j=i+1; j < size; j++){
double pj = apop_data_get(est->parameters, j, -1);
double thiscell = -n*p*pj;
apop_data_set(cov, i, j, thiscell);
apop_data_set(cov, j, i, thiscell);
}
}
pv[0]=n;
}
static long double multinomial_constraint(apop_data *data, apop_model *b){
//constraint is that 0 < all elmts, and 1>all ps.
int size = b->parameters->vector->size;
static threadlocal apop_data *constr;
if (constr && constr->matrix->size2 != size)
apop_data_free(constr);
if (!constr){
constr = apop_data_calloc(size*2-1, size*2-1, size);
//top half: 0 < [param], including param 0
gsl_matrix_set_identity(Apop_subm(constr->matrix, 0, 0, size, size));
//bottom (almost) half: 1 >= [param], excluding param 0
for (int i=size; i < size*2-1; i++){
apop_data_set(constr, i, -1, -1);
apop_data_set(constr, i, i-size+1, -1);
}
}
return apop_linear_constraint(b->parameters->vector, constr);
}
static double binomial_ll(gsl_vector *hits, void *paramv){
return log(gsl_ran_binomial_pdf(hits->data[1], ((gsl_vector*)paramv)->data[1], ((gsl_vector*)paramv)->data[0]));
}
static double multinomial_ll(gsl_vector *v, void *params){
double *pv = ((apop_model*)params)->parameters->vector->data;
size_t size = ((apop_model*)params)->parameters->vector->size;
unsigned int hv[v->size]; //The GSL wants our hit count in an int*.
for (size_t i=0; i < v->size; i ++)
hv[i] = gsl_vector_get(v, i);
return gsl_ran_multinomial_lnpdf(size, pv, hv);
}
static long double multinomial_log_likelihood(apop_data *d, apop_model *params){
Nullcheck_mpd(d, params, GSL_NAN);
double *pv = params->parameters->vector->data;
double n = pv[0];
Apop_assert_c(params->parameters->vector->size>=2, GSL_NAN, 0, "I need two or more input parameters "
"representing [n, p_1, (...)].");
Apop_assert_c(pv[1] <=1, GSL_NAN, 1, "The input parameters should be [n, p_1, (...)], but "
"element 1 of the parameter vector is >1."); //mostly makes sense for the binomial.
if (n==2) return apop_map_sum(d, .fn_vp=binomial_ll, .param=params->parameters->vector);
pv[0] = 1 - (apop_sum(params->parameters->vector)-n);//making the params a p-vector. Put n back at the end.
double out = apop_map_sum(d, .fn_vp=multinomial_ll, .param=params);
pv[0]=n;
return out;
}
/*
static apop_model *multinomial_paramdist(apop_data *d, apop_model *m){
apop_pm_settings *settings = Apop_settings_get_group(m, apop_pm);
if (settings->index!=-1){
int i = settings->index;
double mu = apop_data_get(m->parameters, i, -1);
double sigma = sqrt(apop_data_get(m->parameters, i, i, .page="<Covariance>"));
int df = apop_data_get(m->info, .rowname="df", .page = "info");
return apop_model_set_parameters(apop_t_distribution, mu, sigma, df);
}
}
*/
static int multinomial_rng(double *out, gsl_rng *r, apop_model* est){
Nullcheck_mp(est, 1);
double * p = est->parameters->vector->data;
//the trick where we turn the params into a p-vector
int N = p[0];
if (est->parameters->vector->size == 2) {
*out = gsl_ran_binomial_knuth(r, 1-gsl_vector_get(est->parameters->vector, 1), N);
out[1] = N-*out;
goto done;
}
//else, multinomial
//cut/pasted/modded from the GSL. Copyright them.
p[0] = 1 - (apop_sum(est->parameters->vector)-N);
double sum_p = 0.0;
int sum_n = 0;
for (int i = 0; i < est->parameters->vector->size; i++) {
out[i] = (p[i] > 0)
? gsl_ran_binomial (r, p[i] / (1 - sum_p), N - sum_n)
: 0;
sum_p += p[i];
sum_n += out[i];
}
done:
p[0] = N;
return 0;
}
static void multinomial_show(apop_model *est, FILE *out){
double * p = est->parameters->vector->data;
int N=p[0];
p[0] = 1 - (apop_sum(est->parameters->vector)-N);
fprintf(out, "%s, with %i draws.\nBin odds:\n", est->name, N);
apop_vector_print(est->parameters->vector, .output_pipe=out);
p[0]=N;
}
double avs(gsl_vector *v){return (double) apop_vector_sum(v);}
/* \amodel apop_multinomial The \f$n\f$--option generalization of the \ref apop_binomial "Binomial distribution".
\adoc estimated_info Reports <tt>log likelihood</tt>. */
static void multinomial_estimate(apop_data * data, apop_model *est){
Nullcheck_mpd(data, est, );
Get_vmsizes(data); //vsize, msize1
est->parameters= apop_map(data, .fn_v=avs, .part='c');
gsl_vector *v = est->parameters->vector;
int n = apop_sum(v)/data->matrix->size1; //size of one row
apop_vector_normalize(v);
apop_name_add(est->parameters->names, "n", 'r');
apop_data_set(est->parameters, .val=n); //zeroth item is now n, not p_0
char name[100];
for(int i=1; i < v->size; i++){
sprintf(name, "p%i", i);
apop_name_add(est->parameters->names, name, 'r');
}
est->dsize = n;
make_covar(est);
apop_data_add_named_elmt(est->info, "log likelihood", multinomial_log_likelihood(data, est));
}
static void multinom_prep(apop_data *data, apop_model *params){
apop_model_print_vtable_add(multinomial_show, params);
apop_model_clear(data, params);
}
/* \adoc Input_format Each row of the matrix is one observation: a set of draws from a single bin.
The number of draws of type zero are in column zero, the number of draws of type one in column one, et cetera.
\li You may have a set of several Bernoulli-type draws, which could be summed together
to form a single Binomial draw. The \ref apop_data_to_dummies function (using the
<tt>.keep_first='y'</tt> option), to split a single column of numbers into a sequence
of columns, may help with this.
\adoc Parameter_format
The parameters are kept in the vector element of the \c apop_model parameters element. \c parameters->vector->data[0]==n;
\c parameters->vector->data[1...]==p_1....
The numeraire is bin zero, meaning that \f$p_0\f$ is not explicitly listed, but is
\f$p_0=1-\sum_{i=1}^{k-1} p_i\f$, where \f$k\f$ is the number of bins. Conveniently enough,
the zeroth element of the parameters vector holds \f$n\f$, and so a full probability vector can
easily be produced by overwriting that first element. For example:
\code
apop_model *estimated = apop_estimate(your_data, apop_multinomial);
int n = apop_data_get(estimated->parameters);
apop_data_set(estimated->parameters, .val=1 - (apop_sum(estimated->parameters)-n));
\endcode
And now the parameter vector is a proper list of probabilities.
\li Because an observation is a single row, the number of bins, \f$k\f$ is set to equal
the length of the first row (counting both vector and matrix elements, as appropriate).
The covariance matrix will be \f$k \times k\f$.
\li Each row should sum to \f$N\f$, the number of draws. The estimation routine doesn't check this, but instead uses the average sum across all rows.
\adoc Estimate_results Parameters are estimated. Covariance matrix is filled.
\adoc RNG Returns a single vector of length \f$k\f$, the result of an imaginary tossing
of \f$N\f$ balls into \f$k\f$ urns, with the given probabilities.
*/
apop_model *apop_multinomial = &(apop_model){"Multinomial distribution", -1, .dsize=-1,
.estimate = multinomial_estimate, .log_likelihood = multinomial_log_likelihood,
.constraint = multinomial_constraint, .draw = multinomial_rng, .prep=multinom_prep};
apop_model *apop_binomial = &(apop_model){"Binomial distribution", 2, .dsize=1,
.estimate = multinomial_estimate, .log_likelihood = multinomial_log_likelihood,
.constraint = multinomial_constraint, .draw = multinomial_rng, .prep=multinom_prep, .cdf= binomial_cdf};
|