1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
|
/** \file apop_missing_data.c Some missing data handlers. */
/* Copyright (c) 2007, 2009 by Ben Klemens. Licensed under the GPLv2; see COPYING. */
#include "apop_internal.h"
#include <regex.h>
/** If there is an NaN anywhere in the row of data (including the matrix, the vector, the weights, and the text) then delete the row from the data set.
\li If every row has a NaN, then this returns \c NULL.
\li If \c apop_opts.nan_string is not \c NULL, then I will make case-insensitive comparisons to the text elements to check for bad data as well.
\li If \c inplace = 'y', then I'll free each element of the input data
set and refill it with the pruned elements. I'll still take up (up to)
twice the size of the data set in memory during the function. If
every row has a NaN, then your \c apop_data set will end up with
\c NULL vector, matrix, .... if \c inplace = 'n', then the original data set is
left where it was, though internal elements may be moved.
\li I only look at the first page of data (i.e. the \c more element is ignored).
\li Listwise deletion is often not a statistically valid means of dealing with missing data.
It is typically better to impute the data (preferably multiple times). See \ref
apop_ml_impute for a less-invalid means, or <a href="https://github.com/rodri363/tea">Tea
for survey imputation</a> for heavy-duty survey editing and imputation.
\li This function uses the \ref designated syntax for inputs.
\param d The data, with NaNs
\param inplace If \c 'y', clear out the pointer-to-\ref apop_data that
you sent in and refill with the pruned data. If \c 'n', leave the
set alone and return a new data set. Default=\c 'n'.
\return A (potentially shorter) copy of the data set, without
NaNs. If <tt>inplace=='y'</tt>, a pointer to the input, which was shortened in place. If the entire data set is cleared out, then this will be \c NULL.
\see apop_data_rm_rows
*/
#ifdef APOP_NO_VARIADIC
apop_data * apop_data_listwise_delete(apop_data *d, char inplace){
#else
apop_varad_head(apop_data *, apop_data_listwise_delete){
apop_data * apop_varad_var(d, NULL);
if (!d) return NULL;
char apop_varad_var(inplace, 'n');
return apop_data_listwise_delete_base(d, inplace);
}
apop_data * apop_data_listwise_delete_base(apop_data *d, char inplace){
#endif
Get_vmsizes(d) //defines firstcol, vsize, wsize, msize1, msize2.
Apop_stopif(!msize1 && !vsize && !*d->textsize, return NULL, 0,
"You sent to apop_data_listwise_delete a data set with NULL matrix, NULL vector, and no text. "
"Confused, it is returning NULL.");
//find out where the NaNs are
int len = GSL_MAX(vsize ? vsize : msize1, d->textsize[0]); //still some size assumptions here.
int not_empty = 0;
int *marked = calloc(len, sizeof(int));
for (int i=0; i< (vsize ? vsize: msize1); i++)
for (int j=firstcol; j <msize2; j++){
if (gsl_isnan(apop_data_get(d, i, j))){
marked[i] = 1;
break;
}
}
for (int i=0; i< wsize; i++)
if (gsl_isnan(gsl_vector_get(d->weights, i)))
marked[i] = 1;
if (d->textsize[0] && apop_opts.nan_string){
for(int i=0; i< d->textsize[0]; i++)
if (!marked[i])
for(int j=0; j< d->textsize[1]; j++)
if (!strcasecmp(apop_opts.nan_string, d->text[i][j])){
marked[i] ++;
break;
}
}
//check that at least something isn't NULL.
for (int i=0; i< len; i++)
if (!marked[i]){
not_empty ++;
break;
}
if (!not_empty){
free(marked);
return NULL;
}
apop_data *out = (inplace=='y'|| inplace=='Y') ? d : apop_data_copy(d);
apop_data_rm_rows(out, marked);
free(marked);
return out;
}
//ML imputation
/** \hideinitializer */
#define Switch_back \
apop_data *real_data = ml_model->parameters; \
apop_model *actual_base = ml_model->more; \
actual_base->parameters = d;
static void i_est(apop_data *d, apop_model *ml_model){
Switch_back
actual_base = apop_estimate(real_data, actual_base);
}
static long double i_ll(apop_data *d, apop_model *ml_model){
Switch_back
return apop_log_likelihood(real_data, actual_base);
}
static long double i_p(apop_data *d, apop_model *ml_model){
Switch_back
return apop_p(real_data, actual_base);
}
//doesn't actually move the parameters
static long double i_constraint(apop_data *d, apop_model *ml_model){
Switch_back
if (!actual_base->constraint) return 0;
apop_data *original_params = apop_data_copy(actual_base->parameters);
long double out = actual_base->constraint(real_data, actual_base);
if (out) apop_data_memcpy(actual_base->parameters, original_params);
apop_data_free(original_params);
return out;
}
apop_model *apop_swap_model = &(apop_model){"Model with data and params swapped", .estimate=i_est, .p = i_p, .log_likelihood=i_ll, .constraint = i_constraint};
/** Impute the most likely data points to replace NaNs in the data, and insert them into
the given data. That is, the data set is modified in place.
How it works: this uses the machinery for \ref apop_model_fix_params. The only difference is
that this searches over the data space and takes the parameter space as fixed, while basic
fix params model searches parameters and takes data as fixed. So this function just does the
necessary data-parameter switching to make that happen.
\param d The data set. It comes in with NaNs and leaves entirely filled in.
\param mvn A parametrized \ref apop_model from which you expect the data was derived.
if \c NULL, then I'll use the Multivariate Normal that best fits the data after listwise deletion.
\return An estimated \ref apop_model. Also, the data input will be filled in and ready to use.
*/
apop_model * apop_ml_impute(apop_data *d, apop_model* mvn){
if (!mvn){
apop_data *list_d = apop_data_listwise_delete(d);
Apop_stopif(!list_d, return NULL, 0, "Listwise deletion returned no whole rows, "
"so I couldn't fit a Multivariate Normal to your data. "
"Please provide a pre-estimated initial model.");
mvn = apop_estimate(list_d, apop_multivariate_normal);
apop_data_free(list_d);
}
apop_model *impute_me = apop_model_copy(apop_swap_model);
impute_me->parameters = d;
impute_me->more = mvn;
apop_model *fixed = apop_model_fix_params(impute_me);
Apop_model_add_group(fixed, apop_parts_wanted);
apop_model *m = apop_estimate(mvn->parameters, fixed);
apop_model_free(fixed);
return m;
}
|