File: varsel.R

#' Variable selection for generalized linear models
#'
#' Perform projection predictive variable selection for generalized linear
#' models, as well as for generalized linear and additive multilevel models,
#' using generic reference models.
#'
#' @param object Either a \code{refmodel}-type object created by
#'   \link[=get_refmodel]{get_refmodel} or \link[=init_refmodel]{init_refmodel},
#'   an object that can be converted to a reference model using
#'   \link[=get_refmodel]{get_refmodel}, or a \code{vsel} object resulting from
#'   \code{varsel} or \code{cv_varsel}.
#' @param d_test A test dataset used to evaluate model performance. If not
#'   provided, the training data are used. Currently this argument is for
#'   internal use only.
#' @param method The method used in the variable selection. Possible options are
#'   \code{'L1'} for L1 search and \code{'forward'} for forward selection.
#'   Default is \code{'forward'} if the number of variables in the full data is
#'   at most 20, and \code{'L1'} otherwise.
#' @param cv_search If TRUE, then the projected coefficients after L1 selection
#'   are computed without any penalization (or using only the regularization
#'   determined by \code{regul}). If FALSE, then the coefficients are the
#'   solution from the L1-penalized projection. This option is relevant only if
#'   \code{method}='L1'. Default is TRUE for genuine reference models and FALSE
#'   if \code{object} is a datafit (see \link[=init_refmodel]{init_refmodel}).
#' @param ndraws Number of posterior draws used in the variable selection.
#'   Cannot be larger than the number of draws in the reference model. Ignored
#'   if \code{nclusters} is set.
#' @param nclusters Number of clusters to use in the clustered projection.
#'   Overrides the \code{ndraws} argument. Defaults to 1.
#' @param ndraws_pred Number of projected draws used for prediction (after
#'   selection). Ignored if \code{nclusters_pred} is given. Note that setting
#'   fewer draws or clusters than there are posterior draws in the reference
#'   model may result in slightly inaccurate projection performance, while
#'   increasing this argument affects the computation time linearly.
#' @param nclusters_pred Number of clusters used for prediction (after
#'   selection). Default is 5.
#' @param nterms_max Maximum number of variables up to which the selection is
#'   continued. Defaults to \code{min(20, D, floor(0.4 * n))}, where n is the
#'   number of observations and D the number of variables.
#' @param intercept Whether to use an intercept in the submodels. Defaults to
#'   TRUE.
#' @param penalty Vector determining the relative penalties or costs for the
#'   variables. Zero means that those variables have no cost and will therefore
#'   be selected first, whereas Inf means those variables will never be
#'   selected. Currently works only if \code{method}='L1'. By default 1 for
#'   each variable.
#' @param verbose If TRUE, may print out some information during the selection.
#'   Defaults to TRUE.
#' @param lambda_min_ratio Ratio between the smallest and largest lambda in the
#'   L1-penalized search. This parameter essentially determines how long the
#'   search is carried out, i.e., how large submodels are explored. No need to
#'   change the default value unless the program gives a warning about this.
#' @param nlambda Number of values in the lambda grid for L1-penalized search.
#'   No need to change unless the program gives a warning about this.
#' @param thresh Convergence threshold when computing the L1 path. Usually no
#'   need to change this.
#' @param regul Amount of regularization in the projection. Usually there is no
#'   need for regularization, but for some models the projection can be
#'   ill-behaved and a small amount of regularization is needed to avoid
#'   numerical problems.
#' @param search_terms A custom list of terms to evaluate for variable
#'   selection. By default considers all the terms in the reference model's
#'   formula.
#' @param ... Additional arguments to be passed to the \code{get_refmodel}
#'   function.
#'
#' @return An object of type \code{vsel} that contains information about the
#'   variable selection. The fields are not meant to be accessed directly by
#'   the user but rather via the helper functions (see the vignettes or type
#'   \code{?projpred} to see the main functions in the package).
#'
#' @examples
#' \donttest{
#' if (requireNamespace('rstanarm', quietly=TRUE)) {
#'   ### Usage with stanreg objects
#'   n <- 30
#'   d <- 5
#'   x <- matrix(rnorm(n*d), nrow=n)
#'   y <- x[,1] + 0.5*rnorm(n)
#'   data <- data.frame(x,y)
#'   fit <- rstanarm::stan_glm(y ~ X1 + X2 + X3 + X4 + X5, gaussian(), data=data,
#'     chains=2, iter=500)
#'   vs <- varsel(fit)
#'   plot(vs)
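#'   # a minimal illustration of inspecting the result via a helper function
#'   suggest_size(vs)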
#' }
#' }
#'
#' @export
varsel <- function(object, ...) {
    UseMethod("varsel")
}

#' @rdname varsel
#' @export
varsel.default <- function(object, ...) {
    refmodel <- get_refmodel(object)
    return(varsel(refmodel, ...))
}
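
## Note on dispatch: calling varsel() on a fitted model (e.g. a stanreg
## object) goes through varsel.default(), which converts the fit into a
## `refmodel` via get_refmodel() and then calls varsel.refmodel() below.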

#' @rdname varsel
#' @export
varsel.refmodel <- function(object, d_test = NULL, method = NULL,
                            ndraws = NULL, nclusters = NULL, ndraws_pred = NULL,
                            nclusters_pred = NULL, cv_search = TRUE,
                            nterms_max = NULL, intercept = TRUE, verbose = TRUE,
                            lambda_min_ratio = 1e-5, nlambda = 150,
                            thresh = 1e-6, regul = 1e-4, penalty = NULL,
                            search_terms = NULL, ...) {
  refmodel <- object
  family <- refmodel$family

  ## fetch the default arguments or replace them by the user defined values
  args <- parse_args_varsel(
    refmodel, method, cv_search, intercept, nterms_max,
    nclusters, ndraws, nclusters_pred, ndraws_pred, search_terms
  )
  method <- args$method
  cv_search <- args$cv_search
  intercept <- args$intercept
  nterms_max <- args$nterms_max
  nclusters <- args$nclusters
  ndraws <- args$ndraws
  nclusters_pred <- args$nclusters_pred
  ndraws_pred <- args$ndraws_pred
  search_terms <- args$search_terms
  has_group_features <- formula_contains_group_terms(refmodel$formula)

  if (method == "l1" && has_group_features) {
    stop(
      "l1 search is not supported for multilevel models",
    )
  }

  if (is.null(d_test)) {
    d_type <- "train"
    test_points <- seq_len(NROW(refmodel$y))
    d_test <- nlist(
      y = refmodel$y, test_points, data = NULL, weights = refmodel$wobs,
      type = d_type
    )
  } else {
    d_type <- d_test$type
  }

  ## reference distributions for selection and prediction after selection
  p_sel <- .get_refdist(refmodel, ndraws, nclusters)
  p_pred <- .get_refdist(refmodel, ndraws_pred, nclusters_pred)

  ## perform the selection
  opt <- nlist(lambda_min_ratio, nlambda, thresh, regul)
  search_path <- select(
    method = method, p_sel = p_sel, refmodel = refmodel,
    family = family, intercept = intercept, nterms_max = nterms_max,
    penalty = penalty, verbose = verbose, opt = opt, search_terms = search_terms
  )
  solution_terms <- search_path$solution_terms

  ## statistics for the selected submodels
  p_sub <- .get_submodels(search_path, c(0, seq_along(solution_terms)),
    family = family, p_ref = p_pred, refmodel = refmodel, intercept = intercept,
    regul = regul, cv_search = cv_search
  )
  sub <- .get_sub_summaries(
    submodels = p_sub, test_points = seq_along(refmodel$y), refmodel = refmodel,
    family = family
  )
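  ## `sub` is a list with one element per submodel size along the search path;
  ## each element holds the summary quantities (mu, lppd, weights) that
  ## get_stat() uses below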

  ## predictive statistics of the reference model on the test data; if no test
  ## data are provided, simply fetch the statistics on the training data
  if ("datafit" %in% class(refmodel)) {
    ## no actual reference model, so we don't know how to predict test
    ## observations
    ntest <- NROW(refmodel$y)
    ref <- list(mu = rep(NA, ntest), lppd = rep(NA, ntest))
  } else {
    if (d_type == "train") {
      mu_test <- refmodel$mu
    } else {
      mu_test <- refmodel$ref_predfun(refmodel$fit, newdata = d_test$data)
    }
    ref <- .weighted_summary_means(
      y_test = d_test, family = family, wsample = refmodel$wsample,
      mu = mu_test, dis = refmodel$dis
    )
  }

  ## warn the user if the reference model's elpd estimate falls outside one
  ## standard error of the projected model's elpd, which may indicate problems
  ## in the projection
  ref_elpd <- get_stat(ref$mu, ref$lppd, d_test, family, "elpd",
    weights = ref$w
  )
  summ <- sub[[length(sub)]]
  proj_elpd <- get_stat(summ$mu, summ$lppd, d_test, family, "elpd",
    weights = summ$w
  )

  if ((ref_elpd$value > (proj_elpd$value + proj_elpd$se) ||
       ref_elpd$value < (proj_elpd$value - proj_elpd$se)) &&
      !inherits(refmodel, "datafit")) {
    warning("The performance of the projected model seems to be misleading, we",
            " recommend checking the reference model as well as running ",
            "`varsel` with a larger `ndraws_pred` or `cv_varsel` for a more",
            " robust estimation.")
  }

  ## store the relevant fields into the object to be returned
  vs <- nlist(
    refmodel,
    search_path,
    d_test,
    summaries = nlist(sub, ref),
    family,
    solution_terms = search_path$solution_terms,
    kl = sapply(p_sub, function(x) x$kl),
    nterms_max,
    nterms_all = count_terms_in_formula(refmodel$formula)
  )
  ## suggest model size
  class(vs) <- "vsel"
  vs$suggested_size <- suggest_size(vs,
    warnings = FALSE
  )

  return(vs)
}
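
## A minimal usage sketch (not run; assumes `fit` is a reference model fit,
## e.g. an rstanarm stanreg object as in the example above): the fields of the
## returned `vsel` object are meant to be inspected via helper functions, e.g.
##   vs <- varsel(fit, nterms_max = 10)
##   suggest_size(vs)  # suggested submodel size
##   plot(vs)          # predictive performance along the search path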


select <- function(method, p_sel, refmodel, family, intercept, nterms_max,
                   penalty, verbose, opt, search_terms = NULL) {
  ##
  ## Auxiliary function, performs variable selection with the given method,
  ## and returns the search_path, i.e., a list with the following entries (the
  ## last three are returned only if one-cluster projection is used for the
  ## selection):
  ##   solution_terms: the variable ordering
  ##   beta: coefficients along the search path
  ##   alpha: intercepts along the search path
  ##   p_sel: the reference distribution used in the selection (the input
  ##   argument p_sel)
  ##
  ## routine that can be used with several clusters
  if (method == "l1") {
    search_path <- search_L1(
      p_sel, refmodel, family, intercept,
      nterms_max - intercept, penalty, opt
    )
    search_path$p_sel <- p_sel
    return(search_path)
  } else if (method == "forward") {
    search_path <- search_forward(p_sel, refmodel, family,
      intercept, nterms_max, verbose, opt,
      search_terms = search_terms
    )
    search_path$p_sel <- p_sel
    return(search_path)
  }
}


parse_args_varsel <- function(refmodel, method, cv_search, intercept,
                              nterms_max, nclusters, ndraws, nclusters_pred,
                              ndraws_pred, search_terms) {
  ##
  ## Auxiliary function for parsing the input arguments for varsel.
  ## The arguments specified by the user (or by the function calling this
  ## function) are used as given, but if some are not given, this function
  ## fills them in with the default values. The purpose of this function is to
  ## avoid repeating the same code in both varsel and cv_varsel.
  ##
  if (is.null(search_terms)) {
    search_terms <- split_formula(refmodel$formula,
      data = refmodel$fetch_data()
    )
  }
  has_group_features <- formula_contains_group_terms(refmodel$formula)
  has_additive_features <- formula_contains_additive_terms(refmodel$formula)

  if (is.null(method)) {
    if (has_group_features || has_additive_features) {
      method <- "forward"
    } else {
      method <- "l1"
    }
  } else {
    method <- tolower(method)
  }

  if (!(method %in% c("l1", "forward"))) {
    stop("Unknown search method")
  }

  if (is.null(cv_search)) {
    cv_search <- !inherits(refmodel, "datafit")
  }

  if ((is.null(ndraws) && is.null(nclusters)) || method == "l1") {
    ## use one cluster for selection by default, and always with L1-search
    nclusters <- 1
  }
  if (is.null(ndraws_pred) && is.null(nclusters_pred)) {
    ## use 5 clusters for prediction by default
    nclusters_pred <- min(NCOL(refmodel$mu), 5)
  }

  max_nv_possible <- count_terms_in_formula(refmodel$formula)
  if (is.null(intercept)) {
    intercept <- refmodel$intercept
  }
  if (is.null(nterms_max)) {
    nterms_max <- min(max_nv_possible, 20)
  } else {
    nterms_max <- min(max_nv_possible, nterms_max + 1)
  }

  return(nlist(
    method, cv_search, intercept, nterms_max, nclusters,
    ndraws, nclusters_pred, ndraws_pred,
    search_terms
  ))
}
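
## Sketch of the resulting defaults (illustrative; exact values depend on the
## reference model): for a plain GLM reference model and no user-supplied
## arguments, the parsed arguments resolve roughly to
##   method = "l1", cv_search = TRUE (genuine reference model),
##   nclusters = 1 (a single cluster during the search),
##   nclusters_pred = min(NCOL(refmodel$mu), 5),
##   nterms_max = min(count_terms_in_formula(refmodel$formula), 20)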