File: varsel.R

package info (click to toggle)
r-cran-projpred 2.3.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,180 kB
  • sloc: cpp: 296; sh: 14; makefile: 5
file content (405 lines) | stat: -rw-r--r-- 18,750 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
#' Variable selection without cross-validation
#'
#' Run the *search* part and the *evaluation* part for a projection predictive
#' variable selection. The search part determines the solution path, i.e., the
#' best submodel for each submodel size (number of predictor terms). The
#' evaluation part determines the predictive performance of the submodels along
#' the solution path.
#'
#' @param object An object of class `refmodel` (returned by [get_refmodel()] or
#'   [init_refmodel()]) or an object that can be passed to argument `object` of
#'   [get_refmodel()].
#' @param d_test A `list` of the structure outlined in section "Argument
#'   `d_test`" below, providing test data for evaluating the predictive
#'   performance of the submodels as well as of the reference model. If `NULL`,
#'   the training data is used.
#' @param method The method for the search part. Possible options are `"L1"` for
#'   L1 search and `"forward"` for forward search. If `NULL`, then internally,
#'   `"L1"` is used, except if the reference model has multilevel or additive
#'   terms or if `!is.null(search_terms)`. See also section "Details" below.
#' @param refit_prj A single logical value indicating whether to fit the
#'   submodels along the solution path again (`TRUE`) or to retrieve their fits
#'   from the search part (`FALSE`) before using those (re-)fits in the
#'   evaluation part.
#' @param ndraws Number of posterior draws used in the search part. Ignored if
#'   `nclusters` is not `NULL` or in case of L1 search (because L1 search always
#'   uses a single cluster). If both (`nclusters` and `ndraws`) are `NULL`, the
#'   number of posterior draws from the reference model is used for `ndraws`.
#'   See also section "Details" below.
#' @param nclusters Number of clusters of posterior draws used in the search
#'   part. Ignored in case of L1 search (because L1 search always uses a single
#'   cluster). For the meaning of `NULL`, see argument `ndraws`. See also
#'   section "Details" below.
#' @param ndraws_pred Only relevant if `refit_prj` is `TRUE`. Number of
#'   posterior draws used in the evaluation part. Ignored if `nclusters_pred` is
#'   not `NULL`. If both (`nclusters_pred` and `ndraws_pred`) are `NULL`, the
#'   number of posterior draws from the reference model is used for
#'   `ndraws_pred`. See also section "Details" below.
#' @param nclusters_pred Only relevant if `refit_prj` is `TRUE`. Number of
#'   clusters of posterior draws used in the evaluation part. For the meaning of
#'   `NULL`, see argument `ndraws_pred`. See also section "Details" below.
#' @param nterms_max Maximum number of predictor terms until which the search is
#'   continued. If `NULL`, then `min(19, D)` is used where `D` is the number of
#'   terms in the reference model (or in `search_terms`, if supplied). Note that
#'   `nterms_max` does not count the intercept, so use `nterms_max = 0` for the
#'   intercept-only model. (Correspondingly, `D` above does not count the
#'   intercept.)
#' @param penalty Only relevant for L1 search. A numeric vector determining the
#'   relative penalties or costs for the predictors. A value of `0` means that
#'   those predictors have no cost and will therefore be selected first, whereas
#'   `Inf` means those predictors will never be selected. If `NULL`, then `1` is
#'   used for each predictor.
#' @param lambda_min_ratio Only relevant for L1 search. Ratio between the
#'   smallest and largest lambda in the L1-penalized search. This parameter
#'   essentially determines how long the search is carried out, i.e., how large
#'   submodels are explored. No need to change this unless the program gives a
#'   warning about this.
#' @param nlambda Only relevant for L1 search. Number of values in the lambda
#'   grid for L1-penalized search. No need to change this unless the program
#'   gives a warning about this.
#' @param thresh Only relevant for L1 search. Convergence threshold when
#'   computing the L1 path. Usually, there is no need to change this.
#' @param regul A number giving the amount of ridge regularization when
#'   projecting onto (i.e., fitting) submodels which are GLMs. Usually there is
#'   no need for regularization, but sometimes we need to add some
#'   regularization to avoid numerical problems.
#' @param search_terms Only relevant for forward search. A custom character
#'   vector of predictor term blocks to consider for the search. Section
#'   "Details" below describes more precisely what "predictor term block" means.
#'   The intercept (`"1"`) is always included internally via `union()`, so
#'   there's no difference between including it explicitly or omitting it. The
#'   default `search_terms` considers all the terms in the reference model's
#'   formula.
#' @param verbose A single logical value indicating whether to print out
#'   additional information during the computations.
#' @param seed Pseudorandom number generation (PRNG) seed by which the same
#'   results can be obtained again if needed. Passed to argument `seed` of
#'   [set.seed()], but can also be `NA` to not call [set.seed()] at all. Here,
#'   this seed is used for clustering the reference model's posterior draws (if
#'   `!is.null(nclusters)` or `!is.null(nclusters_pred)`) and for drawing new
#'   group-level effects when predicting from a multilevel submodel (however,
#'   not yet in case of a GAMM).
#' @param ... Arguments passed to [get_refmodel()] as well as to the divergence
#'   minimizer (during a forward search and also during the evaluation part, but
#'   the latter only if `refit_prj` is `TRUE`).
#'
#' @details
#'
#' # Argument `d_test`
#'
#' If not `NULL`, then `d_test` needs to be a `list` with the following
#' elements:
#' * `data`: a `data.frame` containing the predictor variables for the test set.
#' * `offset`: a numeric vector containing the offset values for the test set
#' (if there is no offset, use a vector of zeros).
#' * `weights`: a numeric vector containing the observation weights for the test
#' set (if there are no observation weights, use a vector of ones).
#' * `y`: a numeric vector containing the response values for the test set.
#'
#' @details Arguments `ndraws`, `nclusters`, `nclusters_pred`, and `ndraws_pred`
#'   are automatically truncated at the number of posterior draws in the
#'   reference model (which is `1` for `datafit`s). Using less draws or clusters
#'   in `ndraws`, `nclusters`, `nclusters_pred`, or `ndraws_pred` than posterior
#'   draws in the reference model may result in slightly inaccurate projection
#'   performance. Increasing these arguments affects the computation time
#'   linearly.
#'
#'   For argument `method`, there are some restrictions: For a reference model
#'   with multilevel or additive formula terms, only the forward search is
#'   available. Furthermore, argument `search_terms` requires a forward search
#'   to take effect.
#'
#'   L1 search is faster than forward search, but forward search may be more
#'   accurate. Furthermore, forward search may find a sparser model with
#'   comparable performance to that found by L1 search, but it may also start
#'   overfitting when more predictors are added.
#'
#'   An L1 search may select interaction terms before the corresponding main
#'   terms are selected. If this is undesired, choose the forward search
#'   instead.
#'
#'   The elements of the `search_terms` character vector don't need to be
#'   individual predictor terms. Instead, they can be building blocks consisting
#'   of several predictor terms connected by the `+` symbol. To understand how
#'   these building blocks work, it is important to know how \pkg{projpred}'s
#'   forward search works: It starts with an empty vector `chosen` which will
#'   later contain already selected predictor terms. Then, the search iterates
#'   over model sizes \eqn{j \in \{1, ..., J\}}{j = 1, ..., J}. The candidate
#'   models at model size \eqn{j} are constructed from those elements from
#'   `search_terms` which yield model size \eqn{j} when combined with the
#'   `chosen` predictor terms. Note that sometimes, there may be no candidate
#'   models for model size \eqn{j}. Also note that internally, `search_terms` is
#'   expanded to include the intercept (`"1"`), so the first step of the search
#'   (model size 1) always consists of the intercept-only model as the only
#'   candidate.
#'
#'   As a `search_terms` example, consider a reference model with formula `y ~
#'   x1 + x2 + x3`. Then, to ensure that `x1` is always included in the
#'   candidate models, specify `search_terms = c("x1", "x1 + x2", "x1 + x3",
#'   "x1 + x2 + x3")`. This search would start with `y ~ 1` as the only
#'   candidate at model size 1. At model size 2, `y ~ x1` would be the only
#'   candidate. At model size 3, `y ~ x1 + x2` and `y ~ x1 + x3` would be the
#'   two candidates. At the last model size of 4, `y ~ x1 + x2 + x3` would be
#'   the only candidate. As another example, to exclude `x1` from the search,
#'   specify `search_terms = c("x2", "x3", "x2 + x3")`.
#'
#' @return An object of class `vsel`. The elements of this object are not meant
#'   to be accessed directly but instead via helper functions (see the main
#'   vignette and [projpred-package]).
#'
#' @seealso [cv_varsel()]
#'
#' @examples
#' if (requireNamespace("rstanarm", quietly = TRUE)) {
#'   # Data:
#'   dat_gauss <- data.frame(y = df_gaussian$y, df_gaussian$x)
#'
#'   # The "stanreg" fit which will be used as the reference model (with small
#'   # values for `chains` and `iter`, but only for technical reasons in this
#'   # example; this is not recommended in general):
#'   fit <- rstanarm::stan_glm(
#'     y ~ X1 + X2 + X3 + X4 + X5, family = gaussian(), data = dat_gauss,
#'     QR = TRUE, chains = 2, iter = 500, refresh = 0, seed = 9876
#'   )
#'
#'   # Variable selection (here without cross-validation and with small values
#'   # for `nterms_max`, `nclusters`, and `nclusters_pred`, but only for the
#'   # sake of speed in this example; this is not recommended in general):
#'   vs <- varsel(fit, nterms_max = 3, nclusters = 5, nclusters_pred = 10,
#'                seed = 5555)
#'   # Now see, for example, `?print.vsel`, `?plot.vsel`, `?suggest_size.vsel`,
#'   # and `?solution_terms.vsel` for possible post-processing functions.
#' }
#'
#' @export
varsel <- function(object, ...) {
  UseMethod("varsel")
}

#' @rdname varsel
#' @export
varsel.default <- function(object, ...) {
  refmodel <- get_refmodel(object, ...)
  return(varsel(refmodel, ...))
}

#' @rdname varsel
#' @export
varsel.refmodel <- function(object, d_test = NULL, method = NULL,
                            ndraws = NULL, nclusters = 20, ndraws_pred = 400,
                            nclusters_pred = NULL,
                            refit_prj = !inherits(object, "datafit"),
                            nterms_max = NULL, verbose = TRUE,
                            lambda_min_ratio = 1e-5, nlambda = 150,
                            thresh = 1e-6, regul = 1e-4, penalty = NULL,
                            search_terms = NULL,
                            seed = sample.int(.Machine$integer.max, 1), ...) {
  # Set seed, but ensure the old RNG state is restored on exit:
  if (exists(".Random.seed", envir = .GlobalEnv)) {
    rng_state_old <- get(".Random.seed", envir = .GlobalEnv)
    on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv))
  }
  if (!is.na(seed)) set.seed(seed)

  refmodel <- object

  ## fetch the default arguments or replace them by the user defined values
  args <- parse_args_varsel(
    refmodel = refmodel, method = method, refit_prj = refit_prj,
    nterms_max = nterms_max, nclusters = nclusters, search_terms = search_terms
  )
  method <- args$method
  refit_prj <- args$refit_prj
  nterms_max <- args$nterms_max
  nclusters <- args$nclusters
  search_terms <- args$search_terms

  if (is.null(d_test)) {
    d_test <- list(type = "train", data = NULL, offset = refmodel$offset,
                   weights = refmodel$wobs, y = refmodel$y)
  } else {
    d_test$type <- "test"
    d_test <- d_test[nms_d_test()]
  }

  ## reference distributions for selection and prediction after selection
  p_sel <- .get_refdist(refmodel, ndraws, nclusters)
  p_pred <- .get_refdist(refmodel, ndraws_pred, nclusters_pred)

  ## perform the selection
  opt <- nlist(lambda_min_ratio, nlambda, thresh, regul)
  search_path <- select(
    method = method, p_sel = p_sel, refmodel = refmodel,
    nterms_max = nterms_max, penalty = penalty, verbose = verbose, opt = opt,
    search_terms = search_terms, ...
  )

  ## statistics for the selected submodels
  submodels <- .get_submodels(
    search_path = search_path,
    nterms = c(0, seq_along(search_path$solution_terms)),
    p_ref = p_pred, refmodel = refmodel, regul = regul, refit_prj = refit_prj,
    ...
  )
  sub <- .get_sub_summaries(submodels = submodels,
                            refmodel = refmodel,
                            test_points = NULL,
                            newdata = d_test$data,
                            offset = d_test$offset,
                            wobs = d_test$weights,
                            y = d_test$y)

  ## predictive statistics of the reference model on test data. if no test data
  ## are provided,
  ## simply fetch the statistics on the train data
  if (inherits(refmodel, "datafit")) {
    ## no actual reference model, so we don't know how to predict test
    ## observations
    nobs_test <- nrow(d_test$data %||% refmodel$fetch_data())
    ref <- list(mu = rep(NA, nobs_test), lppd = rep(NA, nobs_test))
  } else {
    if (d_test$type == "train") {
      mu_test <- refmodel$mu
      if (!all(refmodel$offset == 0)) {
        mu_test <- refmodel$family$linkinv(
          refmodel$family$linkfun(mu_test) + refmodel$offset
        )
      }
    } else {
      newdata_for_ref <- d_test$data
      if (inherits(refmodel$fit, "stanreg") &&
          length(refmodel$fit$offset) > 0) {
        if ("projpred_internal_offs_stanreg" %in% names(newdata_for_ref)) {
          stop("Need to write to column `projpred_internal_offs_stanreg` of ",
               "`d_test$data`, but that column already exists. Please rename ",
               "this column in `d_test$data` and try again.")
        }
        newdata_for_ref$projpred_internal_offs_stanreg <- d_test$offset
      }
      mu_test <- refmodel$family$linkinv(
        refmodel$ref_predfun(refmodel$fit, newdata = newdata_for_ref) +
          d_test$offset
      )
    }
    ref <- .weighted_summary_means(
      y_test = d_test, family = refmodel$family, wsample = refmodel$wsample,
      mu = mu_test, dis = refmodel$dis
    )
  }

  ## store the relevant fields into the object to be returned
  vs <- nlist(
    refmodel,
    search_path,
    d_test,
    summaries = nlist(sub, ref),
    solution_terms = search_path$solution_terms,
    ce = sapply(submodels, "[[", "ce"),
    nterms_max,
    nterms_all = count_terms_in_formula(refmodel$formula),
    method = method,
    cv_method = NULL,
    validate_search = NULL,
    clust_used_search = p_sel$clust_used,
    clust_used_eval = p_pred$clust_used,
    nprjdraws_search = NCOL(p_sel$mu),
    nprjdraws_eval = NCOL(p_pred$mu)
  )
  ## suggest model size
  class(vs) <- "vsel"
  vs$suggested_size <- suggest_size(vs, warnings = FALSE)
  summary <- summary(vs)
  vs$summary <- summary$selection

  return(vs)
}

select <- function(method, p_sel, refmodel, nterms_max, penalty, verbose, opt,
                   search_terms = NULL, ...) {
  ##
  ## Auxiliary function, performs variable selection with the given method,
  ## and returns the search_path, i.e., a list with the followint entries (the
  ## last three
  ## are returned only if one cluster projection is used for selection):
  ##   solution_terms: the variable ordering
  ##   beta: coefficients along the solution path
  ##   alpha: intercepts along the solution path
  ##   p_sel: the reference distribution used in the selection (the input
  ##   argument p_sel)
  ##
  ## routine that can be used with several clusters
  if (method == "l1") {
    search_path <- search_L1(p_sel, refmodel, nterms_max - refmodel$intercept,
                             penalty, opt)
    search_path$p_sel <- p_sel
    return(search_path)
  } else if (method == "forward") {
    search_path <- search_forward(p_sel, refmodel, nterms_max, verbose, opt,
                                  search_terms = search_terms, ...)
    search_path$p_sel <- p_sel
    return(search_path)
  }
}

## Auxiliary function for parsing the input arguments for varsel.
## The arguments specified by the user (or the function calling this function)
## are treated as they are, but if some are not given, then this function
## fills them in with the default values. The purpose of this function is to
## avoid repeating the same code both in varsel and cv_varsel.
parse_args_varsel <- function(refmodel, method, refit_prj, nterms_max,
                              nclusters, search_terms) {
  search_terms_was_null <- is.null(search_terms)
  if (search_terms_was_null) {
    search_terms <- split_formula(refmodel$formula,
                                  data = refmodel$fetch_data())
  }
  search_terms <- union("1", search_terms)
  has_group_features <- formula_contains_group_terms(refmodel$formula)
  has_additive_features <- formula_contains_additive_terms(refmodel$formula)

  if (is.null(method)) {
    if (has_group_features || has_additive_features || !search_terms_was_null) {
      method <- "forward"
    } else {
      method <- "l1"
    }
  } else {
    method <- tolower(method)
    if (method == "l1") {
      if (has_group_features || has_additive_features) {
        stop("L1 search is only supported for reference models without ",
             "multilevel and without additive (\"smoothing\") terms.")
      }
      if (!search_terms_was_null) {
        warning("Argument `search_terms` only takes effect if ",
                "`method = \"forward\"`.")
      }
    }
  }

  if (!(method %in% c("l1", "forward"))) {
    stop("Unknown search method")
  }

  stopifnot(!is.null(refit_prj))
  if (refit_prj && inherits(refmodel, "datafit")) {
    warning("For an `object` of class \"datafit\", `refit_prj` is ",
            "automatically set to `FALSE`.")
    refit_prj <- FALSE
  }

  if (method == "l1") {
    nclusters <- 1
  }

  search_terms_unq <- unique(unlist(
    strsplit(search_terms, split = "+", fixed = TRUE)
  ))
  max_nv_possible <- count_terms_chosen(search_terms_unq, duplicates = TRUE)
  if (is.null(nterms_max)) {
    nterms_max <- 19
  }
  nterms_max <- min(max_nv_possible, nterms_max + refmodel$intercept)

  return(nlist(method, refit_prj, nterms_max, nclusters, search_terms))
}