File: nest.R

package info (click to toggle)
r-cran-rsample 0.0.8-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 1,696 kB
  • sloc: sh: 13; makefile: 2
file content (101 lines) | stat: -rw-r--r-- 3,487 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#' Nested or Double Resampling
#'
#' `nested_cv` can be used to take the results of one resampling procedure
#'   and conduct further resamples within each split. Any type of resampling
#'   used in `rsample` can be used.
#'
#' @details
#' It is a bad idea to use bootstrapping as the outer resampling procedure (see
#'   the example below)
#' @param data  A data frame.
#' @param outside The initial resampling specification. This can be an already
#'   created object or an expression of a new object (see the examples below).
#'   If the latter is used, the `data` argument does not need to be
#'   specified and, if it is given, will be ignored.
#' @param inside An expression for the type of resampling to be conducted
#'   within the initial procedure.
#' @return  An tibble with `nested_cv` class and any other classes that
#'   outer resampling process normally contains. The results include a
#'  column for the outer data split objects, one or more `id` columns,
#'  and a column of nested tibbles called `inner_resamples` with the
#'  additional resamples.
#' @examples
#' ## Using expressions for the resampling procedures:
#' nested_cv(mtcars, outside = vfold_cv(v = 3), inside = bootstraps(times = 5))
#'
#' ## Using an existing object:
#' folds <- vfold_cv(mtcars)
#' nested_cv(mtcars, folds, inside = bootstraps(times = 5))
#'
#' ## The dangers of outer bootstraps:
#' set.seed(2222)
#' bad_idea <- nested_cv(mtcars,
#'                       outside = bootstraps(times = 5),
#'                       inside = vfold_cv(v = 3))
#'
#' first_outer_split <- bad_idea$splits[[1]]
#' outer_analysis <- as.data.frame(first_outer_split)
#' sum(grepl("Volvo 142E", rownames(outer_analysis)))
#'
#' ## For the 3-fold CV used inside of each bootstrap, how are the replicated
#' ## `Volvo 142E` data partitioned?
#' first_inner_split <- bad_idea$inner_resamples[[1]]$splits[[1]]
#' inner_analysis <- as.data.frame(first_inner_split)
#' inner_assess   <- as.data.frame(first_inner_split, data = "assessment")
#'
#' sum(grepl("Volvo 142E", rownames(inner_analysis)))
#' sum(grepl("Volvo 142E", rownames(inner_assess)))
#' @export
nested_cv <- function(data, outside, inside)  {
  nest_args <- formalArgs(nested_cv)
  cl <- match.call()

  boot_msg <-
    paste0(
      "Using bootstrapping as the outer resample is dangerous ",
      "since the inner resample might have the same data ",
      "point in both the analysis and assessment set."
    )

  outer_cl <- cl[["outside"]]
  if (is_call(outer_cl)) {
    if (grepl("^bootstraps", deparse(outer_cl)))
      warning(boot_msg, call. = FALSE)
    outer_cl$data <- quote(data)
    outside <- eval(outer_cl)
  } else {
    if (inherits(outside, "bootstraps"))
      warning(boot_msg, call. = FALSE)
  }

  inner_cl <- cl[["inside"]]
  if (!is_call(inner_cl))
    stop(
      "`inside` should be a expression such as `vfold()` or ",
      "bootstraps(times = 10)` instead of a existing object.",
      call. = FALSE
    )
  inside <- map(outside$splits, inside_resample, cl = inner_cl)

  out <- dplyr::mutate(outside, inner_resamples = inside)

  out <- add_class(out, cls = "nested_cv", at_end = FALSE)

  attr(out, "outside") <- cl$outside
  attr(out, "inside") <- cl$inside

  out
}

inside_resample <- function(src, cl) {
  cl$data <- quote(as.data.frame(src))
  eval(cl)
}

#' @export
print.nested_cv <- function(x, ...) {
  char_x <- paste("#", pretty(x))
  cat(char_x, sep = "\n")
  class(x) <- class(tibble())
  print(x, ...)
}