File: ResampleDesc.R

package info (click to toggle)
r-cran-mlr 2.19.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 8,264 kB
  • sloc: ansic: 65; sh: 13; makefile: 5
file content (305 lines) | stat: -rw-r--r-- 12,793 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#' @title Create a description object for a resampling strategy.
#'
#' @description
#' A description of a resampling algorithm contains all necessary information to
#' create a [ResampleInstance], when given the size of the data set.
#'
#' @details
#' Some notes on some special strategies:
#' \describe{
#' \item{Repeated cross-validation}{Use \dQuote{RepCV}. Then you have to set the aggregation function
#'   for your preferred performance measure to \dQuote{testgroup.mean}
#'   via [setAggregation].}
#' \item{B632 bootstrap}{Use \dQuote{Bootstrap} for bootstrap and set predict to \dQuote{both}.
#'   Then you have to set the aggregation function for your preferred performance measure to
#'   \dQuote{b632} via [setAggregation].}
#' \item{B632+ bootstrap}{Use \dQuote{Bootstrap} for bootstrap and set predict to \dQuote{both}.
#'   Then you have to set the aggregation function for your preferred performance measure to
#'   \dQuote{b632plus} via [setAggregation].}
#' \item{Fixed Holdout set}{Use [makeFixedHoldoutInstance].}
#' }
#'
#' Object slots:
#' \describe{
#' \item{id (`character(1)`)}{Name of resampling strategy.}
#' \item{iters (`integer(1)`)}{Number of iterations. Note that this is always the complete number
#'   of generated train/test sets, so for a 10-times repeated 5fold cross-validation it would be 50.}
#' \item{predict (`character(1)`)}{See argument.}
#' \item{stratify (`logical(1)`)}{See argument.}
#' \item{All parameters passed in ... under the respective argument name}{See arguments.}
#' }
#'
#' @param method (`character(1)`)\cr
#'   \dQuote{CV} for cross-validation, \dQuote{LOO} for leave-one-out, \dQuote{RepCV} for
#'   repeated cross-validation, \dQuote{Bootstrap} for out-of-bag bootstrap, \dQuote{Subsample} for
#'   subsampling, \dQuote{Holdout} for holdout, \dQuote{GrowingWindowCV} for growing window
#'   cross-validation, \dQuote{FixedWindowCV} for fixed window cross validation.
#' @param predict (`character(1)`)\cr
#'   What to predict during resampling: \dQuote{train}, \dQuote{test} or \dQuote{both} sets.
#'   Default is \dQuote{test}.
#' @param ... (any)\cr
#'   Further parameters for strategies.\cr
#'   \describe{
#'   \item{iters (`integer(1)`)}{Number of iterations, for \dQuote{CV}, \dQuote{Subsample}
#'     and \dQuote{Bootstrap}.}
#'   \item{split (`numeric(1)`)}{Proportion of training cases for \dQuote{Holdout} and
#'     \dQuote{Subsample} between 0 and 1. Default is 2 / 3.}
#'   \item{reps (`integer(1)`)}{Repeats for \dQuote{RepCV}. Here `iters = folds * reps`.
#'     Default is 10.}
#'   \item{folds (`integer(1)`)}{Folds in the repeated CV for `RepCV`.
#'     Here `iters = folds * reps`. Default is 10.}
#'   \item{horizon (`numeric(1)`)}{Number of observations in the forecast test set for \dQuote{GrowingWindowCV}
#'    and \dQuote{FixedWindowCV}. When `horizon > 1` this will be treated as the number of
#'    observations to forecast, else it will be a fraction of the initial window. IE,
#'    for 100 observations, initial window of .5, and horizon of .2, the test set will have
#'    10 observations. Default is 1.}
#'   \item{initial.window (`numeric(1)`)}{Fraction of observations to start with
#'    in the training set for \dQuote{GrowingWindowCV} and \dQuote{FixedWindowCV}.
#'    When `initial.window > 1` this will be treated as the number of
#'    observations in the initial window, else it will be treated as the fraction
#'    of observations to have in the initial window. Default is 0.5.}
#'   \item{skip (`numeric(1)`)}{ How many resamples to skip to thin the total amount
#'    for \dQuote{GrowingWindowCV} and \dQuote{FixedWindowCV}. This is passed through as the \dQuote{by} argument
#'    in `seq()`. When `skip > 1` this will be treated as the increment of the sequence of resampling indices,
#'     else it will be a fraction of the total training indices. IE for 100 training sets and a value of .2, the increment
#'     of the resampling indices will be 20. Default is \dQuote{horizon} which gives mutually exclusive chunks
#'      of test indices.}
#'   }
#' @param fixed (`logical(1)`)\cr
#'   Whether indices supplied via argument 'blocking' in the task should be used as
#'   fully pre-defined indices. Default is `FALSE` which means
#'   they will be used following the 'blocking' approach.
#'   `fixed` only works with ResampleDesc `CV` and the supplied indices must match
#'   the number of observations. When `fixed = TRUE`, the `iters` argument will be ignored
#'   and is internally set to the number of supplied factor levels in `blocking`.
#' @param blocking.cv (`logical(1)`)\cr
#'   Should 'blocking' be used in `CV`? Defaults to `FALSE`.
#'   This is different to `fixed = TRUE` and cannot be combined. Please check the mlr online tutorial
#'   for more details.
#' @param stratify (`logical(1)`)\cr
#'   Should stratification be done for the target variable?
#'   For classification tasks, this means that the resampling strategy is applied to all classes
#'   individually and the resulting index sets are joined to make sure that the proportion of
#'   observations in each training set is as in the original data set. Useful for imbalanced class sizes.
#'   For survival tasks stratification is done on the events, resulting in training sets with comparable
#'   censoring rates.
#' @param stratify.cols ([character])\cr
#'   Stratify on specific columns referenced by name. All columns have to be factor or integer.
#'   Note that you have to ensure yourself that stratification is possible, i.e.
#'   that each strata contains enough observations.
#'   This argument and `stratify` are mutually exclusive.
#' @return ([ResampleDesc]).
#' @family resample
#' @export
#' @aliases ResampleDesc
#' @examples
#' # Bootstrapping
#' makeResampleDesc("Bootstrap", iters = 10)
#' makeResampleDesc("Bootstrap", iters = 10, predict = "both")
#'
#' # Subsampling
#' makeResampleDesc("Subsample", iters = 10, split = 3 / 4)
#' makeResampleDesc("Subsample", iters = 10)
#'
#' # Holdout a.k.a. test sample estimation
#' makeResampleDesc("Holdout")
makeResampleDesc = function(method, predict = "test", ..., stratify = FALSE,
  stratify.cols = NULL, fixed = FALSE, blocking.cv = FALSE) {

  # Every supported strategy; each maps to an internal constructor
  # named makeResampleDesc<method> below.
  supported.methods = c("Holdout", "CV", "LOO", "RepCV",
    "Subsample", "Bootstrap", "SpCV", "SpRepCV",
    "GrowingWindowCV", "FixedWindowCV")
  assertChoice(method, choices = supported.methods)
  assertChoice(predict, choices = c("train", "test", "both"))
  assertFlag(stratify)
  if (stratify) {
    # LOO has single-observation test sets, so class proportions cannot be kept.
    if (method == "LOO") {
      stop("Stratification cannot be done for LOO!")
    }
    if (!is.null(stratify.cols)) {
      stop("Arguments 'stratify' and 'stratify.cols' are mutually exclusive!")
    }
  }
  # Dispatch to the strategy-specific constructor, forwarding the extra
  # parameters (iters, split, folds, reps, ...), then attach the common slots.
  constructor.name = stri_paste("makeResampleDesc", method)
  desc = do.call(constructor.name, list(...))
  desc$predict = predict
  desc$stratify = stratify
  desc$stratify.cols = stratify.cols
  desc$fixed = fixed
  desc$blocking.cv = blocking.cv
  # Prepend e.g. "CVDesc" so strategy-specific S3 methods (print, instance
  # makers) can dispatch on the concrete strategy.
  addClasses(desc, stri_paste(method, "Desc"))
}


# Low-level constructor shared by all strategy-specific constructors below:
# merges the strategy parameters in `...` with the common slots (id, iters,
# predict) and tags the resulting list with class "ResampleDesc".
makeResampleDescInternal = function(id, iters, predict = "test", ...) {
  slots = list(id = id, iters = iters, predict = predict)
  desc = insert(list(...), slots)
  setClasses(desc, "ResampleDesc")
}

#' @export
print.ResampleDesc = function(x, ...) {
  # Fallback printer for any resampling description; the strategy-specific
  # subclasses (HoldoutDesc, SubsampleDesc, RepCVDesc, ...) shadow this with
  # more detailed output. `...` matches the print() generic but is unused.
  catf("Resample description: %s with %i iterations.", x$id, x$iters)
  catf("Predict: %s", x$predict)
  catf("Stratification: %s", x$stratify)
}

##############################################################################################
# all following constructors are only called INTERNALLY in makeResampleDesc
# note that stuff like the stratify flag are set in that super-constructor.
# the methods cannot be directly exported like this!
# FIXME: the code style is not so good here, see issue 187.
##############################################################################################

# Holdout: one single train/test split. The `iters` parameter exists only so
# the constructor signature is uniform with the other strategies; it is
# ignored and the iteration count is hard-wired to 1.
makeResampleDescHoldout = function(iters, split = 2 / 3) {
  # `split` is the fraction of observations used for training.
  assertNumber(split, lower = 0, upper = 1)
  makeResampleDescInternal(id = "holdout", iters = 1L, split = split)
}

# k-fold cross-validation; `fixed` / `blocking.cv` control how a task's
# 'blocking' factor is interpreted (see makeResampleDesc docs).
makeResampleDescCV = function(iters = 10L, fixed = FALSE, blocking.cv = FALSE) {
  # At least 2 folds are required for a meaningful CV.
  iters = asInt(iters, lower = 2L)
  makeResampleDescInternal(id = "cross-validation", iters = iters,
    fixed = fixed, blocking.cv = blocking.cv)
}

# Spatial k-fold cross-validation; fold construction happens when the
# instance is created, only the fold count is fixed here.
makeResampleDescSpCV = function(iters = 10L) {
  iters = asInt(iters, lower = 2L)
  makeResampleDescInternal(id = "spatial cross-validation", iters = iters)
}

# Leave-one-out: the iteration count equals the data set size, which is not
# known at description time, so `iters` stays NA until instantiation.
makeResampleDescLOO = function() {
  makeResampleDescInternal(id = "LOO", iters = NA_integer_)
}

# Subsampling: repeated random train/test splits with training fraction
# `split` (a.k.a. Monte-Carlo cross-validation).
makeResampleDescSubsample = function(iters = 30L, split = 2 / 3) {
  iters = asCount(iters, positive = TRUE)
  assertNumber(split, lower = 0, upper = 1)
  makeResampleDescInternal(id = "subsampling", iters = iters, split = split)
}

# Out-of-bag bootstrap: each iteration trains on a bootstrap sample and
# evaluates on the observations left out of it.
makeResampleDescBootstrap = function(iters = 30L) {
  iters = asCount(iters, positive = TRUE)
  makeResampleDescInternal(id = "OOB bootstrapping", iters = iters)
}

# Repeated CV: `reps` independent rounds of `folds`-fold cross-validation,
# so the total number of train/test iterations is folds * reps. Both `folds`
# and `reps` are stored on the description for later use (e.g. printing).
makeResampleDescRepCV = function(reps = 10L, folds = 10L, fixed = FALSE, blocking.cv = FALSE) {
  reps = asInt(reps, lower = 2L)
  folds = asInt(folds, lower = 2L)
  makeResampleDescInternal(id = "repeated cross-validation",
    iters = folds * reps, folds = folds, reps = reps,
    fixed = fixed, blocking.cv = blocking.cv)
}

# Repeated spatial CV: like RepCV, with iters = folds * reps.
makeResampleDescSpRepCV = function(reps = 10L, folds = 10L) {
  reps = asInt(reps, lower = 2L)
  folds = asInt(folds, lower = 2L)
  makeResampleDescInternal(id = "repeated spatial cross-validation",
    iters = folds * reps, folds = folds, reps = reps)
}


# Fixed-window CV for ordered (time-series) data. The number of iterations
# depends on the data size, so `iters` is NA here; stratification makes no
# sense for window resampling and is fixed to FALSE.
makeResampleDescFixedWindowCV = function(horizon = 1L, initial.window = .5, skip = horizon - 1) {
  # Values <= 1 are treated as fractions, > 1 as absolute counts
  # (resolved when the instance is built).
  assertNumeric(horizon, lower = 0)
  assertNumeric(initial.window, lower = 0)
  assertNumeric(skip, lower = 0)
  makeResampleDescInternal(id = "Fixed", iters = NA_integer_,
    horizon = horizon, initial.window = initial.window, skip = skip,
    stratify = FALSE)
}

# Growing-window CV for ordered (time-series) data: like FixedWindowCV, but
# the training window expands instead of sliding. `iters` is NA until the
# instance is built; stratification is fixed to FALSE.
makeResampleDescGrowingWindowCV = function(horizon = 1L, initial.window = .5, skip = horizon - 1) {
  # Values <= 1 are treated as fractions, > 1 as absolute counts
  # (resolved when the instance is built).
  assertNumeric(horizon, lower = 0)
  assertNumeric(initial.window, lower = 0)
  assertNumeric(skip, lower = 0)
  makeResampleDescInternal(id = "Growing", iters = NA_integer_,
    horizon = horizon, initial.window = initial.window, skip = skip,
    stratify = FALSE)
}

##############################################################################################

#' @export
print.HoldoutDesc = function(x, ...) {
  # Holdout always has exactly one iteration, so report the split rate
  # instead of the iteration count.
  catf("Resample description: %s with %.2f split rate.",
    x$id, x$split)
  catf("Predict: %s", x$predict)
  catf("Stratification: %s", x$stratify)
}

#' @export
print.SubsampleDesc = function(x, ...) {
  # Subsampling is parameterized by both iteration count and split rate,
  # so show both.
  catf("Resample description: %s with %i iterations and %.2f split rate.",
    x$id, x$iters, x$split)
  catf("Predict: %s", x$predict)
  catf("Stratification: %s", x$stratify)
}

#' @export
print.RepCVDesc = function(x, ...) {
  # Use the stored x$folds directly instead of computing x$iters / x$reps:
  # integer division in R yields a double, and sprintf()'s "%i" format
  # (used by catf) errors on doubles ("invalid format '%i'; use format
  # %f, %e, %g or %a for numeric objects"). makeResampleDescRepCV always
  # stores `folds`, so this is both safe and equivalent.
  catf("Resample description: %s with %i iterations: %i folds and %i reps.",
    x$id, x$iters, x$folds, x$reps)
  catf("Predict: %s", x$predict)
  catf("Stratification: %s", x$stratify)
}

#' @export
print.GrowingWindowCVDesc = function(x, ...) {
  # iters is NA until the description is instantiated on a data set, so the
  # window parameters (initial.window, horizon, skip) are reported instead.
  catf("Window description:\n %s: %.2f in initial window, horizon of %.2f, and skipping %.2f windows.",
    x$id, x$initial.window, x$horizon, x$skip)
  catf("Predict: %s", x$predict)
  catf("Stratification: %s", x$stratify)
}

#' @export
print.FixedWindowCVDesc = function(x, ...) {
  # Same output shape as print.GrowingWindowCVDesc: iters is NA until
  # instantiation, so the window parameters are reported instead.
  catf("Window description:\n %s: %.2f in initial window, horizon of %.2f, and skipping %.2f windows.",
    x$id, x$initial.window, x$horizon, x$skip)
  catf("Predict: %s", x$predict)
  catf("Stratification: %s", x$stratify)
}

##############################################################################################
# Resample Convenience Objects, like cv10
##############################################################################################
# These are plain ResampleDesc objects created by calling makeResampleDesc()
# at package build time, so users can write e.g. resample(lrn, task, cv10)
# without constructing a description themselves. All objects share the
# makeResampleDesc help page via @rdname; the tags set to NULL
# (@usage/@docType/@format/@keywords) suppress the default data-object docs.

#' @rdname makeResampleDesc
#' @section Standard ResampleDesc objects:
#' For common resampling strategies you can save some typing
#' by using the following description objects:
#' \describe{
#' \item{hout}{holdout a.k.a. test sample estimation
#' (two-thirds training set, one-third testing set)}
#' \item{cv2}{2-fold cross-validation}
#' \item{cv3}{3-fold cross-validation}
#' \item{cv5}{5-fold cross-validation}
#' \item{cv10}{10-fold cross-validation}
#' }
#' @export
#' @usage NULL
#' @docType NULL
#' @format NULL
#' @keywords NULL
hout = makeResampleDesc("Holdout")

#' @rdname makeResampleDesc
#' @export
#' @usage NULL
#' @docType NULL
#' @format NULL
#' @keywords NULL
cv2 = makeResampleDesc("CV", iters = 2L)

#' @rdname makeResampleDesc
#' @export
#' @usage NULL
#' @docType NULL
#' @format NULL
#' @keywords NULL
cv3 = makeResampleDesc("CV", iters = 3L)

#' @rdname makeResampleDesc
#' @export
#' @usage NULL
#' @docType NULL
#' @format NULL
#' @keywords NULL
cv5 = makeResampleDesc("CV", iters = 5L)

#' @rdname makeResampleDesc
#' @export
#' @usage NULL
#' @docType NULL
#' @format NULL
#' @keywords NULL
cv10 = makeResampleDesc("CV", iters = 10L)