File: trainControl.R

#' Control parameters for train
#'
#' Control the computational nuances of the \code{\link{train}} function
#'
#' When setting the seeds manually, the number of models being evaluated is
#' required. This may not be obvious as \code{train} does some optimizations
#' for certain models. For example, when tuning over a PLS model, the only
#' model that is fit is the one with the largest number of components. So if
#' the model is being tuned over \code{ncomp in 1:10}, the only model fit is
#' the one with \code{ncomp = 10}. However, if the vector of integers used in
#' the \code{seeds} argument is longer than actually needed, no error is
#' thrown.
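#'
#' A minimal sketch of sizing the \code{seeds} list (the resample and model
#' counts here are illustrative assumptions):
#'
#' \preformatted{
#' ## 10-fold CV evaluating 6 candidate models per resample: the first 10
#' ## list elements each need at least 6 integers; the last element seeds
#' ## the final model fit.
#' set.seed(42)
#' seeds <- vector(mode = "list", length = 11)
#' for (i in 1:10) seeds[[i]] <- sample.int(1000, 6)
#' seeds[[11]] <- sample.int(1000, 1)
#' ctrl <- trainControl(method = "cv", number = 10, seeds = seeds)
#' }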
#'
#' Using \code{method = "none"} and specifying more than one model in
#' \code{\link{train}}'s \code{tuneGrid} or \code{tuneLength} arguments will
#' result in an error.
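#'
#' A minimal sketch using the built-in \code{iris} data (object names are
#' illustrative): with \code{method = "none"}, \code{tuneGrid} must contain
#' exactly one candidate.
#'
#' \preformatted{
#' ctrl_none <- trainControl(method = "none")
#' fit <- train(Species ~ ., data = iris,
#'              method = "knn",
#'              tuneGrid = data.frame(k = 5),
#'              trControl = ctrl_none)
#' }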
#'
#' When adaptive resampling is used (\code{method} is \code{"adaptive_cv"},
#' \code{"adaptive_boot"} or \code{"adaptive_LGOCV"}), the full set of
#' resamples is not run for each model. As resampling continues, a futility
#' analysis is conducted and models with a low probability of being optimal
#' are removed. These features are experimental. See Kuhn (2014) for more
#' details. The options for this procedure (illustrated in the sketch after
#' the list) are:
#'
#' \itemize{
#'   \item \code{min}: the minimum number of resamples used before models
#'     are removed
#'   \item \code{alpha}: the confidence level of the one-sided intervals
#'     used to measure futility
#'   \item \code{method}: either generalized least squares (\code{method =
#'     "gls"}) or a Bradley-Terry model (\code{method = "BT"})
#'   \item \code{complete}: if a single parameter value is found before the
#'     end of resampling, should the full set of resamples be computed for
#'     that parameter?
#' }
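#'
#' A minimal sketch of adaptive resampling, spelling out the default
#' \code{adaptive} options (the object name is illustrative):
#'
#' \preformatted{
#' ctrl_adapt <- trainControl(method = "adaptive_cv",
#'                            repeats = 5,
#'                            adaptive = list(min = 5,
#'                                            alpha = 0.05,
#'                                            method = "gls",
#'                                            complete = TRUE))
#' }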
#'
#' The option \code{search = "grid"} uses the default grid search routine. When
#' \code{search = "random"}, a random search procedure is used (Bergstra and
#' Bengio, 2012). See \url{http://topepo.github.io/caret/random-hyperparameter-search.html} for
#' details and an example.
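#'
#' A minimal sketch of random search (using the built-in \code{iris} data;
#' \code{tuneLength} sets how many random parameter combinations are
#' evaluated):
#'
#' \preformatted{
#' ctrl_rand <- trainControl(method = "cv", number = 10, search = "random")
#' fit_rand <- train(Species ~ ., data = iris,
#'                   method = "knn",
#'                   tuneLength = 8,
#'                   trControl = ctrl_rand)
#' }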
#'
#' The supported bootstrap methods are:
#'
#' \itemize{
#'   \item \code{"boot"}: the usual bootstrap.
#'   \item \code{"boot632"}: the 0.632 bootstrap estimator (Efron, 1983).
#'   \item \code{"optimism_boot"}: the optimism bootstrap estimator.
#'     (Efron and Tibshirani, 1994).
#'   \item \code{"boot_all"}: all of the above (for efficiency,
#'     but "boot" will be used for calculations).
#' }
#'
#' The \code{"boot632"} method should not to be confused with the 0.632+
#' estimator proposed later by the same author.
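#'
#' A minimal sketch requesting the 0.632 bootstrap estimator with 50
#' resamples (the object name is illustrative):
#'
#' \preformatted{
#' ctrl_632 <- trainControl(method = "boot632", number = 50)
#' }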
#'
#' Note that if \code{index} or \code{indexOut} is specified, the label shown
#' by \code{train} may not be accurate since these arguments supersede the
#' \code{method} argument.
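#'
#' A minimal sketch of user-defined resamples via \code{index}, built with
#' \code{\link{createFolds}} on the built-in \code{iris} data; the held-out
#' rows are derived automatically because \code{indexOut} is left \code{NULL}:
#'
#' \preformatted{
#' folds <- createFolds(iris[["Species"]], k = 10, returnTrain = TRUE)
#' ctrl_idx <- trainControl(method = "cv", index = folds)
#' }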
#'
#' @param method The resampling method: \code{"boot"}, \code{"boot632"},
#' \code{"optimism_boot"}, \code{"boot_all"},
#' \code{"cv"}, \code{"repeatedcv"}, \code{"LOOCV"}, \code{"LGOCV"} (for
#' repeated training/test splits), \code{"none"} (only fits one model to the
#' entire training set), \code{"oob"} (only for random forest, bagged trees,
#' bagged earth, bagged flexible discriminant analysis, or conditional tree
#' forest models), \code{"timeslice"}, \code{"adaptive_cv"}, \code{"adaptive_boot"} or
#' \code{"adaptive_LGOCV"}
#' @param number Either the number of folds or number of resampling iterations
#' @param repeats For repeated k-fold cross-validation only: the number of
#' complete sets of folds to compute
#' @param verboseIter A logical for printing a training log.
#' @param returnData A logical for saving the data
#' @param returnResamp A character string indicating how much of the resampled
#' summary metrics should be saved. Values can be \code{"final"}, \code{"all"}
#' or \code{"none"}
#' @param savePredictions an indicator of how much of the hold-out predictions
#' for each resample should be saved. Values can be either \code{"all"},
#' \code{"final"}, or \code{"none"}. A logical value can also be used; it is
#' converted to \code{"all"} (for \code{TRUE}) or \code{"none"} (for
#' \code{FALSE}). \code{"final"} saves the predictions for the optimal tuning
#' parameters.
#' @param p For leave-group out cross-validation: the training percentage
#' @param search Either \code{"grid"} or \code{"random"}, describing how the
#' tuning parameter grid is determined. See details below.
#' @param initialWindow,horizon,fixedWindow,skip possible arguments to
#' \code{\link{createTimeSlices}} when \code{method} is \code{"timeslice"}.
#' @param classProbs a logical; should class probabilities be computed for
#' classification models (along with predicted values) in each resample?
#' @param summaryFunction a function to compute performance metrics across
#' resamples. The arguments to the function should be the same as those in
#' \code{\link{defaultSummary}}. Note that if \code{method = "oob"} is used,
#' this option is ignored and a warning is issued.
#' @param selectionFunction the function used to select the optimal tuning
#' parameter. This can be a name of the function or the function itself. See
#' \code{\link{best}} for details and other options.
#' @param preProcOptions A list of options to pass to \code{\link{preProcess}}.
#' The type of pre-processing (e.g. center, scaling etc) is passed in via the
#' \code{preProc} option in \code{\link{train}}.
#' @param sampling a single character value describing the type of additional
#' sampling that is conducted after resampling (usually to resolve class
#' imbalances). Values are \code{"none"}, \code{"down"}, \code{"up"},
#' \code{"smote"}, or \code{"rose"}. The latter two values require the
#' \pkg{themis} and \pkg{ROSE} packages, respectively. This argument can also
#' be a list to facilitate custom sampling; those details can be found on the
#' \pkg{caret} package website for sampling (link below).
#' @param index a list with elements for each resampling iteration. Each list
#' element is a vector of integers corresponding to the rows used for training
#' at that iteration.
#' @param indexOut a list (the same length as \code{index}) that dictates which
#' data are held-out for each resample (as integers). If \code{NULL}, then the
#' unique set of samples not contained in \code{index} is used.
#' @param indexFinal an optional vector of integers indicating which samples
#' are used to fit the final model after resampling. If \code{NULL}, then the
#' entire data set is used.
#' @param timingSamps the number of training set samples that will be used to
#' measure the time for predicting samples (zero indicates that the prediction
#' time should not be estimated).
#' @param predictionBounds a logical or numeric vector of length 2 (regression
#' only). If logical, the predictions can be constrained to be within the limit
#' of the training set outcomes. For example, a value of \code{c(TRUE, FALSE)}
#' would only constrain the lower end of predictions. If numeric, specific
#' bounds can be used. For example, if \code{c(10, NA)}, values below 10 would
#' be predicted as 10 (with no constraint on the upper side).
#' @param seeds an optional set of integers that will be used to set the seed
#' at each resampling iteration. This is useful when the models are run in
#' parallel. A value of \code{NA} will stop the seed from being set within the
#' worker processes while a value of \code{NULL} will set the seeds using a
#' random set of integers. Alternatively, a list can be used. The list should
#' have \code{B+1} elements where \code{B} is the number of resamples, unless
#' \code{method} is \code{"boot632"} in which case \code{B} is the number of
#' resamples plus 1. The first \code{B} elements of the list should be vectors
#' of integers of length \code{M} where \code{M} is the number of models being
#' evaluated. The last element of the list only needs to be a single integer
#' (for the final model). See the Examples section below and the Details
#' section.
#' @param adaptive a list used when \code{method} is \code{"adaptive_cv"},
#' \code{"adaptive_boot"} or \code{"adaptive_LGOCV"}. See Details below.
#' @param trim a logical. If \code{TRUE}, the final model in
#' \code{object\$finalModel} may have some components of the object removed to
#' reduce the size of the saved object. The \code{predict} method will still
#' work, but some other features of the model may not work. Trimming will
#' occur only for models where this feature has been implemented.
#' @param allowParallel if a parallel backend is loaded and available, should
#' the function use it?
#' @return An echo of the parameters specified
#' @author Max Kuhn
#' @references Efron (1983). ``Estimating the error rate of a prediction rule:
#' improvement on cross-validation''. Journal of the American Statistical
#' Association, 78(382):316-331
#'
#' Efron, B., & Tibshirani, R. J. (1994). ``An introduction to the bootstrap'',
#' pages 249-252. CRC press.
#'
#' Bergstra and Bengio (2012), ``Random Search for Hyper-Parameter
#' Optimization'', Journal of Machine Learning Research, 13(Feb):281-305
#'
#' Kuhn (2014), ``Futility Analysis in the Cross-Validation of Machine Learning
#' Models''. \url{https://arxiv.org/abs/1405.6974}
#'
#' Package website for subsampling:
#' \url{https://topepo.github.io/caret/subsampling-for-class-imbalances.html}
#' @keywords utilities
#' @examples
#'
#' \dontrun{
#'
#' ## Do 5 repeats of 10-Fold CV for the iris data. We will fit
#' ## a KNN model that evaluates 12 values of k and set the seed
#' ## at each iteration.
#'
#' set.seed(123)
#' seeds <- vector(mode = "list", length = 51)
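#' ## 22 seeds per resample exceeds the 12 models evaluated; the extra
#' ## values are ignored.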
#' for(i in 1:50) seeds[[i]] <- sample.int(1000, 22)
#'
#' ## For the last model:
#' seeds[[51]] <- sample.int(1000, 1)
#'
#' ctrl <- trainControl(method = "repeatedcv",
#'                      repeats = 5,
#'                      seeds = seeds)
#'
#' set.seed(1)
#' mod <- train(Species ~ ., data = iris,
#'              method = "knn",
#'              tuneLength = 12,
#'              trControl = ctrl)
#'
#'
#' ctrl2 <- trainControl(method = "adaptive_cv",
#'                       repeats = 5,
#'                       verboseIter = TRUE,
#'                       seeds = seeds)
#'
#' set.seed(1)
#' mod2 <- train(Species ~ ., data = iris,
#'               method = "knn",
#'               tuneLength = 12,
#'               trControl = ctrl2)
#'
#' }
#'
#' @export trainControl
trainControl <- function(method = "boot",
                         number = ifelse(grepl("cv", method), 10, 25),
                         repeats = ifelse(grepl("[d_]cv$", method), 1, NA),
                         p = .75,
                         search = "grid",
                         initialWindow = NULL,
                         horizon = 1,
                         fixedWindow = TRUE,
                         skip = 0,
                         verboseIter = FALSE,
                         returnData = TRUE,
                         returnResamp = "final",
                         savePredictions = FALSE,
                         classProbs = FALSE,
                         summaryFunction = defaultSummary,
                         selectionFunction = "best",
                         preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5,
                                               freqCut = 95/5, uniqueCut = 10,
                                               cutoff = 0.9),
                         sampling = NULL,
                         index = NULL,
                         indexOut = NULL,
                         indexFinal = NULL,
                         timingSamps = 0,
                         predictionBounds = rep(FALSE, 2),
                         seeds = NA,
                         adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE),
                         trim = FALSE,
                         allowParallel = TRUE)
{
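  ## Basic validation of the supplied options before they are returned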
  if(is.null(selectionFunction)) stop("null selectionFunction values not allowed")
  if(!(returnResamp %in% c("all", "final", "none"))) stop("incorrect value of returnResamp")
  if(length(predictionBounds) > 0 && length(predictionBounds) != 2) stop("'predictionBounds' should be a logical or numeric vector of length 2")
  if(any(names(preProcOptions) == "method")) stop("'method' cannot be specified here")
  if(any(names(preProcOptions) == "x")) stop("'x' cannot be specified here")
  if(!is.na(repeats) && !(method %in% c("repeatedcv", "adaptive_cv")))
    warning("`repeats` has no meaning for this resampling method.", call. = FALSE)

  if(!(adaptive$method %in% c("gls", "BT"))) stop("incorrect value of adaptive$method")
  if(adaptive$alpha < .0000001 || adaptive$alpha > 1) stop("incorrect value of adaptive$alpha")
  if(grepl("adapt", method)) {
    num <- if(method == "adaptive_cv") number*repeats else number
    if(adaptive$min >= num) stop(paste("adaptive$min should be less than", num))
    if(adaptive$min <= 1) stop("adaptive$min should be greater than 1")
  }
  if(!(search %in% c("grid", "random")))
    stop("`search` should be either 'grid' or 'random'")
  if(method == "oob" & any(names(match.call()) == "summaryFunction")) {
    warning("Custom summary measures cannot be computed for out-of-bag resampling. ",
            "This value of `summaryFunction` will be ignored.",
            call. = FALSE)
  }

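  ## Collect all options into a plain list; this echo is the return value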
  list(method = method,
       number = number,
       repeats = repeats,
       search = search,
       p = p,
       initialWindow = initialWindow,
       horizon = horizon,
       fixedWindow = fixedWindow,
       skip = skip,
       verboseIter = verboseIter,
       returnData = returnData,
       returnResamp = returnResamp,
       savePredictions = savePredictions,
       classProbs = classProbs,
       summaryFunction = summaryFunction,
       selectionFunction = selectionFunction,
       preProcOptions = preProcOptions,
       sampling = sampling,
       index = index,
       indexOut = indexOut,
       indexFinal = indexFinal,
       timingSamps = timingSamps,
       predictionBounds = predictionBounds,
       seeds = seeds,
       adaptive = adaptive,
       trim = trim,
       allowParallel = allowParallel)
}