1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
|
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/trainControl.R
\name{trainControl}
\alias{trainControl}
\title{Control parameters for train}
\usage{
trainControl(
method = "boot",
number = ifelse(grepl("cv", method), 10, 25),
repeats = ifelse(grepl("[d_]cv$", method), 1, NA),
p = 0.75,
search = "grid",
initialWindow = NULL,
horizon = 1,
fixedWindow = TRUE,
skip = 0,
verboseIter = FALSE,
returnData = TRUE,
returnResamp = "final",
savePredictions = FALSE,
classProbs = FALSE,
summaryFunction = defaultSummary,
selectionFunction = "best",
preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5, freqCut = 95/5, uniqueCut =
10, cutoff = 0.9),
sampling = NULL,
index = NULL,
indexOut = NULL,
indexFinal = NULL,
timingSamps = 0,
predictionBounds = rep(FALSE, 2),
seeds = NA,
adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE),
trim = FALSE,
allowParallel = TRUE
)
}
\arguments{
\item{method}{The resampling method: \code{"boot"}, \code{"boot632"},
\code{"optimism_boot"}, \code{"boot_all"},
\code{"cv"}, \code{"repeatedcv"}, \code{"LOOCV"}, \code{"LGOCV"} (for
repeated training/test splits), \code{"none"} (only fits one model to the
entire training set), \code{"oob"} (only for random forest, bagged trees,
bagged earth, bagged flexible discriminant analysis, or conditional tree
forest models), \code{"timeslice"}, \code{"adaptive_cv"}, \code{"adaptive_boot"} or
\code{"adaptive_LGOCV"}}
\item{number}{Either the number of folds or number of resampling iterations}
\item{repeats}{For repeated k-fold cross-validation only: the number of
complete sets of folds to compute}
\item{p}{For leave-group out cross-validation: the training percentage}
\item{search}{Either \code{"grid"} or \code{"random"}, describing how the
tuning parameter grid is determined. See details below.}
\item{initialWindow, horizon, fixedWindow, skip}{possible arguments to
\code{\link{createTimeSlices}} when \code{method} is \code{"timeslice"}.}
\item{verboseIter}{A logical for printing a training log.}
\item{returnData}{A logical for saving the data}
\item{returnResamp}{A character string indicating how much of the resampled
summary metrics should be saved. Values can be \code{"final"}, \code{"all"}
or \code{"none"}}
\item{savePredictions}{an indicator of how much of the hold-out predictions
for each resample should be saved. Values can be either \code{"all"},
\code{"final"}, or \code{"none"}. A logical value can also be used that
converts to \code{"all"} (for true) or \code{"none"} (for false).
\code{"final"} saves the predictions for the optimal tuning parameters.}
\item{classProbs}{a logical; should class probabilities be computed for
classification models (along with predicted values) in each resample?}
\item{summaryFunction}{a function to compute performance metrics across
resamples. The arguments to the function should be the same as those in
\code{\link{defaultSummary}}. Note that if \code{method = "oob"} is used,
this option is ignored and a warning is issued.}
\item{selectionFunction}{the function used to select the optimal tuning
parameter. This can be a name of the function or the function itself. See
\code{\link{best}} for details and other options.}
\item{preProcOptions}{A list of options to pass to \code{\link{preProcess}}.
The type of pre-processing (e.g. center, scaling etc) is passed in via the
\code{preProc} option in \code{\link{train}}.}
\item{sampling}{a single character value describing the type of additional
sampling that is conducted after resampling (usually to resolve class
imbalances). Values are \code{"none"}, \code{"down"}, \code{"up"},
\code{"smote"}, or \code{"rose"}. The latter two values require the
\pkg{themis} and \pkg{ROSE} packages, respectively. This argument can also be
a list to facilitate custom sampling and these details can be found on the
\pkg{caret} package website for sampling (link below).}
\item{index}{a list with elements for each resampling iteration. Each list
element is a vector of integers corresponding to the rows used for training
at that iteration.}
\item{indexOut}{a list (the same length as \code{index}) that dictates which
data are held-out for each resample (as integers). If \code{NULL}, then the
unique set of samples not contained in \code{index} is used.}
\item{indexFinal}{an optional vector of integers indicating which samples
are used to fit the final model after resampling. If \code{NULL}, then
the entire data set is used.}
\item{timingSamps}{the number of training set samples that will be used to
measure the time for predicting samples (zero indicates that the prediction
time should not be estimated).}
\item{predictionBounds}{a logical or numeric vector of length 2 (regression
only). If logical, the predictions can be constrained to be within the limit
of the training set outcomes. For example, a value of \code{c(TRUE, FALSE)}
would only constrain the lower end of predictions. If numeric, specific
bounds can be used. For example, if \code{c(10, NA)}, values below 10 would
be predicted as 10 (with no constraint on the upper side).}
\item{seeds}{an optional set of integers that will be used to set the seed
at each resampling iteration. This is useful when the models are run in
parallel. A value of \code{NA} will stop the seed from being set within the
worker processes while a value of \code{NULL} will set the seeds using a
random set of integers. Alternatively, a list can be used. The list should
have \code{B+1} elements where \code{B} is the number of resamples, unless
\code{method} is \code{"boot632"} in which case \code{B} is the number of
resamples plus 1. The first \code{B} elements of the list should be vectors
of integers of length \code{M} where \code{M} is the number of models being
evaluated. The last element of the list only needs to be a single integer
(for the final model). See the Examples section below and the Details
section.}
\item{adaptive}{a list used when \code{method} is \code{"adaptive_cv"},
\code{"adaptive_boot"} or \code{"adaptive_LGOCV"}. See Details below.}
\item{trim}{a logical. If \code{TRUE} the final model in
\code{object\$finalModel} may have some components of the object removed to
reduce the size of the saved object. The \code{predict} method will still
work, but some other features of the model may not work. \code{trim}ing will
occur only for models where this feature has been implemented.}
\item{allowParallel}{if a parallel backend is loaded and available, should
the function use it?}
}
\value{
An echo of the parameters specified
}
\description{
Control the computational nuances of the \code{\link{train}} function
}
\details{
When setting the seeds manually, the number of models being evaluated is
required. This may not be obvious as \code{train} does some optimizations
for certain models. For example, when tuning over a PLS model, the only model
that is fit is the one with the largest number of components. So if the
model is being tuned over \code{ncomp in 1:10}, the only model fit is
\code{ncomp = 10}. However, if the vector of integers used in the
\code{seeds} argument is longer than actually needed, no error is thrown.
Using \code{method = "none"} and specifying more than one model in
\code{\link{train}}'s \code{tuneGrid} or \code{tuneLength} arguments will
result in an error.
Using adaptive resampling when \code{method} is either \code{"adaptive_cv"},
\code{"adaptive_boot"} or \code{"adaptive_LGOCV"}, the full set of resamples
is not run for each model. As resampling continues, a futility analysis is
conducted and models with a low probability of being optimal are removed.
These features are experimental. See Kuhn (2014) for more details. The
options for this procedure are:
\itemize{ \item \code{min}: the minimum number of resamples used before
models are removed \item \code{alpha}: the confidence level of the one-sided
intervals used to measure futility \item \code{method}: either generalized
least squares (\code{method = "gls"}) or a Bradley-Terry model (\code{method
= "BT"}) \item \code{complete}: if a single parameter value is found before
the end of resampling, should the full set of resamples be computed for that
parameter? }
The option \code{search = "grid"} uses the default grid search routine. When
\code{search = "random"}, a random search procedure is used (Bergstra and
Bengio, 2012). See \url{https://topepo.github.io/caret/random-hyperparameter-search.html} for
details and an example.
The supported bootstrap methods are:
\itemize{
\item \code{"boot"}: the usual bootstrap.
\item \code{"boot632"}: the 0.632 bootstrap estimator (Efron, 1983).
\item \code{"optimism_boot"}: the optimism bootstrap estimator
(Efron and Tibshirani, 1994).
\item \code{"boot_all"}: all of the above (for efficiency,
but \code{"boot"} will be used for calculations).
}
The \code{"boot632"} method should not be confused with the 0.632+
estimator proposed later by the same author.
Note that if \code{index} or \code{indexOut} are specified, the label shown by \code{train} may not be accurate since these arguments supersede the \code{method} argument.
}
\examples{
\dontrun{
## Do 5 repeats of 10-Fold CV for the iris data. We will fit
## a KNN model that evaluates 12 values of k and set the seed
## at each iteration.
set.seed(123)
seeds <- vector(mode = "list", length = 51)
for(i in 1:50) seeds[[i]] <- sample.int(1000, 22)
## For the last model:
seeds[[51]] <- sample.int(1000, 1)
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
seeds = seeds)
set.seed(1)
mod <- train(Species ~ ., data = iris,
method = "knn",
tuneLength = 12,
trControl = ctrl)
ctrl2 <- trainControl(method = "adaptive_cv",
repeats = 5,
verboseIter = TRUE,
seeds = seeds)
set.seed(1)
mod2 <- train(Species ~ ., data = iris,
method = "knn",
tuneLength = 12,
trControl = ctrl2)
}
}
\references{
Efron (1983). ``Estimating the error rate of a prediction rule:
improvement on cross-validation''. Journal of the American Statistical
Association, 78(382):316-331
Efron, B., & Tibshirani, R. J. (1994). ``An introduction to the bootstrap'',
pages 249-252. CRC press.
Bergstra and Bengio (2012), ``Random Search for Hyper-Parameter
Optimization'', Journal of Machine Learning Research, 13(Feb):281-305
Kuhn (2014), ``Futility Analysis in the Cross-Validation of Machine Learning
Models'', \url{https://arxiv.org/abs/1405.6974}
Package website for subsampling:
\url{https://topepo.github.io/caret/subsampling-for-class-imbalances.html}
}
\author{
Max Kuhn
}
\keyword{utilities}
|