File: initial_validation_split.Rd

package info (click to toggle)
r-cran-rsample 1.2.1%2Bdfsg-1
links: PTS, VCS
area: main
in suites: forky, trixie
size: 1,932 kB
sloc: sh: 13; makefile: 2
file content (122 lines) | stat: -rw-r--r-- 4,502 bytes
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/initial_validation_split.R
\name{initial_validation_split}
\alias{initial_validation_split}
\alias{initial_validation_time_split}
\alias{group_initial_validation_split}
\alias{training.initial_validation_split}
\alias{testing.initial_validation_split}
\alias{validation}
\alias{validation.default}
\alias{validation.initial_validation_split}
\title{Create an Initial Train/Validation/Test Split}
\usage{
initial_validation_split(
  data,
  prop = c(0.6, 0.2),
  strata = NULL,
  breaks = 4,
  pool = 0.1,
  ...
)

initial_validation_time_split(data, prop = c(0.6, 0.2), ...)

group_initial_validation_split(
  data,
  group,
  prop = c(0.6, 0.2),
  ...,
  strata = NULL,
  pool = 0.1
)

\method{training}{initial_validation_split}(x, ...)

\method{testing}{initial_validation_split}(x, ...)

validation(x, ...)

\method{validation}{default}(x, ...)

\method{validation}{initial_validation_split}(x, ...)
}
\arguments{
\item{data}{A data frame.}

\item{prop}{A length-2 vector of proportions of data to be retained for training and
validation data, respectively.}

\item{strata}{A variable in \code{data} (single character or name) used to conduct
stratified sampling. When not \code{NULL}, each resample is created within the
stratification variable. Numeric \code{strata} are binned into quartiles by
default; see \code{breaks}.}

\item{breaks}{A single number giving the number of bins desired to stratify a
numeric stratification variable.}

\item{pool}{A proportion of data used to determine if a particular group is
too small and should be pooled into another group. We do not recommend
decreasing this argument below its default of 0.1 because of the dangers
of stratifying groups that are too small.}

\item{...}{These dots are for future extensions and must be empty.}

\item{group}{A variable in \code{data} (single character or name) used for
grouping observations, so that all observations with the same value are
assigned to the same partition (training, validation, or testing set).}

\item{x}{An object of class \code{initial_validation_split}.}
}
\value{
An \code{initial_validation_split} object that can be used with the
\code{\link[=training]{training()}}, \code{\link[=validation]{validation()}}, and \code{\link[=testing]{testing()}} functions to extract the data
in each split.
}
\description{
\code{initial_validation_split()} creates a random three-way split of the data
into a training set, a validation set, and a testing set.
\code{initial_validation_time_split()} does the same, but instead of a random
selection, the training, validation, and testing sets are taken in the order of
the full data set, with the first observations being put into the training set.
\code{group_initial_validation_split()} creates similar random splits of the data
based on some grouping variable, so that all data in a "group" are assigned
to the same partition.
\code{training()}, \code{validation()}, and \code{testing()} can be used to extract the
resulting data sets.
Use \code{\link[=validation_set]{validation_set()}} to create an \code{rset} object for use with functions from
the tune package such as \code{tune::tune_grid()}.
}
\details{
With a \code{strata} argument, the random sampling is conducted
\emph{within the stratification variable}. This can help ensure that the
resamples have proportions equivalent to those of the original data set. For
a categorical variable, sampling is conducted separately within each class.
For a numeric stratification variable, \code{strata} is binned into quartiles
by default (the number of bins is set by \code{breaks}), which are then used
to stratify. Strata below 10\% of the total (the default value of \code{pool})
are pooled together; see \code{\link[=make_strata]{make_strata()}} for more details.
}
\examples{
\dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
set.seed(1353)
car_split <- initial_validation_split(mtcars)
train_data <- training(car_split)
validation_data <- validation(car_split)
test_data <- testing(car_split)

data(drinks, package = "modeldata")
drinks_split <- initial_validation_time_split(drinks)
train_data <- training(drinks_split)
validation_data <- validation(drinks_split)
c(max(train_data$date), min(validation_data$date))

data(ames, package = "modeldata")
set.seed(1353)
ames_split <- group_initial_validation_split(ames, group = Neighborhood)
train_data <- training(ames_split)
validation_data <- validation(ames_split)
test_data <- testing(ames_split)
\dontshow{\}) # examplesIf}
}
\seealso{
\code{\link[=validation_set]{validation_set()}}
}