1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
|
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/blueprint-xy-default.R, R/mold.R
\name{default_xy_blueprint}
\alias{default_xy_blueprint}
\alias{mold.data.frame}
\alias{mold.matrix}
\title{Default XY blueprint}
\usage{
default_xy_blueprint(
intercept = FALSE,
allow_novel_levels = FALSE,
composition = "tibble"
)
\method{mold}{data.frame}(x, y, ..., blueprint = NULL)
\method{mold}{matrix}(x, y, ..., blueprint = NULL)
}
\arguments{
\item{intercept}{A logical. Should an intercept be included in the
processed data? This information is used by the \code{process} function
in the \code{mold} and \code{forge} function list.}
\item{allow_novel_levels}{A logical. Should novel factor levels be allowed at
prediction time? This information is used by the \code{clean} function in the
\code{forge} function list, and is passed on to \code{\link[=scream]{scream()}}.}
\item{composition}{Either "tibble", "matrix", or "dgCMatrix" for the format
of the processed predictors. If "matrix" or "dgCMatrix" is chosen, all of
the predictors must be numeric after the preprocessing method has been
applied; otherwise an error is thrown.}
\item{x}{A data frame or matrix containing the predictors.}
\item{y}{A data frame, matrix, or vector containing the outcomes.}
\item{...}{Not used.}
\item{blueprint}{A preprocessing \code{blueprint}. If left as \code{NULL}, then a
\code{\link[=default_xy_blueprint]{default_xy_blueprint()}} is used.}
}
\value{
For \code{default_xy_blueprint()}, an XY blueprint.
}
\description{
This page holds the details for the XY preprocessing blueprint. This
is the blueprint used by default from \code{mold()} if \code{x} and \code{y} are provided
separately (i.e. the XY interface is used).
}
\details{
As documented in \code{\link[=standardize]{standardize()}}, if \code{y} is a \emph{vector}, then the returned
outcomes tibble has 1 column with a standardized name of \code{".outcome"}.
The one special thing about the XY method's forge function is the behavior of
\code{outcomes = TRUE} when a \emph{vector} \code{y} value was provided to the original
call to \code{\link[=mold]{mold()}}. In that case, \code{mold()} converts \code{y} into a one-column
tibble, with a default column name of \code{.outcome}. This is the column that \code{forge()} will look
for in \code{new_data} to preprocess. See the examples section for a
demonstration of this.
}
\section{Mold}{
When \code{mold()} is used with the default xy blueprint:
\itemize{
\item It converts \code{x} to a tibble.
\item It adds an intercept column to \code{x} if \code{intercept = TRUE}.
\item It runs \code{\link[=standardize]{standardize()}} on \code{y}.
}
}
\section{Forge}{
When \code{forge()} is used with the default xy blueprint:
\itemize{
\item It calls \code{\link[=shrink]{shrink()}} to trim \code{new_data} to only the required columns and
coerce \code{new_data} to a tibble.
\item It calls \code{\link[=scream]{scream()}} to perform validation on the structure of the columns
of \code{new_data}.
\item It adds an intercept column onto \code{new_data} if \code{intercept = TRUE}.
}
}
\examples{
# ---------------------------------------------------------------------------
# Setup
train <- iris[1:100, ]
test <- iris[101:150, ]
train_x <- train[, "Sepal.Length", drop = FALSE]
train_y <- train[, "Species", drop = FALSE]
test_x <- test[, "Sepal.Length", drop = FALSE]
test_y <- test[, "Species", drop = FALSE]
# ---------------------------------------------------------------------------
# XY Example
# First, call mold() with the training data
processed <- mold(train_x, train_y)
# Then, call forge() with the blueprint and the test data
# to have it preprocess the test data in the same way
forge(test_x, processed$blueprint)
# ---------------------------------------------------------------------------
# Intercept
processed <- mold(train_x, train_y, blueprint = default_xy_blueprint(intercept = TRUE))
forge(test_x, processed$blueprint)
# ---------------------------------------------------------------------------
# XY Method and forge(outcomes = TRUE)
# You can request that the new outcome columns are preprocessed as well, but
# they have to be present in `new_data`!
processed <- mold(train_x, train_y)
# Can't do this!
try(forge(test_x, processed$blueprint, outcomes = TRUE))
# Need to use the full test set, including `y`
forge(test, processed$blueprint, outcomes = TRUE)
# With the XY method, if the Y value used in `mold()` is a vector,
# then a column name of `.outcome` is automatically generated.
# This name is what forge() looks for in `new_data`.
# Y is a vector!
y_vec <- train_y$Species
processed_vec <- mold(train_x, y_vec)
# This throws an informative error that tells you
# to include an `".outcome"` column in `new_data`.
try(forge(iris, processed_vec$blueprint, outcomes = TRUE))
test2 <- test
test2$.outcome <- test2$Species
test2$Species <- NULL
# This works, and returns a tibble in the $outcomes slot
forge(test2, processed_vec$blueprint, outcomes = TRUE)
# ---------------------------------------------------------------------------
# Matrix output for predictors
# You can change the `composition` of the predictor data set
bp <- default_xy_blueprint(composition = "dgCMatrix")
processed <- mold(train_x, train_y, blueprint = bp)
class(processed$predictors)
}
|