1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
|
#' \if{html}{\Sexpr[stage=render,results=rd]{"\U0001f631"}} Scream.
#'
#' @description
#'
#' `scream()` ensures that the structure of `data` is the same as
#' prototype, `ptype`. Under the hood, [vctrs::vec_cast()] is used, which
#' casts each column of `data` to the same type as the corresponding
#' column in `ptype`.
#'
#' This casting enforces a number of important structural checks,
#' including but not limited to:
#'
#' - _Data Classes_ - Checks that the class of each column in `data` is the
#' same as the corresponding column in `ptype`.
#'
#' - _Novel Levels_ - Checks that the factor columns in `data` don't have any
#' _new_ levels when compared with the `ptype` columns. If there are new
#' levels, a warning is issued and they are coerced to `NA`. This check is
#' optional, and can be turned off with `allow_novel_levels = TRUE`.
#'
#' - _Level Recovery_ - Checks that the factor columns in `data` aren't
#' missing any factor levels when compared with the `ptype` columns. If
#' there are missing levels, then they are restored.
#'
#' @details
#'
#' `scream()` is called by [forge()] after [shrink()] but before the
#' actual processing is done. Generally, you don't need to call `scream()`
#' directly, as `forge()` will do it for you.
#'
#' If `scream()` is used as a standalone function, it is good practice to call
#' [shrink()] right before it as there are no checks in `scream()` that ensure
#' that all of the required column names actually exist in `data`. Those
#' checks exist in `shrink()`.
#'
#' @section Factor Levels:
#'
#' `scream()` tries to be helpful by recovering missing factor levels and
#' warning about novel levels. The following graphic outlines how `scream()`
#' handles factor levels when coercing _from_ a column in `data` _to_ a
#' column in `ptype`.
#'
#' \figure{factor-handling.png}
#'
#' Note that ordered factor handing is much stricter than factor handling.
#' Ordered factors in `data` must have _exactly_ the same levels as ordered
#' factors in `ptype`.
#'
#' @param data A data frame containing the new data to check the structure
#' of.
#'
#' @param ptype A data frame prototype to cast `data` to. This is commonly
#' a 0-row slice of the training set.
#'
#' @param allow_novel_levels Should novel factor levels in `data` be allowed?
#' The safest approach is the default, which throws a warning when novel levels
#' are found, and coerces them to `NA` values. Setting this argument to `TRUE`
#' will ignore all novel levels. This argument does not apply to ordered
#' factors. Novel levels are not allowed in ordered factors because the
#' level ordering is a critical part of the type.
#'
#' @return
#'
#' A tibble containing the required columns after any required structural
#' modifications have been made.
#'
#' @examples
#' # ---------------------------------------------------------------------------
#' # Setup
#'
#' train <- iris[1:100, ]
#' test <- iris[101:150, ]
#'
#' # mold() is run at model fit time
#' # and a formula preprocessing blueprint is recorded
#' x <- mold(log(Sepal.Width) ~ Species, train)
#'
#' # Inside the result of mold() are the prototype tibbles
#' # for the predictors and the outcomes
#' ptype_pred <- x$blueprint$ptypes$predictors
#' ptype_out <- x$blueprint$ptypes$outcomes
#'
#' # ---------------------------------------------------------------------------
#' # shrink() / scream()
#'
#' # Pass the test data, along with a prototype, to
#' # shrink() to extract the prototype columns
#' test_shrunk <- shrink(test, ptype_pred)
#'
#' # Now pass that to scream() to perform validation checks
#' # If no warnings / errors are thrown, the checks were
#' # successful!
#' scream(test_shrunk, ptype_pred)
#'
#' # ---------------------------------------------------------------------------
#' # Outcomes
#'
#' # To also extract the outcomes, use the outcome prototype
#' test_outcome <- shrink(test, ptype_out)
#' scream(test_outcome, ptype_out)
#'
#' # ---------------------------------------------------------------------------
#' # Casting
#'
#' # scream() uses vctrs::vec_cast() to intelligently convert
#' # new data to the prototype automatically. This means
#' # it can automatically perform certain conversions, like
#' # coercing character columns to factors.
#' test2 <- test
#' test2$Species <- as.character(test2$Species)
#'
#' test2_shrunk <- shrink(test2, ptype_pred)
#' scream(test2_shrunk, ptype_pred)
#'
#' # It can also recover missing factor levels.
#' # For example, it is plausible that the test data only had the
#' # "virginica" level
#' test3 <- test
#' test3$Species <- factor(test3$Species, levels = "virginica")
#'
#' test3_shrunk <- shrink(test3, ptype_pred)
#' test3_fixed <- scream(test3_shrunk, ptype_pred)
#'
#' # scream() recovered the missing levels
#' levels(test3_fixed$Species)
#'
#' # ---------------------------------------------------------------------------
#' # Novel levels
#'
#' # When novel levels with any data are present in `data`, the default
#' # is to coerce them to `NA` values with a warning.
#' test4 <- test
#' test4$Species <- as.character(test4$Species)
#' test4$Species[1] <- "new_level"
#'
#' test4$Species <- factor(
#' test4$Species,
#' levels = c(levels(test$Species), "new_level")
#' )
#'
#' test4 <- shrink(test4, ptype_pred)
#'
#' # Warning is thrown
#' test4_removed <- scream(test4, ptype_pred)
#'
#' # Novel level is removed
#' levels(test4_removed$Species)
#'
#' # No warning is thrown
#' test4_kept <- scream(test4, ptype_pred, allow_novel_levels = TRUE)
#'
#' # Novel level is kept
#' levels(test4_kept$Species)
#' @export
scream <- function(data, ptype, allow_novel_levels = FALSE) {
vec_assert(allow_novel_levels, ptype = logical(), size = 1L)
if (is.null(data)) {
return(NULL)
}
data <- check_is_data_like(data, "data")
if (allow_novel_levels) {
ptype <- add_novel_levels_to_ptype(ptype, data)
} else {
data <- remove_novel_levels(data, ptype)
}
vec_cast(data, ptype)
}
# ------------------------------------------------------------------------------
# vec_cast() throws an error for any lossy cast. This means that novel factor
# levels in the test data throw an error. For most modeling purposes,
# it is better to convert these to `NA` values, with a warning. We handle this
# before handing off to vctrs, checking each factor column to ensure that there
# are no novel levels.
remove_novel_levels <- function(data, ptype) {
ptype_fct_indicator <- map_lgl(ptype, is_bare_factor)
ptype_fct_locs <- which(ptype_fct_indicator)
if (length(ptype_fct_locs) == 0L) {
return(data)
}
fct_names <- names(ptype_fct_locs)
for (fct_name in fct_names) {
data[[fct_name]] <- check_novel_levels(
data[[fct_name]],
ptype[[fct_name]],
fct_name
)
}
data
}
check_novel_levels <- function(x, ptype, column) {
# Allow characters, consider them factors
if (is.character(x)) {
x <- factor(x, levels = unique(x))
}
# If not a bare factor, then let `vec_cast()` throw an error later.
# Ordered factors are stricter and do not allow novel levels in any way.
if (!is_bare_factor(x)) {
return(x)
}
x_lvls <- levels(x)
ptype_lvls <- levels(ptype)
new_lvls <- setdiff(x_lvls, ptype_lvls)
# All good
if (length(new_lvls) == 0L) {
return(x)
}
new_locs <- which(x %in% new_lvls | is.na(x))
# There is at least one new level, but none of them are used in the data
# vctrs will silently handle it for us
if (length(new_locs) == 0L) {
return(x)
}
# Use the levels from `x`, not `ptype` as we may still be missing levels
old_lvls <- setdiff(x_lvls, new_lvls)
warn_novel_levels(new_lvls, column)
factor(as.character(x), levels = old_lvls)
}
warn_novel_levels <- function(levels, column) {
message <- glue(
"Novel levels found in column '{column}': {glue_quote_collapse(levels)}. ",
"The levels have been removed, and values have been coerced to 'NA'."
)
warn(
message,
class = "hardhat_warn_novel_levels",
levels = levels,
column = column
)
}
# ------------------------------------------------------------------------------
# There are cases where we want to ignore any novel levels, but otherwise still
# validate a user's `new_data`. The issue with this is that vec_cast() throws an
# error for any lossy cast. This means that novel factor levels in the
# `new_data` throw an error. To handle this, we add the novel levels to the
# `ptype` to prevent vec_cast() from thinking that it is an error.
add_novel_levels_to_ptype <- function(ptype, data) {
ptype_fct_indicator <- map_lgl(ptype, is_bare_factor)
ptype_fct_locs <- which(ptype_fct_indicator)
if (length(ptype_fct_locs) == 0L) {
return(ptype)
}
fct_names <- names(ptype_fct_locs)
for (fct_name in fct_names) {
ptype[[fct_name]] <- add_novel_levels(
data[[fct_name]],
ptype[[fct_name]]
)
}
ptype
}
add_novel_levels <- function(x, ptype) {
# Allow characters, consider them factors
if (is.character(x)) {
x <- factor(x, levels = unique(x))
}
# If not a bare factor, then let `vec_cast()` throw an error later.
# Ordered factors are stricter and do not allow novel levels in any way.
if (!is_bare_factor(x)) {
return(ptype)
}
x_lvls <- levels(x)
ptype_lvls <- levels(ptype)
# Ensure that `x_lvls` is first, so order is maintained
new_ptype_lvls <- union(x_lvls, ptype_lvls)
factor(
as.character(ptype),
levels = new_ptype_lvls
)
}
# ------------------------------------------------------------------------------
is_bare_factor <- function(x) {
inherits_only(x, "factor")
}
|