1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
|
#' Convert values to predefined integers
#'
#' `step_integer` creates a *specification* of a recipe
#' step that will convert new data into a set of integers based
#' on the original data values.
#'
#' @inheritParams step_pca
#' @inheritParams step_center
#' @param key A list that contains the information needed to
#' create integer variables for each variable contained in
#' `terms`. This is `NULL` until the step is trained by
#' [prep()].
#' @param strict A logical for whether the values should be returned as
#' integers (as opposed to double).
#' @param zero_based A logical for whether the integers should start at zero and
#' new values be appended as the largest integer.
#' @template step-return
#' @family dummy variable and encoding steps
#' @export
#' @details `step_integer` will determine the unique values of
#' each variable from the training set (excluding missing values),
#' order them, and then assign integers to each value. When baked,
#' each data point is translated to its corresponding integer or a
#' value of zero for yet unseen data (although see the `zero_based`
#' argument above). Missing values propagate.
#'
#' Factor inputs are ordered by their levels. All others are
#' ordered by `sort`.
#'
#' Despite the name, the new values are returned as numeric unless
#' `strict = TRUE`, which will coerce the results to integers.
#'
#' # Tidying
#'
#' When you [`tidy()`][tidy.recipe()] this step, a tibble with columns
#' `terms` (the selectors or variables selected) and `value`
#' (a _list column_ with the conversion key) is returned.
#'
#' @template case-weights-not-supported
#'
#' @examplesIf rlang::is_installed("modeldata")
#' data(Sacramento, package = "modeldata")
#'
#' sacr_tr <- Sacramento[1:100, ]
#' sacr_tr$sqft[1] <- NA
#'
#' sacr_te <- Sacramento[101:105, ]
#' sacr_te$sqft[1] <- NA
#' sacr_te$city[1] <- "whoville"
#' sacr_te$city[2] <- NA
#'
#' rec <- recipe(type ~ ., data = sacr_tr) %>%
#' step_integer(all_predictors()) %>%
#' prep(training = sacr_tr)
#'
#' bake(rec, sacr_te, all_predictors())
#' tidy(rec, number = 1)
step_integer <-
function(recipe,
...,
role = "predictor",
trained = FALSE,
strict = TRUE,
zero_based = FALSE,
key = NULL,
skip = FALSE,
id = rand_id("integer")) {
add_step(
recipe,
step_integer_new(
terms = enquos(...),
role = role,
trained = trained,
strict = strict,
zero_based = zero_based,
key = key,
skip = skip,
id = id
)
)
}
step_integer_new <-
function(terms, role, trained, strict, zero_based, key, skip, id) {
step(
subclass = "integer",
terms = terms,
role = role,
trained = trained,
strict = strict,
zero_based = zero_based,
key = key,
skip = skip,
id = id
)
}
get_unique_values <- function(x, zero = FALSE) {
if (is.factor(x)) {
res <- levels(x)
} else {
res <- sort(unique(x))
}
res <- res[!is.na(res)]
ints <- seq_along(res)
if (zero) {
ints <- ints - 1
}
tibble(value = res, integer = ints)
}
#' @export
prep.step_integer <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)
check_type(
training[, col_names],
types = c("string", "factor", "ordered", "integer", "double", "logical",
"date", "datetime")
)
step_integer_new(
terms = x$terms,
role = x$role,
trained = TRUE,
strict = x$strict,
zero_based = x$zero_based,
key = map(training[, col_names], get_unique_values, zero = x$zero_based),
skip = x$skip,
id = x$id
)
}
map_key_to_int <- function(dat, key, strict = FALSE, zero = FALSE) {
if (is.factor(dat)) {
dat <- as.character(dat)
}
res <- full_join(tibble(value = dat, .row = seq_along(dat)), key, by = "value")
res <- dplyr::filter(res, !is.na(.row))
res <- arrange(res, .row)
if (zero) {
res$integer[is.na(res$integer) & !is.na(res$value)] <-
max(key$integer, na.rm = TRUE) + 1
} else {
res$integer[is.na(res$integer) & !is.na(res$value)] <- 0
}
if (strict) {
res$integer <- as.integer(res$integer)
}
res[["integer"]]
}
#' @export
bake.step_integer <- function(object, new_data, ...) {
check_new_data(names(object$key), object, new_data)
for (i in names(object$key)) {
new_data[[i]] <-
map_key_to_int(new_data[[i]], object$key[[i]], object$strict, object$zero_based)
}
new_data
}
print.step_integer <-
function(x, width = max(20, options()$width - 20), ...) {
title <- "Integer encoding for "
print_step(names(x$key), x$terms, x$trained, title, width)
invisible(x)
}
#' @rdname tidy.recipe
#' @export
tidy.step_integer <- function(x, ...) {
if (is_trained(x)) {
res <- tibble(terms = names(x$key), value = unname(x$key))
} else {
res <- tibble(terms = sel2char(x$terms), value = list(NULL))
}
res$id <- x$id
res
}
|