1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
|
#' Convert Strings to Factors
#'
#' @description
#' `step_string2factor` will convert one or more character
#' vectors to factors (ordered or unordered).
#'
#' _Use this step only in special cases_ (see Details) and instead convert
#' strings to factors before using any tidymodels functions.
#'
#' @inheritParams step_center
#' @param levels An options specification of the levels to be used
#' for the new factor. If left `NULL`, the sorted unique
#' values present when `bake` is called will be used.
#' @param ordered A single logical value; should the factor(s) be
#' ordered?
#' @template step-return
#' @family dummy variable and encoding steps
#' @export
#' @details
#'
#' ## When should you use this step?
#'
#' In most cases, if you are planning to use `step_string2factor()`
#' without setting `levels`, you will be better off converting
#' those character variables to factor variables **before using a recipe**.
#'
#' This can be done using \pkg{dplyr} with the following code
#'
#' ```r
#' df <- mutate(df, across(where(is.character), as.factor))
#' ```
#'
#' During resampling, the complete set of values might
#' not be in the character data. Converting them to factors with
#' `step_string2factor()` then will misconfigure the levels.
#'
#' If the `levels` argument to `step_string2factor()`is used, it will
#' convert all variables affected by this step to have the same
#' levels. Because of this, you will need to know the full set of level
#' when you define the recipe.
#'
#' Also, note that [prep()] has an option `strings_as_factors` that
#' defaults to `TRUE`. This should be changed so that raw character
#' data will be applied to `step_string2factor()`. However, this step
#' can also take existing factors (but will leave them as-is).
#'
#' # Tidying
#'
#' When you [`tidy()`][tidy.recipe()] this step, a tibble with columns
#' `terms` (the selectors or variables selected) and `ordered` is
#' returned.
#'
#' @template case-weights-not-supported
#'
#' @examplesIf rlang::is_installed("modeldata")
#' data(Sacramento, package = "modeldata")
#'
#' # convert factor to string to demonstrate
#' Sacramento$city <- as.character(Sacramento$city)
#'
#' rec <- recipe(~ city + zip, data = Sacramento)
#'
#' make_factor <- rec %>%
#' step_string2factor(city)
#'
#' make_factor <- prep(make_factor,
#' training = Sacramento
#' )
#'
#' make_factor
#'
#' # note that `city` is a factor in recipe output
#' bake(make_factor, new_data = NULL) %>% head()
#'
#' # ...but remains a string in the data
#' Sacramento %>% head()
step_string2factor <-
function(recipe,
...,
role = NA,
trained = FALSE,
levels = NULL,
ordered = FALSE,
skip = FALSE,
id = rand_id("string2factor")) {
if (!is_tune(ordered) & !is_varying(ordered)) {
if (!is.logical(ordered) || length(ordered) != 1) {
rlang::abort("`ordered` should be a single logical variable")
}
}
if ((!is.null(levels) & !is.character(levels)) | is.list(levels)) {
rlang::abort("`levels` should be NULL or a single character vector")
}
add_step(
recipe,
step_string2factor_new(
terms = enquos(...),
role = role,
trained = trained,
levels = levels,
ordered = ordered,
skip = skip,
id = id
)
)
}
step_string2factor_new <-
function(terms, role, trained, levels, ordered, skip, id) {
step(
subclass = "string2factor",
terms = terms,
role = role,
trained = trained,
levels = levels,
ordered = ordered,
skip = skip,
id = id
)
}
get_ord_lvls <- function(x) {
sort(unique(x))
}
#' @export
prep.step_string2factor <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)
check_type(training[, col_names], types = c("string", "factor", "ordered"))
if (is.null(x$levels)) {
res <- lapply(training[, col_names], get_ord_lvls)
} else {
res <- x$levels
}
ord <- rep(x$ordered, length(col_names))
names(ord) <- col_names
step_string2factor_new(
terms = x$terms,
role = x$role,
trained = TRUE,
levels = res,
ordered = ord,
skip = x$skip,
id = x$id
)
}
make_factor <- function(x, lvl, ord) {
if (is.factor(x)) {
return(x)
}
factor(x, levels = lvl, ordered = ord)
}
#' @export
bake.step_string2factor <- function(object, new_data, ...) {
col_names <- names(object$ordered)
if (is.list(object$levels)) {
new_data[, col_names] <-
purrr::map2(new_data[, col_names],
object$levels,
make_factor,
ord = object$ordered[1]
)
} else {
new_data[, col_names] <-
map(new_data[, col_names],
make_factor,
lvl = object$levels,
ord = object$ordered[1]
)
}
new_data
}
print.step_string2factor <-
function(x, width = max(20, options()$width - 30), ...) {
title <- "Factor variables from "
print_step(names(x$ordered), x$terms, x$trained, title, width)
invisible(x)
}
#' @rdname tidy.recipe
#' @export
tidy.step_string2factor <- function(x, ...) {
term_names <- sel2char(x$terms)
p <- length(term_names)
if (is_trained(x)) {
res <- tibble(
terms = term_names,
ordered = rep(unname(x$ordered), p)
)
} else {
res <- tibble(
terms = term_names,
ordered = rep(unname(x$ordered), p)
)
}
res$id <- x$id
res
}
|