File: string2factor.R

package info (click to toggle)
r-cran-recipes 1.0.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 3,636 kB
  • sloc: sh: 37; makefile: 2
file content (207 lines) | stat: -rw-r--r-- 5,509 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#' Convert Strings to Factors
#'
#' @description
#' `step_string2factor` will convert one or more character
#'  vectors to factors (ordered or unordered).
#'
#'  _Use this step only in special cases_ (see Details) and instead convert
#'  strings to factors before using any tidymodels functions.
#'
#' @inheritParams step_center
#' @param levels An options specification of the levels to be used
#'  for the new factor. If left `NULL`, the sorted unique
#'  values present when `bake` is called will be used.
#' @param ordered A single logical value; should the factor(s) be
#'  ordered?
#' @template step-return
#' @family dummy variable and encoding steps
#' @export
#' @details
#'
#'  ## When should you use this step?
#'
#'  In most cases, if you are planning to use `step_string2factor()`
#'  without setting `levels`, you will be better off converting
#'  those character variables to factor variables **before using a recipe**.
#'
#'  This can be done using \pkg{dplyr} with the following code
#'
#'  ```r
#'  df <- mutate(df, across(where(is.character), as.factor))
#'  ```
#'
#'  During resampling, the complete set of values might
#'  not be in the character data. Converting them to factors with
#'  `step_string2factor()`  then will misconfigure the levels.
#'
#'  If the `levels` argument to `step_string2factor()`is used, it will
#'  convert all variables affected by this step to have the same
#'  levels. Because of this, you will need to know the full set of level
#'  when you define the recipe.
#'
#'  Also, note that [prep()] has an option `strings_as_factors` that
#'  defaults to `TRUE`. This should be changed so that raw character
#'  data will be applied to `step_string2factor()`. However, this step
#'  can also take existing factors (but will leave them as-is).
#'
#'  # Tidying
#'
#'  When you [`tidy()`][tidy.recipe()] this step, a tibble with columns
#'  `terms` (the selectors or variables selected) and `ordered` is
#'  returned.
#'
#' @template case-weights-not-supported
#'
#' @examplesIf rlang::is_installed("modeldata")
#' data(Sacramento, package = "modeldata")
#'
#' # convert factor to string to demonstrate
#' Sacramento$city <- as.character(Sacramento$city)
#'
#' rec <- recipe(~ city + zip, data = Sacramento)
#'
#' make_factor <- rec %>%
#'   step_string2factor(city)
#'
#' make_factor <- prep(make_factor,
#'   training = Sacramento
#' )
#'
#' make_factor
#'
#' # note that `city` is a factor in recipe output
#' bake(make_factor, new_data = NULL) %>% head()
#'
#' # ...but remains a string in the data
#' Sacramento %>% head()
step_string2factor <-
  function(recipe,
           ...,
           role = NA,
           trained = FALSE,
           levels = NULL,
           ordered = FALSE,
           skip = FALSE,
           id = rand_id("string2factor")) {
    if (!is_tune(ordered) & !is_varying(ordered)) {
      if (!is.logical(ordered) || length(ordered) != 1) {
        rlang::abort("`ordered` should be a single logical variable")
      }
    }
    if ((!is.null(levels) & !is.character(levels)) | is.list(levels)) {
      rlang::abort("`levels` should be NULL or a single character vector")
    }

    add_step(
      recipe,
      step_string2factor_new(
        terms = enquos(...),
        role = role,
        trained = trained,
        levels = levels,
        ordered = ordered,
        skip = skip,
        id = id
      )
    )
  }

step_string2factor_new <-
  function(terms, role, trained, levels, ordered, skip, id) {
    step(
      subclass = "string2factor",
      terms = terms,
      role = role,
      trained = trained,
      levels = levels,
      ordered = ordered,
      skip = skip,
      id = id
    )
  }

get_ord_lvls <- function(x) {
  sort(unique(x))
}

#' @export
prep.step_string2factor <- function(x, training, info = NULL, ...) {
  col_names <- recipes_eval_select(x$terms, training, info)
  check_type(training[, col_names], types = c("string", "factor", "ordered"))

  if (is.null(x$levels)) {
    res <- lapply(training[, col_names], get_ord_lvls)
  } else {
    res <- x$levels
  }

  ord <- rep(x$ordered, length(col_names))
  names(ord) <- col_names

  step_string2factor_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    levels = res,
    ordered = ord,
    skip = x$skip,
    id = x$id
  )
}

make_factor <- function(x, lvl, ord) {
  if (is.factor(x)) {
    return(x)
  }
  factor(x, levels = lvl, ordered = ord)
}

#' @export
bake.step_string2factor <- function(object, new_data, ...) {
  col_names <- names(object$ordered)

  if (is.list(object$levels)) {
    new_data[, col_names] <-
      purrr::map2(new_data[, col_names],
        object$levels,
        make_factor,
        ord = object$ordered[1]
      )
  } else {
    new_data[, col_names] <-
      map(new_data[, col_names],
        make_factor,
        lvl = object$levels,
        ord = object$ordered[1]
      )
  }
  new_data
}

print.step_string2factor <-
  function(x, width = max(20, options()$width - 30), ...) {
    title <- "Factor variables from "
    print_step(names(x$ordered), x$terms, x$trained, title, width)
    invisible(x)
  }


#' @rdname tidy.recipe
#' @export
tidy.step_string2factor <- function(x, ...) {
  term_names <- sel2char(x$terms)
  p <- length(term_names)
  if (is_trained(x)) {
    res <- tibble(
      terms = term_names,
      ordered = rep(unname(x$ordered), p)
    )
  } else {
    res <- tibble(
      terms = term_names,
      ordered = rep(unname(x$ordered), p)
    )
  }
  res$id <- x$id
  res
}