File: cut.R

package info (click to toggle)
r-cran-recipes 0.1.15%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,496 kB
  • sloc: sh: 37; makefile: 2
file content (221 lines) | stat: -rw-r--r-- 6,862 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#' Cut a numeric variable into a factor
#'
#' `step_cut()` creates a *specification* of a recipe step that cuts a numeric
#'  variable into a factor based on provided boundary values.
#'
#' @param recipe A recipe object. The step will be added to the sequence of
#'  operations for this recipe.
#' @param ... One or more selector functions to choose which variables are
#'  affected by the step. See [selections()] for more details. For the `tidy`
#'  method, these are not currently used.
#' @param role Not used by this step since no new variables are created.
#' @param trained A logical to indicate if the quantities for preprocessing
#'  have been estimated.
#' @param breaks A numeric vector with at least one cut point.
#' @param include_outside_range Logical, indicating if values outside the
#'  range in the train set should be included in the lowest or highest bucket.
#'  Defaults to `FALSE`, in which case values outside the original range are
#'  set to `NA`.
#' @param skip A logical. Should the step be skipped when the recipe is baked
#'  by [bake.recipe()]? While all operations are baked when [prep.recipe()] is
#'  run, some operations may not be able to be conducted on new data (e.g.
#'  processing the outcome variable(s)). Care should be taken when using `skip =
#'  TRUE` as it may affect the computations for subsequent operations.
#' @param id A character string that is unique to this step to identify it.
#' @return An updated version of `recipe` with the new step added to the
#'  sequence of existing steps (if any).
#' @keywords datagen
#' @concept preprocessing
#' @export
#' @details Unlike the `base::cut()` function there is no need to specify the
#'  min and the max values in the breaks. All values before the lowest break
#'  point will end up in the first bucket, and all values after the last break
#'  point will end up in the last bucket.
#'
#'  `step_cut()` will call `base::cut()` in the baking step with
#'  `include.lowest` set to `TRUE`.
#'
#' @examples
#' df <- data.frame(x = 1:10, y = 5:14)
#' rec <- recipe(df)
#'
#' # The min and max of the variable are used as boundaries
#' # if they exceed the breaks
#' rec %>%
#'   step_cut(x, breaks = 5) %>%
#'   prep() %>%
#'   bake(df)
#'
#' # You can use the same breaks on multiple variables
#' # then for each variable the boundaries are set separately
#' rec %>%
#'   step_cut(x, y, breaks = c(6, 9)) %>%
#'   prep() %>%
#'   bake(df)
#'
#' # It is up to you if you want values outside the
#' # range learned at prep to be included
#' new_df <- data.frame(x = 1:11)
#' rec %>%
#'   step_cut(x, breaks = 5, include_outside_range = TRUE) %>%
#'   prep() %>%
#'   bake(new_df)
#'
#' rec %>%
#'   step_cut(x, breaks = 5, include_outside_range = FALSE) %>%
#'   prep() %>%
#'   bake(new_df)
step_cut <- function(recipe,
                     ...,
                     role = NA,
                     trained = FALSE,
                     breaks,
                     include_outside_range = FALSE,
                     skip = FALSE,
                     id = rand_id("cut")) {
  # Bundle the selectors and the user's options into a `cut` step object.
  # The per-column break points are only computed later, in prep.step_cut().
  cut_step <- step_cut_new(
    terms = ellipse_check(...),
    role = role,
    trained = trained,
    breaks = breaks,
    include_outside_range = include_outside_range,
    skip = skip,
    id = id
  )

  # Register the step on the recipe and return the result of add_step().
  add_step(recipe, cut_step)
}

# Internal constructor for the `cut` step.
#
# Forwards every field, unchanged and in this exact order, to `step()` with
# `subclass = "cut"`. It is called by `step_cut()` (untrained step, `breaks`
# as supplied by the user) and by `prep.step_cut()` (trained step, `breaks`
# replaced by a named list with one break vector per selected column).
#
# NOTE(review): `step()` is defined outside this file; presumably it builds
# the classed step object from these fields -- confirm before reordering them.
step_cut_new <-
  function(terms, role, trained,
           breaks, include_outside_range, skip, id) {
    step(
      subclass = "cut",
      terms = terms,
      role = role,
      trained = trained,
      breaks = breaks,
      include_outside_range = include_outside_range,
      skip = skip,
      id = id
    )
  }

# Train the step: resolve the selected columns, then derive the full set of
# break points for each of them from the training data.
#
# Returns a new `cut` step with `trained = TRUE` whose `breaks` field is a
# list named by column, each element holding that column's sorted breaks.
prep.step_cut <- function(x, training, info = NULL, ...) {
  selected <- eval_select_recipes(x$terms, training, info)

  # check_type() is defined elsewhere; it is handed the selected columns.
  check_type(training[, selected])

  user_breaks <- x$breaks
  breaks_by_col <- vector("list", length(selected))
  names(breaks_by_col) <- selected

  for (nm in selected) {
    col_breaks <- create_full_breaks(
      training[, nm, drop = TRUE],
      breaks = user_breaks
    )
    breaks_by_col[[nm]] <- col_breaks
    # Aborts on a single break point, warns when only one bucket results.
    full_breaks_check(col_breaks)
  }

  step_cut_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    breaks = breaks_by_col,
    include_outside_range = x$include_outside_range,
    skip = x$skip,
    id = x$id
  )
}

# Extend the user-supplied break points so that they span the observed data.
#
# @param var    Numeric vector of training values; may contain `NA`.
# @param breaks Numeric vector of user-supplied cut points.
# @return `breaks`, sorted ascending, with the observed minimum prepended
#   when it lies below every break and the observed maximum appended when it
#   lies above every break.
#
# Missing values in `var` are ignored when looking up the observed range.
# Without this, `min(var)` / `max(var)` would be `NA` and the `if ()` tests
# below would abort with "missing value where TRUE/FALSE needed". When `var`
# has no non-missing values at all, the breaks are returned sorted but
# otherwise unchanged.
create_full_breaks <- function(var, breaks) {
  stopifnot(is.numeric(var), is.numeric(breaks))

  observed <- var[!is.na(var)]
  if (length(observed) > 0) {
    lowest <- min(observed)
    highest <- max(observed)
    if (lowest < min(breaks)) {
      breaks <- c(lowest, breaks)
    }
    if (highest > max(breaks)) {
      breaks <- c(highest, breaks)
    }
  }

  sort(breaks)
}

# Sanity-check the number of break points computed for one column.
#
# A single break point leaves nothing to cut and is an error; exactly two
# break points define a single bucket, which only triggers a warning.
# Any other length passes silently.
full_breaks_check <- function(breaks) {
  n_breaks <- length(breaks)
  if (n_breaks == 1) {
    rlang::abort("In step_cut: variable is invariant and equal to break point.")
  } else if (n_breaks == 2) {
    rlang::warn("In step_cut: this will create a factor with one value only.")
  }
}

# Apply the trained step: every column that has stored breaks is replaced by
# the factor produced by cut_var(), and the result is returned as a tibble.
bake.step_cut <- function(object, new_data, ...) {
  trained_breaks <- object$breaks
  widen <- object$include_outside_range

  for (nm in names(trained_breaks)) {
    new_data[, nm] <- cut_var(
      new_data[, nm, drop = TRUE],
      trained_breaks[[nm]],
      widen
    )
  }

  as_tibble(new_data)
}

# Cut one column into a factor using the breaks learned at prep time.
#
# @param var    Numeric vector to cut; may contain `NA`.
# @param breaks Sorted numeric break points for this column.
# @param include_outside_range Logical. When `TRUE`, the outermost breaks are
#   stretched to cover values of `var` that fall outside the trained range and
#   the outer level labels are rewritten with `min` / `max`. When `FALSE`,
#   such values become `NA`.
# @return A factor of the same length as `var`; missing inputs stay `NA`.
#
# Missing values are ignored when checking whether `var` exceeds the breaks;
# otherwise `min(var)` / `max(var)` would be `NA` and the comparisons below
# would abort with "missing value where TRUE/FALSE needed".
cut_var <- function(var, breaks, include_outside_range) {
  if (include_outside_range) {
    observed <- var[!is.na(var)]
    if (length(observed) > 0) {
      lowest <- min(observed)
      highest <- max(observed)
      if (lowest < min(breaks)) {
        breaks[1] <- lowest
      }
      if (highest > max(breaks)) {
        breaks[length(breaks)] <- highest
      }
    }
  }
  cutted_var <- cut(var, breaks, include.lowest = TRUE)
  if (include_outside_range) {
    cutted_var <- adjust_levels_min_max(cutted_var)
  }
  cutted_var
}

# Replace the lower bound of the first level with "min" and the upper bound
# of the last level with "max".
#
# This is necessary because bake.recipe first learns the original levels when
# prep.recipe is called and then reverts the levels when bake.recipe itself
# is called, so the labels must not depend on the data being baked. It is
# also a cleaner way to show open-ended outer buckets.
#
# Only the level labels are rewritten; the underlying codes are untouched, so
# `NA` entries stay `NA` and the level set is kept even for zero-length input.
adjust_levels_min_max <- function(x) {
  stopifnot(is.factor(x))
  levs <- levels(x)
  n_levs <- length(levs)

  # Nothing to relabel (cut() never produces this, but stay well-defined).
  if (n_levs == 0) {
    return(x)
  }

  # A single bucket spans the whole range.
  if (n_levs == 1) {
    levels(x) <- "[min,max]"
    return(x)
  }

  # "[a,b]" -> "[min,b]" for the first level,
  # "(c,d]" -> "(c,max]" for the last one.
  levs[1] <- sub("(?<=\\[)(.*?)(?=,)", "min", levs[1], perl = TRUE)
  levs[n_levs] <- sub("(?<=,)(.+?)(?=\\])", "max", levs[n_levs], perl = TRUE)

  levels(x) <- levs
  x
}

# Print a one-line summary of the step: a fixed prefix followed by whatever
# printer() renders for the columns / selectors. Returns `x` invisibly.
print.step_cut <- function(x, width = max(20, options()$width - 30), ...) {
  cat("Cut numeric for ")
  cut_columns <- names(x$breaks)
  printer(cut_columns, x$terms, x$trained, width = width)
  invisible(x)
}

#' @rdname step_cut
#' @param x A `step_cut` object.
#' @export
tidy.step_cut <- function(x, ...) {
  if (is_trained(x)) {
    # The trained break points are stored in `x$breaks` (a list named by
    # column, set in prep.step_cut()). The step has no `class_list` field, so
    # reading that field yields NULL and the `value` column cannot line up
    # with `terms`. Each column's breaks are collapsed into one "a-b-c"
    # string; vapply() guarantees a character vector even for zero columns.
    res <- tibble(
      terms = names(x$breaks),
      value = vapply(
        x$breaks,
        function(col_breaks) paste0(col_breaks, collapse = "-"),
        character(1),
        USE.NAMES = FALSE
      )
    )
  } else {
    # Before prep() only the selectors are known.
    # NOTE(review): `value` is a double NA here but character once trained.
    term_names <- sel2char(x$terms)
    res <- tibble(terms = term_names,
                  value = na_dbl)
  }
  res$id <- x$id
  res
}