File: bin2factor.R

package info (click to toggle)
r-cran-recipes 0.1.15%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 2,496 kB
  • sloc: sh: 37; makefile: 2
file content (142 lines) | stat: -rw-r--r-- 4,426 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#' Create a Factors from A Dummy Variable
#'
#' `step_bin2factor` creates a *specification* of a
#'  recipe step that will create a two-level factor from a single
#'  dummy variable.
#' @inheritParams step_center
#' @inherit step_center return
#' @param ... Selector functions that choose which variables will
#'  be converted. See [selections()] for more details. For
#'  the `tidy` method, these are not currently used.
#' @param role Not used by this step since no new variables are
#'  created.
#' @param levels A length 2 character string that indicates the
#'  factor levels for the 1's (in the first position) and the zeros
#'  (second)
#' @param ref_first Logical. Should the first level, which replaces
#' 1's, be the factor reference level?
#' @param columns A vector with the selected variable names. This
#'  is `NULL` until computed by [prep.recipe()].
#' @return An updated version of `recipe` with the new step
#'  added to the sequence of existing steps (if any). For the
#'  `tidy` method, a tibble with columns `terms` (the
#'  columns that will be affected).
#' @details This operation may be useful for situations where a
#'  binary piece of information may need to be represented as
#'  categorical instead of numeric. For example, naive Bayes models
#'  would do better to have factor predictors so that the binomial
#'  distribution is modeled instead of a Gaussian probability
#'  density of numeric binary data. Note that the numeric data is
#'  only verified to be numeric (and does not count levels).
#' @keywords datagen
#' @concept preprocessing
#' @concept dummy_variables
#' @concept factors
#' @export
#' @examples
#' library(modeldata)
#' data(covers)
#'
#' rec <- recipe(~ description, covers) %>%
#'  step_regex(description, pattern = "(rock|stony)", result = "rocks") %>%
#'  step_regex(description, pattern = "(rock|stony)", result = "more_rocks") %>%
#'  step_bin2factor(rocks)
#'
#' tidy(rec, number = 3)
#'
#' rec <- prep(rec, training = covers)
#' results <- bake(rec, new_data = covers)
#'
#' table(results$rocks, results$more_rocks)
#'
#' tidy(rec, number = 3)
step_bin2factor <-
  function(recipe,
           ...,
           role = NA,
           trained = FALSE,
           levels = c("yes", "no"),
           ref_first = TRUE,
           columns = NULL,
           skip = FALSE,
           id = rand_id("bin2factor")) {
    if (length(levels) != 2 | !is.character(levels))
      rlang::abort("`levels` should be a two element character string")
    add_step(
      recipe,
      step_bin2factor_new(
        terms = ellipse_check(...),
        role = role,
        trained = trained,
        levels = levels,
        ref_first = ref_first,
        columns = columns,
        skip = skip,
        id = id
      )
    )
  }

step_bin2factor_new <-
  function(terms, role, trained, levels, ref_first, columns, skip, id) {
    step(
      subclass = "bin2factor",
      terms = terms,
      role = role,
      trained = trained,
      levels = levels,
      ref_first = ref_first,
      columns = columns,
      skip = skip,
      id = id
    )
  }

#' @export
prep.step_bin2factor <- function(x, training, info = NULL, ...) {
  col_names <- eval_select_recipes(x$terms, training, info)
  if (length(col_names) < 1)
    rlang::abort("The selector should only select at least one variable")
  if (any(info$type[info$variable %in% col_names] != "numeric"))
    rlang::abort("The variables should be numeric")
  step_bin2factor_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    levels = x$levels,
    ref_first = x$ref_first,
    columns = col_names,
    skip = x$skip,
    id = x$id
  )
}

bake.step_bin2factor <- function(object, new_data, ...) {
  levs <- if (object$ref_first) object$levels else rev(object$levels)
  for (i in seq_along(object$columns))
    new_data[, object$columns[i]] <-
      factor(ifelse(
        getElement(new_data, object$columns[i]) == 1,
        object$levels[1],
        object$levels[2]
      ),
      levels = levs)
  new_data
}

print.step_bin2factor <-
  function(x, width = max(20, options()$width - 30), ...) {
    cat("Dummy variable to factor conversion for ", sep = "")
    printer(x$columns, x$terms, x$trained, width = width)
    invisible(x)
  }


#' @rdname step_bin2factor
#' @param x A `step_bin2factor` object.
#' @export
tidy.step_bin2factor <- function(x, ...) {
  res <-simple_terms(x, ...)
  res$id <- x$id
  res
}