File: bin2factor.R

package info (click to toggle)
r-cran-recipes 1.0.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 3,636 kB
  • sloc: sh: 37; makefile: 2
file content (139 lines) | stat: -rw-r--r-- 4,018 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#' Create a Factors from A Dummy Variable
#'
#' `step_bin2factor` creates a *specification* of a
#'  recipe step that will create a two-level factor from a single
#'  dummy variable.
#' @inheritParams step_center
#' @param levels A length 2 character string that indicates the
#'  factor levels for the 1's (in the first position) and the zeros
#'  (second)
#' @param ref_first Logical. Should the first level, which replaces
#' 1's, be the factor reference level?
#' @param columns A vector with the selected variable names. This
#'  is `NULL` until computed by [prep()].
#' @template step-return
#' @details This operation may be useful for situations where a
#'  binary piece of information may need to be represented as
#'  categorical instead of numeric. For example, naive Bayes models
#'  would do better to have factor predictors so that the binomial
#'  distribution is modeled instead of a Gaussian probability
#'  density of numeric binary data. Note that the numeric data is
#'  only verified to be numeric (and does not count levels).
#'
#'  # Tidying
#'
#'  When you [`tidy()`][tidy.recipe()] this step, a tibble with column
#'  `terms` (the columns that will be affected) is returned.
#'
#' @template case-weights-not-supported
#'
#' @family dummy variable and encoding steps
#' @export
#' @examplesIf rlang::is_installed("modeldata")
#' data(covers, package = "modeldata")
#'
#' rec <- recipe(~description, covers) %>%
#'   step_regex(description, pattern = "(rock|stony)", result = "rocks") %>%
#'   step_regex(description, pattern = "(rock|stony)", result = "more_rocks") %>%
#'   step_bin2factor(rocks)
#'
#' tidy(rec, number = 3)
#'
#' rec <- prep(rec, training = covers)
#' results <- bake(rec, new_data = covers)
#'
#' table(results$rocks, results$more_rocks)
#'
#' tidy(rec, number = 3)
step_bin2factor <-
  function(recipe,
           ...,
           role = NA,
           trained = FALSE,
           levels = c("yes", "no"),
           ref_first = TRUE,
           columns = NULL,
           skip = FALSE,
           id = rand_id("bin2factor")) {
    if (length(levels) != 2 | !is.character(levels)) {
      rlang::abort("`levels` should be a two element character string")
    }
    add_step(
      recipe,
      step_bin2factor_new(
        terms = enquos(...),
        role = role,
        trained = trained,
        levels = levels,
        ref_first = ref_first,
        columns = columns,
        skip = skip,
        id = id
      )
    )
  }

step_bin2factor_new <-
  function(terms, role, trained, levels, ref_first, columns, skip, id) {
    step(
      subclass = "bin2factor",
      terms = terms,
      role = role,
      trained = trained,
      levels = levels,
      ref_first = ref_first,
      columns = columns,
      skip = skip,
      id = id
    )
  }

#' @export
prep.step_bin2factor <- function(x, training, info = NULL, ...) {
  col_names <- recipes_eval_select(x$terms, training, info)
  check_type(training[, col_names], types = c("double", "integer", "logical"))

  step_bin2factor_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    levels = x$levels,
    ref_first = x$ref_first,
    columns = col_names,
    skip = x$skip,
    id = x$id
  )
}

bake.step_bin2factor <- function(object, new_data, ...) {
  check_new_data(names(object$columns), object, new_data)

  levs <- if (object$ref_first) object$levels else rev(object$levels)
  for (i in seq_along(object$columns)) {
    new_data[, object$columns[i]] <-
      factor(ifelse(
        getElement(new_data, object$columns[i]) == 1,
        object$levels[1],
        object$levels[2]
      ),
      levels = levs
      )
  }
  new_data
}

print.step_bin2factor <-
  function(x, width = max(20, options()$width - 30), ...) {
    title <- "Dummy variable to factor conversion for "
    print_step(x$columns, x$terms, x$trained, title, width)
    invisible(x)
  }


#' @rdname tidy.recipe
#' @export
tidy.step_bin2factor <- function(x, ...) {
  res <- simple_terms(x, ...)
  res$id <- x$id
  res
}