File: blueprint-xy-default.R

package info (click to toggle)
r-cran-hardhat 1.2.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,656 kB
  • sloc: sh: 13; makefile: 2
file content (380 lines) | stat: -rw-r--r-- 11,127 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
#' Default XY blueprint
#'
#' This pages holds the details for the XY preprocessing blueprint. This
#' is the blueprint used by default from `mold()` if `x` and `y` are provided
#' separately (i.e. the XY interface is used).
#'
#' @inheritParams new-blueprint
#'
#' @param x A data frame or matrix containing the predictors.
#'
#' @param y A data frame, matrix, or vector containing the outcomes.
#'
#' @param blueprint A preprocessing `blueprint`. If left as `NULL`, then a
#' [default_xy_blueprint()] is used.
#'
#' @param ... Not used.
#'
#' @return
#'
#' For `default_xy_blueprint()`, an XY blueprint.
#'
#' @details
#'
#' As documented in [standardize()], if `y` is a _vector_, then the returned
#' outcomes tibble has 1 column with a standardized name of `".outcome"`.
#'
#' The one special thing about the XY method's forge function is the behavior of
#' `outcomes = TRUE` when a _vector_ `y` value was provided to the original
#' call to [mold()]. In that case, `mold()` converts `y` into a tibble, with
#' a default name of `.outcome`. This is the column that `forge()` will look
#' for in `new_data` to preprocess. See the examples section for a
#' demonstration of this.
#'
#' @section Mold:
#'
#' When `mold()` is used with the default xy blueprint:
#'
#' - It converts `x` to a tibble.
#'
#' - It adds an intercept column to `x` if `intercept = TRUE`.
#'
#' - It runs [standardize()] on `y`.
#'
#' @section Forge:
#'
#' When `forge()` is used with the default xy blueprint:
#'
#' - It calls [shrink()] to trim `new_data` to only the required columns and
#' coerce `new_data` to a tibble.
#'
#' - It calls [scream()] to perform validation on the structure of the columns
#' of `new_data`.
#'
#' - It adds an intercept column onto `new_data` if `intercept = TRUE`.
#'
#' @examples
#' # ---------------------------------------------------------------------------
#' # Setup
#'
#' train <- iris[1:100, ]
#' test <- iris[101:150, ]
#'
#' train_x <- train[, "Sepal.Length", drop = FALSE]
#' train_y <- train[, "Species", drop = FALSE]
#'
#' test_x <- test[, "Sepal.Length", drop = FALSE]
#' test_y <- test[, "Species", drop = FALSE]
#'
#' # ---------------------------------------------------------------------------
#' # XY Example
#'
#' # First, call mold() with the training data
#' processed <- mold(train_x, train_y)
#'
#' # Then, call forge() with the blueprint and the test data
#' # to have it preprocess the test data in the same way
#' forge(test_x, processed$blueprint)
#'
#' # ---------------------------------------------------------------------------
#' # Intercept
#'
#' processed <- mold(train_x, train_y, blueprint = default_xy_blueprint(intercept = TRUE))
#'
#' forge(test_x, processed$blueprint)
#'
#' # ---------------------------------------------------------------------------
#' # XY Method and forge(outcomes = TRUE)
#'
#' # You can request that the new outcome columns are preprocessed as well, but
#' # they have to be present in `new_data`!
#'
#' processed <- mold(train_x, train_y)
#'
#' # Can't do this!
#' try(forge(test_x, processed$blueprint, outcomes = TRUE))
#'
#' # Need to use the full test set, including `y`
#' forge(test, processed$blueprint, outcomes = TRUE)
#'
#' # With the XY method, if the Y value used in `mold()` is a vector,
#' # then a column name of `.outcome` is automatically generated.
#' # This name is what forge() looks for in `new_data`.
#'
#' # Y is a vector!
#' y_vec <- train_y$Species
#'
#' processed_vec <- mold(train_x, y_vec)
#'
#' # This throws an informative error that tell you
#' # to include an `".outcome"` column in `new_data`.
#' try(forge(iris, processed_vec$blueprint, outcomes = TRUE))
#'
#' test2 <- test
#' test2$.outcome <- test2$Species
#' test2$Species <- NULL
#'
#' # This works, and returns a tibble in the $outcomes slot
#' forge(test2, processed_vec$blueprint, outcomes = TRUE)
#'
#' # ---------------------------------------------------------------------------
#' # Matrix output for predictors
#'
#' # You can change the `composition` of the predictor data set
#' bp <- default_xy_blueprint(composition = "dgCMatrix")
#' processed <- mold(train_x, train_y, blueprint = bp)
#' class(processed$predictors)
#' @export
default_xy_blueprint <- function(intercept = FALSE,
                                 allow_novel_levels = FALSE,
                                 composition = "tibble") {
  new_default_xy_blueprint(
    intercept = intercept,
    allow_novel_levels = allow_novel_levels,
    composition = composition
  )
}

#' Create a new default blueprint
#'
#' This page contains the constructors for the default blueprints. They can be
#' extended if you want to add extra behavior on top of what the default
#' blueprints already do, but generally you will extend the non-default versions
#' of the constructors found in the documentation for [new_blueprint()].
#'
#' @inheritParams new_xy_blueprint
#' @inheritParams new_formula_blueprint
#' @inheritParams new_recipe_blueprint
#'
#' @name new-default-blueprint
#' @export
new_default_xy_blueprint <- function(intercept = FALSE,
                                     allow_novel_levels = FALSE,
                                     composition = "tibble",
                                     ptypes = NULL,
                                     ...,
                                     subclass = character()) {
  new_xy_blueprint(
    intercept = intercept,
    allow_novel_levels = allow_novel_levels,
    composition = composition,
    ptypes = ptypes,
    ...,
    subclass = c(subclass, "default_xy_blueprint")
  )
}

#' @export
refresh_blueprint.default_xy_blueprint <- function(blueprint) {
  do.call(new_default_xy_blueprint, as.list(blueprint))
}

# ------------------------------------------------------------------------------

#' @param x A data frame or matrix containing the predictors.
#'
#' @param y A data frame, matrix, or vector containing the outcomes.
#'
#' @rdname run-mold
#' @export
run_mold.default_xy_blueprint <- function(blueprint, ..., x, y) {
  check_dots_empty0(...)

  cleaned <- mold_xy_default_clean(blueprint = blueprint, x = x, y = y)

  blueprint <- cleaned$blueprint
  x <- cleaned$x
  y <- cleaned$y

  mold_xy_default_process(blueprint = blueprint, x = x, y = y)
}

# ------------------------------------------------------------------------------
# mold - xy - clean

mold_xy_default_clean <- function(blueprint, x, y) {
  cleaned <- mold_xy_default_clean_predictors(blueprint, x)

  blueprint <- cleaned$blueprint
  x <- cleaned$x

  # Special case `y = NULL` as a 0 column variation on `x`
  if (is.null(y)) {
    y <- x[, 0L, drop = FALSE]
  }

  cleaned <- mold_xy_default_clean_outcomes(blueprint, y)

  blueprint <- cleaned$blueprint
  y <- cleaned$y

  new_mold_clean_xy(blueprint, x, y)
}

mold_xy_default_clean_predictors <- function(blueprint, x) {
  x <- tibble::as_tibble(x)
  list(blueprint = blueprint, x = x)
}

mold_xy_default_clean_outcomes <- function(blueprint, y) {
  y <- standardize(y)
  list(blueprint = blueprint, y = y)
}

# ------------------------------------------------------------------------------
# mold - xy - process

mold_xy_default_process <- function(blueprint, x, y) {
  processed <- mold_xy_default_process_predictors(blueprint, x)

  blueprint <- processed$blueprint
  predictors <- processed$data
  predictors_ptype <- processed$ptype
  predictors_extras <- processed$extras

  processed <- mold_xy_default_process_outcomes(blueprint, y)

  blueprint <- processed$blueprint
  outcomes <- processed$data
  outcomes_ptype <- processed$ptype
  outcomes_extras <- processed$extras

  ptypes <- new_ptypes(predictors_ptype, outcomes_ptype)
  extras <- new_extras(predictors_extras, outcomes_extras)

  blueprint <- update_blueprint(blueprint, ptypes = ptypes)

  new_mold_process(predictors, outcomes, blueprint, extras)
}

mold_xy_default_process_predictors <- function(blueprint, x) {

  # Important! Collect ptype before adding intercept!
  ptype <- extract_ptype(x)

  x <- maybe_add_intercept_column(x, blueprint$intercept)

  x <- recompose(x, blueprint$composition)

  new_mold_process_terms(
    blueprint = blueprint,
    data = x,
    ptype = ptype
  )
}

mold_xy_default_process_outcomes <- function(blueprint, y) {
  ptype <- extract_ptype(y)

  new_mold_process_terms(
    blueprint = blueprint,
    data = y,
    ptype = ptype
  )
}

# ------------------------------------------------------------------------------

#' @rdname run-forge
#' @export
run_forge.default_xy_blueprint <- function(blueprint,
                                           new_data,
                                           ...,
                                           outcomes = FALSE) {
  check_dots_empty0(...)

  cleaned <- forge_xy_default_clean(
    blueprint = blueprint,
    new_data = new_data,
    outcomes = outcomes
  )

  blueprint <- cleaned$blueprint
  predictors <- cleaned$predictors
  outcomes <- cleaned$outcomes
  extras <- cleaned$extras

  forge_xy_default_process(
    blueprint = blueprint,
    predictors = predictors,
    outcomes = outcomes,
    extras = extras
  )
}

# ------------------------------------------------------------------------------

forge_xy_default_clean <- function(blueprint, new_data, outcomes) {
  validate_is_new_data_like(new_data)
  validate_has_unique_column_names(new_data, "new_data")
  validate_is_bool(outcomes)

  predictors <- shrink(new_data, blueprint$ptypes$predictors)

  predictors <- scream(
    predictors,
    blueprint$ptypes$predictors,
    allow_novel_levels = blueprint$allow_novel_levels
  )

  if (outcomes) {
    outcomes <- shrink(new_data, blueprint$ptypes$outcomes)
    # Never allow novel levels for outcomes
    outcomes <- scream(outcomes, blueprint$ptypes$outcomes)
  } else {
    outcomes <- NULL
  }

  new_forge_clean(blueprint, predictors, outcomes)
}

# ------------------------------------------------------------------------------

forge_xy_default_process <- function(blueprint, predictors, outcomes, extras) {
  processed <- forge_xy_default_process_predictors(blueprint, predictors)

  blueprint <- processed$blueprint
  predictors <- processed$data
  predictors_extras <- processed$extras

  processed <- forge_xy_default_process_outcomes(blueprint, outcomes)

  blueprint <- processed$blueprint
  outcomes <- processed$data
  outcomes_extras <- processed$extras

  extras <- c(
    extras,
    new_extras(predictors_extras, outcomes_extras)
  )

  new_forge_process(predictors, outcomes, extras)
}

forge_xy_default_process_predictors <- function(blueprint, predictors) {
  predictors <- maybe_add_intercept_column(predictors, blueprint$intercept)

  predictors <- recompose(predictors, blueprint$composition)

  new_forge_process_terms(
    blueprint = blueprint,
    data = predictors
  )
}

forge_xy_default_process_outcomes <- function(blueprint, outcomes) {

  # no outcomes to process
  if (is.null(outcomes)) {
    result <- new_forge_process_terms(
      blueprint = blueprint,
      data = outcomes
    )
    return(result)
  }

  new_forge_process_terms(
    blueprint = blueprint,
    data = outcomes
  )
}