File: verb-pivot-wider.R

package info (click to toggle)
r-cran-dbplyr 2.5.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 2,644 kB
  • sloc: sh: 13; makefile: 2
file content (451 lines) | stat: -rw-r--r-- 15,785 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
#' Pivot data from long to wide
#'
#' @description
#' `pivot_wider()` "widens" data, increasing the number of columns and
#' decreasing the number of rows. The inverse transformation is
#' `pivot_longer()`.
#' Learn more in `vignette("pivot", "tidyr")`.
#'
#' Note that `pivot_wider()` is not and cannot be lazy because we need to look
#' at the data to figure out what the new column names will be.
#' If you have a long running query you have two options:
#'
#' * (temporarily) store the result of the query via `compute()`.
#' * Create a spec before and use `dbplyr_pivot_wider_spec()` - dbplyr's version
#'   of `tidyr::pivot_wider_spec()`. Note that this function is only a temporary
#'   solution until `pivot_wider_spec()` becomes a generic. It will then be
#'   removed soon afterwards.
#'
#' @details
#' The big difference to `pivot_wider()` for local data frames is that
#' `values_fn` must not be `NULL`. By default it is `max()` which yields
#' the same results as for local data frames if the combination of `id_cols`
#' and `value` column uniquely identify an observation.
#' Mind that you also do not get a warning if an observation is not uniquely
#' identified.
#'
#' The translation to SQL code basically works as follows:
#'
#' 1. Get unique keys in `names_from` column.
#' 2. For each key value generate an expression of the form:
#'     ```sql
#'     value_fn(
#'       CASE WHEN (`names from column` == `key value`)
#'       THEN (`value column`)
#'       END
#'     ) AS `output column`
#'     ```
#' 3. Group data by id columns.
#' 4. Summarise the grouped data with the expressions from step 2.
#'
#' @param data A lazy data frame backed by a database query.
#' @param id_cols A set of columns that uniquely identifies each observation.
#' @param id_expand Unused; included for compatibility with the generic.
#' @param names_from,values_from A pair of
#'   arguments describing which column (or columns) to get the name of the
#'   output column (`names_from`), and which column (or columns) to get the
#'   cell values from (`values_from`).
#'
#'   If `values_from` contains multiple values, the value will be added to the
#'   front of the output column.
#' @param names_prefix String added to the start of every variable name.
#' @param names_sep If `names_from` or `values_from` contains multiple
#'   variables, this will be used to join their values together into a single
#'   string to use as a column name.
#' @param names_glue Instead of `names_sep` and `names_prefix`, you can supply
#'   a glue specification that uses the `names_from` columns (and special
#'   `.value`) to create custom column names.
#' @param names_sort Should the column names be sorted? If `FALSE`, the default,
#'   column names are ordered by first appearance.
#' @param names_vary When `names_from` identifies a column (or columns) with
#'   multiple unique values, and multiple `values_from` columns are provided,
#'   in what order should the resulting column names be combined?
#'
#'   - `"fastest"` varies `names_from` values fastest, resulting in a column
#'     naming scheme of the form: `value1_name1, value1_name2, value2_name1,
#'     value2_name2`. This is the default.
#'
#'   - `"slowest"` varies `names_from` values slowest, resulting in a column
#'     naming scheme of the form: `value1_name1, value2_name1, value1_name2,
#'     value2_name2`.
#' @param names_repair What happens if the output has invalid column names?
#' @param names_expand Should the values in the `names_from` columns be expanded
#'   by [expand()] before pivoting? This results in more columns, the output
#'   will contain column names corresponding to a complete expansion of all
#'   possible values in `names_from`. Additionally, the column names will be
#'   sorted, identical to what `names_sort` would produce.
#' @param values_fill Optionally, a (scalar) value that specifies what each
#'   `value` should be filled in with when missing.
#' @param values_fn A function, the default is `max()`, applied to the `value`
#' in each cell in the output. In contrast to local data frames it must not be
#' `NULL`.
#' @param unused_fn Optionally, a function applied to summarize the values from
#'   the unused columns (i.e. columns not identified by `id_cols`,
#'   `names_from`, or `values_from`).
#'
#'   The default drops all unused columns from the result.
#'
#'   This can be a named list if you want to apply different aggregations
#'   to different unused columns.
#'
#'   `id_cols` must be supplied for `unused_fn` to be useful, since otherwise
#'   all unspecified columns will be considered `id_cols`.
#'
#'   This is similar to grouping by the `id_cols` then summarizing the
#'   unused columns using `unused_fn`.
#' @param ... Unused; included for compatibility with generic.
#'
#' @examplesIf rlang::is_installed("tidyr", version = "1.0.0")
#' memdb_frame(
#'   id = 1,
#'   key = c("x", "y"),
#'   value = 1:2
#' ) %>%
#'   tidyr::pivot_wider(
#'     id_cols = id,
#'     names_from = key,
#'     values_from = value
#'   )
#' @exportS3Method tidyr::pivot_wider
pivot_wider.tbl_lazy <- function(data,
                                 ...,
                                 id_cols = NULL,
                                 id_expand = FALSE,
                                 names_from = name,
                                 names_prefix = "",
                                 names_sep = "_",
                                 names_glue = NULL,
                                 names_sort = FALSE,
                                 names_vary = "fastest",
                                 names_expand = FALSE,
                                 names_repair = "check_unique",
                                 values_from = value,
                                 values_fill = NULL,
                                 values_fn = ~ max(.x, na.rm = TRUE),
                                 unused_fn = NULL) {
  rlang::check_dots_empty()
  check_unsupported_arg(id_expand, FALSE)

  names_from <- enquo(names_from)
  values_from <- enquo(values_from)

  spec <- dbplyr_build_wider_spec(data,
    names_from = !!names_from,
    values_from = !!values_from,
    names_prefix = names_prefix,
    names_sep = names_sep,
    names_glue = names_glue,
    names_sort = names_sort,
    names_vary = names_vary,
    names_expand = names_expand,
    error_call = current_env()
  )

  id_cols <- build_wider_id_cols_expr(
    data = data,
    id_cols = {{id_cols}},
    names_from = !!names_from,
    values_from = !!values_from
  )

  dbplyr_pivot_wider_spec(
    data = data,
    spec = spec,
    ...,
    names_repair = names_repair,
    id_cols = !!id_cols,
    id_expand = FALSE,
    values_fill = values_fill,
    values_fn = values_fn,
    unused_fn = unused_fn,
    error_call = current_env()
  )
}

dbplyr_build_wider_spec <- function(data,
                                    names_from = name,
                                    values_from = value,
                                    names_prefix = "",
                                    names_sep = "_",
                                    names_glue = NULL,
                                    names_sort = FALSE,
                                    names_vary = "fastest",
                                    names_expand = FALSE,
                                    error_call = current_env()) {
  if (!inherits(data, "tbl_sql")) {
    cli_abort(c(
      "{.fun dbplyr_build_wider_spec} doesn't work with local lazy tibbles.",
      i = "Use {.fun memdb_frame} together with {.fun show_query} to see the SQL code."
    ))
  }

  # prepare a minimal local tibble we can pass to `tidyr::build_wider_spec`
  # 1. create a tibble with unique values in the `names_from` column
  # row_ids <- vec_unique(data[names_from])
  names_from <- tidyselect::eval_select(enquo(names_from), data) %>% names()
  if (is_empty(names_from)) {
    cli_abort("{.arg names_from} must select at least one column.")
  }
  distinct_data <- collect(distinct(data, !!!syms(names_from)))

  # 2. add `values_from` column
  values_from <- tidyselect::eval_select(enquo(values_from), data) %>% names()
  if (is_empty(values_from)) {
    cli_abort("{.arg values_from} must select at least one column.")
  }
  dummy_data <- vctrs::vec_cbind(
    distinct_data,
    !!!rlang::rep_named(values_from, list(TRUE)),
    .name_repair = "check_unique"
  )

  tidyr::build_wider_spec(dummy_data,
    names_from = !!enquo(names_from),
    values_from = !!enquo(values_from),
    names_prefix = names_prefix,
    names_sep = names_sep,
    names_glue = names_glue,
    names_sort = names_sort,
    names_vary = names_vary,
    names_expand = names_expand,
    error_call = error_call
  )
}

#' @inheritParams rlang::args_error_context
#' @inheritParams tidyr::pivot_wider_spec
#' @export
#' @rdname pivot_wider.tbl_lazy
dbplyr_pivot_wider_spec <- function(data,
                                    spec,
                                    ...,
                                    names_repair = "check_unique",
                                    id_cols = NULL,
                                    id_expand = FALSE,
                                    values_fill = NULL,
                                    values_fn = ~ max(.x, na.rm = TRUE),
                                    unused_fn = NULL,
                                    error_call = current_env()) {
  check_unsupported_arg(id_expand, FALSE)
  spec <- tidyr::check_pivot_spec(spec)

  names_from_cols <- names(spec)[-(1:2)]
  values_from_cols <- vctrs::vec_unique(spec$.value)

  id_cols <- select_wider_id_cols(
    data = data,
    id_cols = {{ id_cols }},
    names_from_cols = names_from_cols,
    values_from_cols = values_from_cols,
    error_call = error_call
  )

  values_fn <- check_list_of_functions(values_fn, values_from_cols, call = error_call)

  unused_cols <- setdiff(colnames(data), c(id_cols, names_from_cols, values_from_cols))
  unused_fn <- check_list_of_functions(unused_fn, unused_cols, call = error_call)
  unused_cols <- names(unused_fn)

  if (is.null(values_fill)) {
    values_fill <- list()
  } else if (is_scalar(values_fill)) {
    values_fill <- rep_named(values_from_cols, list(values_fill))
  } else if (!vctrs::vec_is_list(values_fill)) {
    cli::cli_abort(
      "{.arg values_fill} must be {.code NULL}, a scalar, or a named list, not {.obj_type_friendly {values_fill}}.",
      call = error_call
    )
  }
  values_fill <- values_fill[intersect(names(values_fill), values_from_cols)]

  missing_values <- setdiff(values_from_cols, names(values_fn))
  if (!is_empty(missing_values)) {
    cli_abort("{.arg values_fn} must specify a function for each col in {.arg values_from}")
  }

  n_unused_fn <- length(unused_fn)

  unused_col_expr <- vector("list", length = n_unused_fn)
  names(unused_col_expr) <- unused_cols

  for (i in seq_len(n_unused_fn)) {
    unused_col <- unused_cols[[i]]
    unused_fn_i <- unused_fn[[i]]

    unused_col_expr[[i]] <- resolve_fun(unused_fn_i, sym(unused_col), call = error_call)
  }

  spec_idx <- set_names(vctrs::vec_seq_along(spec), spec$.name)
  pivot_exprs <- with_indexed_errors(
    purrr::map(
      spec_idx,
      ~ build_pivot_wider_exprs(.x, spec, values_fill, values_fn, call = NULL)
    ),
    message = function(cnd) {
      col <- spec[[".value"]][cnd$location]
      cli::format_inline("Can't pivot column {.field {col}}:")
    },
    .error_call = error_call
  )

  non_id_cols <- c(names_from_cols, values_from_cols)
  key_vars <- setdiff(id_cols, non_id_cols)
  data_grouped <- group_by(data, !!!syms(key_vars), .add = TRUE)

  group_names <- group_vars(data_grouped)
  out_nms <- c(group_names, names(pivot_exprs))
  out_nms_repaired <- vctrs::vec_as_names(out_nms, repair = names_repair)

  if (!is_empty(group_names)) {
    out_nms_repaired <- out_nms_repaired[-(1:length(group_names))]
  }
  pivot_exprs <- set_names(pivot_exprs, out_nms_repaired)

  data_grouped %>%
    summarise(
      !!!pivot_exprs,
      !!!unused_col_expr,
      .groups = "drop"
    ) %>%
    group_by(!!!syms(group_vars(data)))
}

utils::globalVariables(c("name", "value"))

build_wider_id_cols_expr <- function(data,
                                     id_cols = NULL,
                                     names_from = name,
                                     values_from = value,
                                     call = caller_env()) {
  # COPIED FROM tidyr
  names_from <- tidyselect::eval_select(
    enquo(names_from),
    data,
    allow_rename = FALSE,
    error_call = error_call
  )

  values_from <- tidyselect::eval_select(
    enquo(values_from),
    data,
    allow_rename = FALSE,
    error_call = error_call
  )

  out <- select_wider_id_cols(
    data = data,
    id_cols = {{ id_cols }},
    names_from_cols = names(names_from),
    values_from_cols = names(values_from),
    error_call = error_call
  )

  expr(c(!!!out))
}

select_wider_id_cols <- function(data,
                                 id_cols = NULL,
                                 names_from_cols = character(),
                                 values_from_cols = character(),
                                 error_call = caller_env()) {
  # COPIED FROM tidyr
  id_cols <- enquo(id_cols)
  sim_data <- tidyselect_data_proxy(data)

  # Remove known non-id-cols so they are never selected
  sim_data <- sim_data[setdiff(names(sim_data), c(names_from_cols, values_from_cols))]

  if (quo_is_null(id_cols)) {
    # Default selects everything in `sim_data` after non-id-cols have been removed
    return(names(sim_data))
  }

  withCallingHandlers(
    id_cols <- tidyselect::eval_select(
      enquo(id_cols),
      sim_data,
      allow_rename = FALSE,
      error_call = error_call
    ),
    vctrs_error_subscript_oob = function(cnd) {
      rethrow_id_cols_oob(cnd, names_from_cols, values_from_cols, error_call)
    }
  )

  names(id_cols)
}

rethrow_id_cols_oob <- function(cnd, names_from_cols, values_from_cols, call) {
  i <- cnd[["i"]]

  check_string(i, .internal = TRUE)

  if (i %in% names_from_cols) {
    stop_id_cols_oob(i, "names_from", call = call)
  } else if (i %in% values_from_cols) {
    stop_id_cols_oob(i, "values_from", call = call)
  } else {
    # Zap this special handler, throw the normal condition
    zap()
  }
}

stop_id_cols_oob <- function(i, arg, call) {
  cli::cli_abort(
    c(
      "`id_cols` can't select a column already selected by `{arg}`.",
      i = "Column `{i}` has already been selected."
    ),
    parent = NA,
    call = call
  )
}

build_pivot_wider_exprs <- function(row_id, spec, values_fill, values_fn, call) {
  values_col <- spec[[".value"]][row_id]
  fill_value <- values_fill[[values_col]]

  keys <- vctrs::vec_slice(spec[, -(1:2)], row_id)
  keys_cond <- purrr::imap(
    keys,
    function(value, name) {
      if (is.na(value)) {
        expr(is.na(!!sym(name)))
      } else {
        expr(!!sym(name) == !!value)
      }
    }
  )
  keys_cond <- purrr::reduce(keys_cond, ~ expr(!!.x & !!.y))

  case_expr <- expr(ifelse(!!keys_cond, !!sym(values_col), !!fill_value))

  agg_fn <- values_fn[[values_col]]
  resolve_fun(agg_fn, case_expr, arg = paste0("values_fn$", values_col), call = call)
}

is_scalar <- function(x) {
  if (is.null(x)) {
    return(FALSE)
  }

  if (is.list(x)) {
    (length(x) == 1) && !have_name(x)
  } else {
    length(x) == 1
  }
}


resolve_fun <- function(x, var, arg = caller_arg(x), call = caller_env()) {
  if (is_formula(x)) {
    .fn_expr <- across_fun(x, env = empty_env(), dots = NULL, fn = "across")
    exec(.fn_expr, var, NULL)
  } else {
    fn_name <- find_fun(x)
    if (is_null(fn_name)) {
      cli_abort("Can't convert {.arg {arg}}, {.code {as_label(x)}}, to a function.", call = call)
    }
    call2(fn_name, var)
  }
}