File: verb-pivot-longer.R

package info (click to toggle)
r-cran-dbplyr 2.3.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 2,376 kB
  • sloc: sh: 13; makefile: 2
file content (282 lines) | stat: -rw-r--r-- 9,108 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#' Pivot data from wide to long
#'
#' @description
#' `pivot_longer()` "lengthens" data, increasing the number of rows and
#' decreasing the number of columns. The inverse transformation is
#' `tidyr::pivot_wider()]
#'
#' Learn more in `vignette("pivot", "tidyr")`.
#'
#' While most functionality is identical there are some differences to
#' `pivot_longer()` on local data frames:
#' * the output is sorted differently/not explicitly,
#' * the coercion of mixed column types is left to the database,
#' * `values_ptypes` NOT supported.
#'
#' Note that `build_longer_spec()` and `pivot_longer_spec()` do not work with
#' remote tables.
#'
#' @details
#' The SQL translation basically works as follows:
#'
#' 1. split the specification by its key columns i.e. by variables crammed
#' into the column names.
#' 2. for each part in the splitted specification `transmute()` `data` into the
#' following columns
#'   * id columns i.e. columns that are not pivotted
#'   * key columns
#'   * value columns i.e. columns that are pivotted
#' 3. combine all the parts with `union_all()`
#'
#' @param data A data frame to pivot.
#' @param cols Columns to pivot into longer format.
#' @param names_to A string specifying the name of the column to create
#'   from the data stored in the column names of `data`.
#' @param names_prefix A regular expression used to remove matching text
#'   from the start of each variable name.
#' @param names_sep,names_pattern If `names_to` contains multiple values,
#'   these arguments control how the column name is broken up.
#' @param names_repair What happens if the output has invalid column names?
#' @param values_to A string specifying the name of the column to create
#'   from the data stored in cell values. If `names_to` is a character
#'   containing the special `.value` sentinel, this value will be ignored,
#'   and the name of the value column will be derived from part of the
#'   existing column names.
#' @param values_drop_na If `TRUE`, will drop rows that contain only `NA`s
#'   in the `value_to` column.
#' @param names_transform,values_transform A list of column name-function pairs.
#' @param names_ptypes A list of column name-prototype pairs.
#' @param values_ptypes Not supported.
#' @param ... Additional arguments passed on to methods.
#' @examplesIf rlang::is_installed("tidyr", version = "1.0.0")
#' # See vignette("pivot") for examples and explanation
#'
#' # Simplest case where column names are character data
#' memdb_frame(
#'   id = c("a", "b"),
#'   x = 1:2,
#'   y = 3:4
#' ) %>%
#'   tidyr::pivot_longer(-id)
pivot_longer.tbl_lazy <- function(data,
                                  cols,
                                  names_to = "name",
                                  names_prefix = NULL,
                                  names_sep = NULL,
                                  names_pattern = NULL,
                                  names_ptypes = NULL,
                                  names_transform = NULL,
                                  names_repair = "check_unique",
                                  values_to = "value",
                                  values_drop_na = FALSE,
                                  values_ptypes,
                                  values_transform = NULL,
                                  ...) {
  if (!is_missing(values_ptypes)) {
    cli_abort("The {.arg values_ptypes} argument is not supported for remote back-ends")
  }

  rlang::check_dots_empty()

  cols <- enquo(cols)
  spec <- tidyr::build_longer_spec(tidyselect_data_proxy(data), !!cols,
    names_to = names_to,
    values_to = values_to,
    names_prefix = names_prefix,
    names_sep = names_sep,
    names_pattern = names_pattern,
    names_ptypes = names_ptypes,
    names_transform = names_transform
  )

  dbplyr_pivot_longer_spec(data, spec,
    names_repair = names_repair,
    values_drop_na = values_drop_na,
    values_transform = values_transform
  )
}

dbplyr_pivot_longer_spec <- function(data,
                                     spec,
                                     names_repair = "check_unique",
                                     values_drop_na = FALSE,
                                     values_transform = NULL) {
  spec <- tidyr::check_pivot_spec(spec)
  # .seq col needed if different input columns are mapped to the same output
  # column
  spec <- deduplicate_spec(spec, data)

  id_cols <- syms(setdiff(colnames(data), spec$.name))
  repair_info <- apply_name_repair_pivot_longer(id_cols, spec, names_repair)
  id_cols <- repair_info$id_cols
  spec <- repair_info$spec

  spec_split <- vctrs::vec_split(spec, spec[, -(1:2)])

  call <- current_env()
  value_names <- unique(spec$.value)
  values_transform <- check_list_of_functions(values_transform, value_names, "values_transform", call)

  nms_map <- tibble(
    name = colnames(spec_split$key),
    name_mapped = ifelse(
      name %in% unlist(spec_split$key),
      paste0("..", name),
      name
    )
  )
  spec_split$key <- set_names(spec_split$key, nms_map$name_mapped)

  data_long_list <- purrr::map(
    vctrs::vec_seq_along(spec_split),
    function(idx) {
      row <- spec_split$val[[idx]][, 1:2]
      keys <- spec_split$key[idx, ]
      keys$.seq <- NULL

      measure_cols_exprs <- get_measure_column_exprs(
        row[[".name"]],
        row[[".value"]],
        values_transform,
        data = data,
        call = call
      )

      transmute(
        data,
        !!!id_cols,
        !!!keys,
        !!!measure_cols_exprs
      )
    }
  )

  data_long <- purrr::reduce(data_long_list, union_all)


  if (values_drop_na) {
    value_cols <- unique(spec$.value)

    data_long <- dplyr::filter_at(
      data_long,
      value_cols,
      dplyr::all_vars(!is.na(.))
    )
  }

  data_long %>%
    rename(!!!tibble::deframe(nms_map))
}

get_measure_column_exprs <- function(name, value, values_transform, data, call) {
  measure_cols <- set_names(syms(name), value)
  purrr::imap(
    measure_cols,
    ~ {
      f_trans <- values_transform[[.y]]

      if (is_null(f_trans)) {
        .x
      } else {
        resolve_fun(f_trans, .x, data, call)
      }
    }
  )
}

apply_name_repair_pivot_longer <- function(id_cols, spec, names_repair) {
  # Calculates how the column names in `pivot_longer()` need to be repaired
  # and applies this to the `id_cols` and the `spec` argument:
  # * The names of `id_cols` are the repaired id column names
  # * The `spec` columns after the third column are renamed to the repaired name
  # * The entries in the `value` column of `spec` are changed to the repaired name

  nms_map_df <- vctrs::vec_rbind(
    tibble(from = "id_cols", name = as.character(id_cols)),
    tibble(from = "measure_cols", name = colnames(spec)[-(1:2)]),
    tibble(from = "value_cols", name = unique(spec[[".value"]]))
  ) %>%
    mutate(name_rep = vctrs::vec_as_names(name, repair = names_repair))
  nms_map <- split(nms_map_df, nms_map_df$from)

  id_cols <- purrr::set_names(id_cols, nms_map$id_cols$name_rep)

  colnames(spec)[-(1:2)] <- nms_map$measure_cols$name_rep

  value_nms_map <- purrr::set_names(
    nms_map$value_cols$name_rep,
    nms_map$value_cols$name
  )
  spec$.value <- dplyr::recode(spec$.value, !!!value_nms_map)

  list(id_cols = id_cols, spec = spec)
}

# The following is copy-pasted from `tidyr`

# nocov start
# Ensure that there's a one-to-one match from spec to data by adding
# a special .seq variable which is automatically removed after pivotting.
deduplicate_spec <- function(spec, df) {
  # COPIED FROM tidyr

  # Ensure each .name has a unique output identifier
  key <- spec[setdiff(names(spec), ".name")]
  if (vctrs::vec_duplicate_any(key)) {
    pos <- vctrs::vec_group_loc(key)$loc
    seq <- vector("integer", length = nrow(spec))
    for (i in seq_along(pos)) {
      seq[pos[[i]]] <- seq_along(pos[[i]])
    }
    spec$.seq <- seq
  }

  # Match spec to data, handling duplicated column names
  col_id <- vctrs::vec_match(names(df), spec$.name)
  has_match <- !is.na(col_id)

  if (!vctrs::vec_duplicate_any(col_id[has_match])) {
    return(spec)
  }

  spec <- vctrs::vec_slice(spec, col_id[has_match])
  # Need to use numeric indices because names only match first
  spec$.name <- seq_along(df)[has_match]

  pieces <- vctrs::vec_split(seq_len(nrow(spec)), col_id[has_match])
  copy <- integer(nrow(spec))
  for (i in seq_along(pieces$val)) {
    idx <- pieces$val[[i]]
    copy[idx] <- seq_along(idx)
  }

  spec$.seq <- copy
  spec
}

check_list_of_functions <- function(x, names, arg, call = caller_env()) {
  # mostly COPIED FROM tidyr
  if (is.null(x)) {
    x <- set_names(list(), character())
  }

  if (!vctrs::vec_is_list(x)) {
    x <- rep_named(names, list(x))
  }

  if (length(x) > 0L && !is_named(x)) {
    cli_abort("All elements of {.arg {arg}} must be named.", call = call)
  }

  if (vctrs::vec_duplicate_any(names(x))) {
    cli_abort("The names of {.arg {arg}} must be unique.", call = call)
  }

  # Silently drop user supplied names not found in the data
  x <- x[intersect(names(x), names)]

  x
}
# nocov end

globalVariables(".")