File: verb-distinct.R

package info (click to toggle)
r-cran-dbplyr 2.3.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 2,376 kB
  • sloc: sh: 13; makefile: 2
file content (69 lines) | stat: -rw-r--r-- 1,848 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#' Subset distinct/unique rows
#'
#' This is a method for the dplyr [distinct()] generic. It adds the
#' `DISTINCT` clause to the SQL query.
#'
#' @inheritParams arrange.tbl_lazy
#' @inheritParams dplyr::distinct
#' @inherit arrange.tbl_lazy return
#' @export
#' @importFrom dplyr distinct
#' @examples
#' library(dplyr, warn.conflicts = FALSE)
#'
#' db <- memdb_frame(x = c(1, 1, 2, 2), y = c(1, 2, 1, 1))
#' db %>% distinct() %>% show_query()
#' db %>% distinct(x) %>% show_query()
distinct.tbl_lazy <- function(.data, ..., .keep_all = FALSE) {
  grps <- syms(op_grps(.data))
  empty_dots <- dots_n(...) == 0
  can_use_distinct <- !.keep_all || (empty_dots && is_empty(grps))
  if (!can_use_distinct) {
    .data <- .data %>%
      group_by(..., .add = TRUE) %>%
      filter(row_number() == 1L) %>%
      group_by(!!!grps)

    return(.data)
  }

  if (empty_dots) {
    dots <- quos(!!!syms(colnames(.data)))
  } else {
    dots <- enquos(...)
  }
  prep <- dplyr::distinct_prepare(.data, dots, group_vars = group_vars(.data))
  out <- dplyr::select(prep$data, prep$keep)

  out$lazy_query <- add_distinct(out)
  out
}

add_distinct <- function(.data) {
  lazy_query <- .data$lazy_query

  out <- lazy_select_query(
    x = lazy_query,
    distinct = TRUE
  )
  # TODO this could also work for joins
  if (!inherits(lazy_query, "lazy_select_query")) {
    return(out)
  }

  # Optimisation overview
  # * `distinct()` adds the `DISTINCT` clause to `SELECT`
  # * `WHERE`, `GROUP BY`, and `HAVING` are executed before `SELECT`
  #   => they do not matter
  # * `ORDER BY`
  #   => but `arrange()` should not have an influence on `distinct()` so it
  #      should not matter
  # * `LIMIT` are executed after `SELECT`
  #   => needs a subquery
  if (!is_null(lazy_query$limit)) {
    return(out)
  }

  lazy_query$distinct <- TRUE
  lazy_query
}