1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
|
#' Subset rows using their positions
#'
#' @description
#' These are methods for the dplyr generics [slice_min()], [slice_max()], and
#' [slice_sample()]. They are translated to SQL using [filter()] and
#' window functions (`ROWNUMBER`, `MIN_RANK`, or `CUME_DIST` depending on
#' arguments). `slice()`, `slice_head()`, and `slice_tail()` are not supported
#' since database tables have no intrinsic order.
#'
#' If data is grouped, the operation will be performed on each group so that
#' (e.g.) `slice_min(db, x, n = 3)` will select the three rows with the smallest
#' value of `x` in each group.
#'
#' @inheritParams arrange.tbl_lazy
#' @inheritParams dplyr::slice
#' @inheritParams args_by
#' @param ... Not used.
#' @param n,prop Provide either `n`, the number of rows, or `prop`, the
#' proportion of rows to select. If neither are supplied, `n = 1` will be
#' used.
#'
#' If `n` is greater than the number of rows in the group (or `prop` > 1),
#' the result will be silently truncated to the group size. If the proportion
#' of a group size is not an integer, it is rounded down.
#' @param order_by Variable or function of variables to order by.
#' @param with_ties Should ties be kept together? The default, `TRUE`, may
#' return more rows than you request. Use FALSE to ignore ties, and return
#' the first n rows.
#' @param weight_by,replace Not supported for database backends.
#' @name dbplyr-slice
#' @aliases NULL
#' @examples
#' library(dplyr, warn.conflicts = FALSE)
#'
#' db <- memdb_frame(x = 1:3, y = c(1, 1, 2))
#' db %>% slice_min(x) %>% show_query()
#' db %>% slice_max(x) %>% show_query()
#' db %>% slice_sample() %>% show_query()
#'
#' db %>% group_by(y) %>% slice_min(x) %>% show_query()
#'
#' # By default, ties are includes so you may get more rows
#' # than you expect
#' db %>% slice_min(y, n = 1)
#' db %>% slice_min(y, n = 1, with_ties = FALSE)
#'
#' # Non-integer group sizes are rounded down
#' db %>% slice_min(x, prop = 0.5)
NULL
#' @importFrom dplyr slice
#' @export
slice.tbl_lazy <- function(.data, ...) {
cli_abort("{.fun slice} is not supported on database backends")
}
#' @importFrom dplyr slice_head
#' @export
slice_head.tbl_lazy <- function(.data, ..., n, prop, by = NULL) {
cli_abort(c(
"{.fun slice_head} is not supported on database backends",
i = "Please use {.fun slice_min} instead"
))
}
#' @importFrom dplyr slice_tail
#' @export
slice_tail.tbl_lazy <- function(.data, ..., n, prop, by = NULL) {
cli_abort(c(
"{.fun slice_tail} is not supported on database backends",
i = "Please use {.fun slice_max} instead"
))
}
#' @rdname dbplyr-slice
#' @importFrom dplyr slice_min
#' @export
slice_min.tbl_lazy <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE) {
check_required(order_by)
size <- check_slice_size(n, prop)
slice_by(.data, {{order_by}}, size, {{ by }}, with_ties = with_ties)
}
#' @rdname dbplyr-slice
#' @importFrom dplyr slice_max
#' @export
slice_max.tbl_lazy <- function(.data, order_by, ..., n, by = NULL, prop, with_ties = TRUE) {
check_required(order_by)
size <- check_slice_size(n, prop)
slice_by(.data, desc({{order_by}}), size, {{ by }}, with_ties = with_ties)
}
#' @rdname dbplyr-slice
#' @importFrom dplyr slice_sample
#' @export
slice_sample.tbl_lazy <- function(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) {
size <- check_slice_size(n, prop)
weight_by <- enquo(weight_by)
if (size$type == "prop") {
cli_abort("Sampling by {.arg prop} is not supported on database backends")
}
if (!quo_is_null(weight_by)) {
cli_abort("Weighted resampling is not supported on database backends")
}
if (replace) {
cli_abort("Sampling with replacement is not supported on database backends")
}
slice_by(.data, !!sql_random(remote_con(.data)), size, {{ by }}, with_ties = FALSE)
}
slice_by <- function(.data, order_by, size, .by, with_ties = FALSE) {
by <- compute_by(
{{ .by }},
.data,
by_arg = "by",
data_arg = "data",
error_call = caller_env()
)
if (by$from_by) {
.data$lazy_query$group_vars <- by$names
}
old_frame <- op_sort(.data)
if (with_ties) {
window_fun <- switch(size$type,
n = expr(min_rank() <= !!size$n),
prop = expr(cume_dist() <= !!size$prop)
)
} else {
window_fun <- switch(size$type,
n = expr(row_number() <= !!size$n),
prop = cli_abort("Can only use {.arg prop} when {.code with_ties = TRUE}")
)
}
out <- .data %>%
window_order({{order_by}}) %>%
filter(!!window_fun) %>%
window_order(!!!old_frame)
if (by$from_by) {
out$lazy_query$group_vars <- character()
}
out
}
# helpers -----------------------------------------------------------------
check_slice_size <- function(n, prop) {
if (missing(n) && missing(prop)) {
list(type = "n", n = 1L)
} else if (!missing(n) && missing(prop)) {
if (!is.numeric(n) || length(n) != 1) {
cli_abort("{.arg n} must be a single number.", call = caller_env())
}
if (is.na(n) || n < 0) {
cli_abort("{.arg n} must be a non-missing positive number.", call = caller_env())
}
list(type = "n", n = as.integer(n))
} else if (!missing(prop) && missing(n)) {
if (!is.numeric(prop) || length(prop) != 1) {
cli_abort("{.arg prop} must be a single number", call = caller_env())
}
if (is.na(prop) || prop < 0) {
cli_abort("{.arg prop} must be a non-missing positive number.", call = caller_env())
}
list(type = "prop", prop = prop)
} else {
cli_abort("Must supply exactly one of {.arg n} and {.arg prop} arguments.", call = caller_env())
}
}
globalVariables(c("min_rank", "cume_dist", "row_number", "desc"))
|