1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
|
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This code generates dplyr-funcs-doc.R.
# It requires that the package be installed.
# sprintf() template for the generated R/dplyr-funcs-doc.R file: the license
# header followed by the roxygen block for the `acero` help topic.
# It contains five `%s` slots which the sprintf() call at the end of this
# script fills, in order, with:
#   1. the number of supported dplyr verbs
#   2. the number of mapped R functions
#   3. the bulleted list of dplyr verbs
#   4. the number of Arrow compute functions
#   5. the per-package lists of function mappings
# Because this string is a sprintf() format, any literal percent sign added to
# it must be written as `%%`.
file_template <- "# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# \"License\"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Generated by using data-raw/docgen.R -> do not edit by hand
#' Functions available in Arrow dplyr queries
#'
#' The `arrow` package contains methods for %s `dplyr` table functions, many of
#' which are \"verbs\" that do transformations to one or more tables.
#' The package also has mappings of %s R functions to the corresponding
#' functions in the Arrow compute library. These allow you to write code inside
#' of `dplyr` methods that call R functions, including many in packages like
#' `stringr` and `lubridate`, and they will get translated to Arrow and run
#' on the Arrow query engine (Acero). This document lists all of the mapped
#' functions.
#'
#' # `dplyr` verbs
#'
#' Most verb functions return an `arrow_dplyr_query` object, similar in spirit
#' to a `dbplyr::tbl_lazy`. This means that the verbs do not eagerly evaluate
#' the query on the data. To run the query, call either `compute()`,
#' which returns an `arrow` [Table], or `collect()`, which pulls the resulting
#' Table into an R `tibble`.
#'
%s
#'
#' # Function mappings
#'
#' In the list below, any differences in behavior or support between Acero and
#' the R function are listed. If no notes follow the function name, then you
#' can assume that the function works in Acero just as it does in R.
#'
#' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both
#' `str_sub()` and `stringr::str_sub()` work.
#'
#' In addition to these functions, you can call any of Arrow's %s compute
#' functions directly. Arrow has many functions that don't map to an existing R
#' function. In other cases where there is an R function mapping, you can still
#' call the Arrow function directly if you don't want the adaptations that the R
#' mapping has that make Acero behave like R. These functions are listed in the
#' [C++ documentation](https://arrow.apache.org/docs/cpp/compute.html), and
#' in the function registry in R, they are named with an `arrow_` prefix, such
#' as `arrow_ascii_is_decimal`.
#'
%s
#'
#' @name acero
#'
#' @aliases arrow-functions arrow-verbs arrow-dplyr
NULL"
library(dplyr)
library(purrr)
# Link targets (`pkg::fun`) that cause xref problems when hyperlinked.
# render_fun() emits these as plain code instead of a link. Currently empty.
do_not_link <- NULL

# Notes that apply to a whole package, keyed by package name. render_pkg()
# prints them directly under that package's heading.
package_notes <- list(
  stringr = "Pattern modifiers `coll()` and `boundary()` are not supported in any functions."
)
# Build one markdown bullet per function. Vectorized over all three arguments.
#
# fun:     bare function or operator names, e.g. "str_sub" or "+"
# pkg_fun: the matching link targets, e.g. "stringr::str_sub"
# notes:   one string per function; "" means there is nothing to add
#
# Returns a character vector of bullets shaped like
# "* [`str_sub()`][stringr::str_sub()]: notes".
render_fun <- function(fun, pkg_fun, notes) {
  # Names that start with a letter are ordinary functions and get a trailing
  # "()"; anything else is treated as an operator and left bare
  call_suffix <- ifelse(grepl("^[[:alpha:]]", fun), "()", "")
  # Backticks so the name is rendered as code
  label <- paste0("`", fun, call_suffix, "`")
  # Turn the label into a link unless its target is listed in do_not_link
  entry <- ifelse(
    pkg_fun %in% do_not_link,
    label,
    paste0("[", label, "][", pkg_fun, "()]")
  )
  # Append the notes after ": " wherever there are any
  annotated <- nzchar(notes)
  entry[annotated] <- paste0(entry[annotated], ": ", notes[annotated])
  # Prefix the bullet marker
  paste0("* ", entry)
}
# Render the roxygen text for one package: a "## <pkg>" heading, any
# package-wide notes from `package_notes`, then one bullet per function.
#
# df:  data frame with columns `fun`, `pkg_fun` and `notes` for this package
# pkg: the package name, used for the heading and to look up package notes
#
# Returns a single string in which every line starts with "#'".
render_pkg <- function(df, pkg) {
  # Heading, followed by an empty roxygen line
  section <- paste0("## ", pkg, "\n#'")
  # Some packages have global notes that go between heading and bullets
  extra <- package_notes[[pkg]]
  if (!is.null(extra)) {
    extra <- paste(extra, collapse = "\n#' ")
    section <- c(section, paste0(extra, "\n#'"))
  }
  # One bullet per row of df
  bullets <- pull(transmute(df, render_fun(fun, pkg_fun, notes)))
  paste("#'", c(section, bullets), collapse = "\n")
}
# The function mappings should come from the current development code, which
# needs devtools. Without devtools we can only warn that the installed arrow
# package will be used instead.
if (!requireNamespace("devtools", quietly = TRUE)) {
  warning(
    "devtools is not installed. Using installed arrow package instead of current working code.\n",
    "To generate accurate docs, install the current branch version of arrow first via `R CMD INSTALL .` ",
    "or install devtools before running this script again."
  )
} else {
  devtools::load_all()
}
# Start from the notes arrow has cached, keyed by function name
docs <- arrow:::.cache$docs

# Add functions that are supported but have no entry in the cache.
# character(0) means "no notes".
# across(), if_any() and if_all() are handled by manipulating the quosures,
# not by nse_funcs; if_any()/if_all() are used instead of across() in filter()
docs[["dplyr::across"]] <- character(0)
docs[["dplyr::if_any"]] <- character(0)
docs[["dplyr::if_all"]] <- character(0)
# desc() is a special helper handled inside of arrange()
docs[["dplyr::desc"]] <- character(0)

# Drop entries that must not be documented.
# HACK: remove the _random_along UDF we're using (fix in ARROW-17974)
docs[["_random_along"]] <- NULL
# TODO - update the script to add this back in - will fail CI as tries to link
# to non-existent function as arrow::one only exists as registered binding
docs[["arrow::one"]] <- NULL

# Add the tidyselect helpers: every line of the reexports file that starts
# with "tidyselect::" becomes an entry without notes
tidyselect <- grep(
  "^tidyselect::",
  readLines("R/reexports-tidyselect.R"),
  value = TRUE
)
docs <- c(docs, setNames(vector("list", length(tidyselect)), tidyselect))
# One row per documented function: split the qualified name into package and
# function, and collapse each function's notes into a single string.
# NOTE(review): `:{+}` is not a standard bounded quantifier; presumably it is
# meant to match the `::` separator - confirm before changing the pattern.
fun_df <- tibble::tibble(pkg_fun = names(docs), notes = docs) |>
  mutate(
    has_pkg = grepl("::", pkg_fun),
    fun = sub("^.*?:{+}", "", pkg_fun),
    # Names without a package (operators) are listed under "base";
    # everything else must be pkg::fun
    pkg = if_else(has_pkg, sub(":{+}.*$", "", pkg_fun), "base"),
    # Multi-line notes continue on new roxygen comment lines
    notes = map_chr(notes, function(note) paste(note, collapse = "\n#' "))
  ) |>
  arrange(pkg, fun)

# One rendered section per package; render_pkg() receives the package's rows
# and the package name
fun_doclets <- fun_df |>
  split(fun_df$pkg) |>
  imap_chr(render_pkg)
# All dplyr verbs to document, as a named collection: the names are the verb
# names and the values are their notes (NULL when there are none).
dplyr_verbs <- c(
  arrow:::supported_dplyr_methods,
  # Because this only has a method for arrow_dplyr_query, it's not in the main
  # list. The NULL has to be wrapped in list(): c() silently drops bare NULL
  # arguments, so `tbl_vars = NULL` would never add the entry.
  tbl_vars = list(NULL)
)
# Bullets for the "dplyr verbs" section, sorted by verb name. Built inside
# local() so the intermediate table does not end up in the global environment.
verb_bullets <- local({
  verbs <- tibble::tibble(fun = names(dplyr_verbs), notes = dplyr_verbs)
  verbs <- mutate(
    verbs,
    # Every verb links to its dplyr help topic
    pkg_fun = paste0("dplyr::", fun),
    # Join multi-part notes into one line
    notes = map_chr(notes, function(note) paste(note, collapse = " "))
  )
  verbs <- arrange(verbs, fun)
  pull(transmute(verbs, render_fun(fun, pkg_fun, notes)))
})
# Fill the five slots of file_template and write the generated source file
writeLines(
  text = sprintf(
    file_template,
    # number of dplyr verbs
    length(dplyr_verbs),
    # number of mapped R functions
    length(docs),
    # bulleted list of verbs, one roxygen line each
    paste("#'", verb_bullets, collapse = "\n"),
    # number of Arrow compute functions
    length(arrow::list_compute_functions()),
    # per-package sections, separated by an empty roxygen line
    paste(fun_doclets, collapse = "\n#'\n")
  ),
  con = "R/dplyr-funcs-doc.R"
)
|