File: register_scalar_function.Rd

package info (click to toggle)
apache-arrow 23.0.1-1
  • links: PTS
  • area: main
  • in suites: sid
  • size: 76,220 kB
  • sloc: cpp: 654,608; python: 70,522; ruby: 45,964; ansic: 18,742; sh: 7,365; makefile: 669; javascript: 125; xml: 41
file content (70 lines) | stat: -rw-r--r-- 3,124 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/udf.R
\name{register_scalar_function}
\alias{register_scalar_function}
\title{Register user-defined functions}
\usage{
register_scalar_function(name, fun, in_type, out_type, auto_convert = FALSE)
}
\arguments{
\item{name}{The function name to be used in the dplyr bindings}

\item{fun}{An R function or rlang-style lambda expression. The function
will be called with a first argument \code{context} which is a \code{list()}
with elements \code{batch_size} (the expected length of the output) and
\code{output_type} (the required \link{DataType} of the output) that may be used
to ensure that the output has the correct type and length. Subsequent
arguments are passed by position as specified by \code{in_type}. If
\code{auto_convert} is \code{TRUE}, subsequent arguments are converted to
R vectors before being passed to \code{fun} and the output is automatically
constructed with the expected output type via \code{\link[=as_arrow_array]{as_arrow_array()}}.}

\item{in_type}{A \link{DataType} of the input type or a \code{\link[=schema]{schema()}}
for functions with more than one argument. This signature will be used
to determine if this function is appropriate for a given set of arguments.
If this function is appropriate for more than one signature, pass a
\code{list()} of the above.}

\item{out_type}{A \link{DataType} of the output type or a function accepting
a single argument (\code{types}), which is a \code{list()} of \link{DataType}s. If
\code{out_type} is a function, it must return a \link{DataType}.}

\item{auto_convert}{Use \code{TRUE} to convert inputs before passing to \code{fun}
and construct an Array of the correct type from the output. Use this
option to write functions of R objects as opposed to functions of
Arrow R6 objects.}
}
\value{
\code{NULL}, invisibly
}
\description{
These functions support calling R code from query engine execution
(i.e., a \code{\link[dplyr:mutate]{dplyr::mutate()}} or \code{\link[dplyr:filter]{dplyr::filter()}} on a \link{Table} or \link{Dataset}).
Use \code{\link[=register_scalar_function]{register_scalar_function()}} to attach Arrow input and output types to an
R function and make it available for use in the dplyr interface and/or
\code{\link[=call_function]{call_function()}}. Scalar functions are currently the only type of
user-defined function supported. In Arrow, scalar functions must be
stateless and return output with the same shape (i.e., the same number
of rows) as the input.
}
\examples{
\dontshow{if (arrow_with_dataset() && identical(Sys.getenv("NOT_CRAN"), "true")) withAutoprint(\{ # examplesIf}
library(dplyr, warn.conflicts = FALSE)

some_model <- lm(mpg ~ disp + cyl, data = mtcars)
register_scalar_function(
  "mtcars_predict_mpg",
  function(context, disp, cyl) {
    predict(some_model, newdata = data.frame(disp, cyl))
  },
  in_type = schema(disp = float64(), cyl = float64()),
  out_type = float64(),
  auto_convert = TRUE
)

as_arrow_table(mtcars) |>
  transmute(mpg, mpg_predicted = mtcars_predict_mpg(disp, cyl)) |>
  collect() |>
  head()
\dontshow{\}) # examplesIf}
}