File: flight.R

package info (click to toggle)
apache-arrow 23.0.1-1
  • links: PTS
  • area: main
  • in suites: sid
  • size: 76,220 kB
  • sloc: cpp: 654,608; python: 70,522; ruby: 45,964; ansic: 18,742; sh: 7,365; makefile: 669; javascript: 125; xml: 41
file content (143 lines) | stat: -rw-r--r-- 4,925 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' Load a Python Flight server
#'
#' @param name string Python module name
#' @param path file system path where the Python module is found. Default is
#' to look in the `inst/` directory for included modules.
#' @export
#' @examplesIf FALSE
#' load_flight_server("demo_flight_server")
load_flight_server <- function(name, path = system.file(package = "arrow")) {
  reticulate::import_from_path(name, path)
}

#' Connect to a Flight server
#'
#' @param host string hostname to connect to
#' @param port integer port to connect on
#' @param scheme URL scheme, default is "grpc+tcp"
#' @return A `pyarrow.flight.FlightClient`.
#' @export
flight_connect <- function(host = "localhost", port, scheme = "grpc+tcp") {
  pa <- reticulate::import("pyarrow")
  location <- paste0(scheme, "://", host, ":", port)
  pa$flight$FlightClient(location)
}

#' Explicitly close a Flight client
#'
#' @param client The client to disconnect
#' @export
flight_disconnect <- function(client) {
  client$close()
}

#' Send data to a Flight server
#'
#' @param client `pyarrow.flight.FlightClient`, as returned by [flight_connect()]
#' @param data `data.frame`, [RecordBatch], or [Table] to upload
#' @param path string identifier to store the data under
#' @param overwrite logical: if `path` exists on `client` already, should we
#' replace it with the contents of `data`? Default is `TRUE`; if `FALSE` and
#' `path` exists, the function will error.
#' @param max_chunksize integer: Maximum number of rows for RecordBatch chunks
#'   when a `data.frame` is sent.
#' Individual chunks may be smaller depending on the chunk layout of individual columns.
#' @return `client`, invisibly.
#' @export
flight_put <- function(client, data, path, overwrite = TRUE, max_chunksize = NULL) {
  assert_is(data, c("data.frame", "Table", "RecordBatch"))

  if (!overwrite && flight_path_exists(client, path)) {
    stop(path, " exists.", call. = FALSE)
  }
  if (is.data.frame(data)) {
    data <- Table$create(data)
  }

  py_data <- reticulate::r_to_py(data)
  writer <- client$do_put(descriptor_for_path(path), py_data$schema)[[1]]
  if (inherits(data, "RecordBatch") && !is.null(max_chunksize)) {
    warning("`max_chunksize` is not supported for flight_put with RecordBatch")
    writer$write_batch(py_data)
  } else if (inherits(data, "RecordBatch")) {
    writer$write_batch(py_data)
  } else if (!is.null(max_chunksize)) {
    writer$write_table(py_data, max_chunksize)
  } else {
    writer$write_table(py_data)
  }
  writer$close()
  invisible(client)
}

#' Get data from a Flight server
#'
#' @param client `pyarrow.flight.FlightClient`, as returned by [flight_connect()]
#' @param path string identifier under which data is stored
#' @return A [Table]
#' @export
flight_get <- function(client, path) {
  reader <- flight_reader(client, path)
  reader$read_all()
}

# TODO: could use this as a RecordBatch iterator, call $read_chunk() on this
flight_reader <- function(client, path) {
  info <- client$get_flight_info(descriptor_for_path(path))
  # Hack: assume a single ticket, on the same server as client is already connected
  ticket <- info$endpoints[[1]]$ticket
  client$do_get(ticket)
}

descriptor_for_path <- function(path) {
  pa <- reticulate::import("pyarrow")
  pa$flight$FlightDescriptor$for_path(path)
}

#' See available resources on a Flight server
#'
#' @inheritParams flight_get
#' @return `list_flights()` returns a character vector of paths.
#' `flight_path_exists()` returns a logical value, the equivalent of `path %in% list_flights()`
#' @export
list_flights <- function(client) {
  generator <- client$list_flights()
  out <- reticulate::iterate(generator, function(x) as.character(x$descriptor$path[[1]]))
  out
}

#' @rdname list_flights
#' @export
flight_path_exists <- function(client, path) {
  it_exists <- tryCatch(
    expr = {
      client$get_flight_info(descriptor_for_path(path))
      TRUE
    },
    error = function(e) {
      msg <- conditionMessage(e)
      if (!any(grepl("ArrowKeyError", msg))) {
        # Raise an error if this fails for any reason other than not found
        stop(e)
      }
      FALSE
    }
  )
}