File: req-cache.R

package info (click to toggle)
r-cran-httr2 1.2.2-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 1,684 kB
  • sloc: sh: 13; makefile: 2
file content (393 lines) | stat: -rw-r--r-- 10,808 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
#' Automatically cache requests
#'
#' @description
#' Use `req_cache()` to automatically cache HTTP requests. Most API requests
#' are not cacheable, but static files often are.
#'
#' `req_cache()` caches responses to GET requests that have status code 200 and
#' at least one of the standard caching headers (e.g. `Expires`,
#' `Etag`, `Last-Modified`, `Cache-Control`), unless caching has been expressly
#' prohibited with `Cache-Control: no-store`. Typically, a request will still
#' be sent to the server to check that the cached value is still up-to-date,
#' but it will not need to re-download the body value.
#'
#' To learn more about HTTP caching, I recommend the MDN article
#' [HTTP caching](https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching).
#'
#' @inheritParams req_perform
#' @param path Path to cache directory. Will be created automatically if it
#'   does not exist.
#'
#'   For quick and easy caching within a session, you can use `tempfile()`.
#'   To cache requests within a package, you can use something like
#'   `file.path(tools::R_user_dir("pkgdown", "cache"), "httr2")`.
#'
#'   httr2 doesn't provide helpers to manage the cache, but if you want to
#'   empty it, you can use something like
#'   `unlink(dir(cache_path, full.names = TRUE))`.
#' @param use_on_error If the request errors, and there's a cache response,
#'   should `req_perform()` return that instead of generating an error?
#' @param debug When `TRUE` will emit useful messages telling you about
#'   cache hits and misses. This can be helpful to understand whether or
#'   not caching is actually doing anything for your use case.
#' @param max_n,max_age,max_size Automatically prune the cache by specifying
#'   one or more of:
#'
#'   * `max_age`: to delete files older than this number of seconds.
#'   * `max_n`: to delete files (from oldest to newest) to preserve at
#'      most this many files.
#'   * `max_size`: to delete files (from oldest to newest) to preserve at
#'      most this many bytes.
#'
#'   The cache pruning is performed at most once per minute.
#' @returns A modified HTTP [request].
#' @export
#' @examples
#' # GitHub uses HTTP caching for all raw files.
#' url <- paste0(
#'   "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/",
#'   "master/inst/extdata/penguins.csv"
#' )
#' # Here I set debug = TRUE so you can see what's happening
#' req <- request(url) |> req_cache(tempdir(), debug = TRUE)
#'
#' # First request downloads the data
#' resp <- req |> req_perform()
#'
#' # Second request retrieves it from the cache
#' resp <- req |> req_perform()
req_cache <- function(
  req,
  path,
  use_on_error = FALSE,
  debug = getOption("httr2_cache_debug", FALSE),
  max_age = Inf,
  max_n = Inf,
  max_size = 1024^3
) {
  check_number_whole(max_age, min = 0, allow_infinite = TRUE)
  check_number_whole(max_n, min = 0, allow_infinite = TRUE)
  check_number_decimal(max_size, min = 1, allow_infinite = TRUE)

  dir.create(path, showWarnings = FALSE, recursive = TRUE)
  req_policies(
    req,
    cache_path = path,
    cache_use_on_error = use_on_error,
    cache_debug = debug,
    cache_max = list(age = max_age, n = max_n, size = max_size)
  )
}

# Do I need to worry about hash collisions?
# No - even if the user stores a billion urls, the probably of a collision
# is ~ 1e-20: https://preshing.com/20110504/hash-collision-probabilities/
req_cache_path <- function(req, ext = ".rds") {
  file.path(req$policies$cache_path, paste0(hash(req$url), ext))
}
cache_use_on_error <- function(req) {
  req$policies$cache_use_on_error %||% FALSE
}
cache_debug <- function(req) {
  req$policies$cache_debug %||% FALSE
}

# Cache management --------------------------------------------------------

cache_active <- function(req) {
  req_policy_exists(req, "cache_path")
}

cache_get <- function(req) {
  # This check should be redudant but we keep it in for safety
  if (!cache_active(req)) {
    return(req)
  }

  path <- req_cache_path(req)
  if (!file.exists(path)) {
    return(NULL)
  }

  tryCatch(
    {
      rds <- readRDS(path)
      # Update file time if read successfully
      Sys.setFileTime(path, Sys.time())
      rds
    },
    error = function(e) NULL
  )
}

cache_set <- function(req, resp) {
  signal("", "httr2_cache_save")

  if (resp_body_type(resp) == "disk") {
    body_path <- req_cache_path(req, ".body")
    file.copy(resp$body, body_path, overwrite = TRUE)
    resp$body <- new_path(body_path)
  }

  saveRDS(resp, req_cache_path(req, ".rds"))
  invisible()
}

cache_prune_if_needed <- function(req, threshold = 60, debug = FALSE) {
  path <- req$policies$cache_path

  last_prune <- the$cache_throttle[[path]]
  if (is.null(last_prune) || last_prune + threshold <= Sys.time()) {
    if (debug) {
      cli::cli_text("Pruning cache")
    }
    cache_prune(path, max = req$policies$cache_max, debug = debug)
    the$cache_throttle[[path]] <- Sys.time()

    invisible(TRUE)
  } else {
    invisible(FALSE)
  }
}

# Adapted from
# https://github.com/r-lib/cachem/blob/main/R/cache-disk.R#L396-L467
cache_prune <- function(path, max, debug = TRUE) {
  info <- cache_info(path)

  info <- cache_prune_files(
    info,
    info$mtime + max$age < Sys.time(),
    "too old",
    debug
  )
  info <- cache_prune_files(
    info,
    seq_len(nrow(info)) > max$n,
    "too numerous",
    debug
  )
  info <- cache_prune_files(
    info,
    cumsum(info$size) > max$size,
    "too big",
    debug
  )

  invisible()
}

cache_info <- function(path, pattern = "\\.rds$") {
  filenames <- dir(path, pattern, full.names = TRUE)
  info <- file.info(filenames, extra_cols = FALSE)
  info <- info[info$isdir == FALSE, ]
  info$name <- rownames(info)
  rownames(info) <- NULL
  info[order(info$mtime, decreasing = TRUE), c("name", "size", "mtime")]
}

cache_prune_files <- function(info, to_remove, why, debug = TRUE) {
  if (any(to_remove)) {
    if (debug) {
      cli::cli_text("Deleted {sum(to_remove)} file{?s} that {?is/are} {why}")
    }

    file.remove(info$name[to_remove])
    info[!to_remove, ]
  } else {
    info
  }
}

# Hooks for req_perform -----------------------------------------------------

# Can return request or response
cache_pre_fetch <- function(req, path = NULL) {
  if (!cache_active(req)) {
    return(req)
  }

  # Only GET requests should be retrieved from cache. It's not sufficient to
  # only save GET requests, because the method is not part of the cache key
  if (req_get_method(req) != "GET") {
    return(req)
  }

  debug <- cache_debug(req)
  cache_prune_if_needed(req, debug = debug)

  cached_resp <- cache_get(req)
  if (is.null(cached_resp)) {
    return(req)
  }
  if (debug) {
    cli::cli_text("Found url in cache {.val {hash(req$url)}}")
  }

  info <- resp_cache_info(cached_resp)
  if (!is.na(info$expires) && info$expires >= Sys.time()) {
    signal("", "httr2_cache_cached")
    if (debug) {
      cli::cli_text("Cached value is fresh; using response from cache")
    }

    resp <- cached_resp
    resp$body <- cache_body(cached_resp, path)
    resp
  } else {
    if (debug) {
      cli::cli_text("Cached value is stale; checking for updates")
    }
    req_headers(
      req,
      `If-Modified-Since` = info$last_modified,
      `If-None-Match` = info$etag
    )
  }
}

# Always returns response
cache_post_fetch <- function(req, resp, path = NULL) {
  if (!cache_active(req)) {
    return(resp)
  }

  debug <- cache_debug(req)
  cached_resp <- cache_get(req)

  if (is_error(resp)) {
    if (cache_use_on_error(req) && !is.null(cached_resp)) {
      if (debug) {
        cli::cli_text("Request errored; retrieving response from cache")
      }
      cached_resp
    } else {
      resp
    }
  } else if (resp_status(resp) == 304 && !is.null(cached_resp)) {
    signal("", "httr2_cache_not_modified")
    if (debug) {
      cli::cli_text("Cached value still ok; retrieving body from cache")
    }

    # Combine headers
    resp$headers <- cache_headers(cached_resp, resp)
    # Replace body with cached result
    resp$body <- cache_body(cached_resp, path)

    # Re-cache, so we get any new headers
    cache_set(req, resp)
    resp
  } else if (resp_is_cacheable(resp)) {
    if (debug) {
      cli::cli_text("Saving response to cache {.val {hash(req$url)}}")
    }

    cache_set(req, resp)
    resp
  } else {
    resp
  }
}

cache_body <- function(cached_resp, path = NULL) {
  check_response(cached_resp)

  body <- cached_resp$body
  if (is.null(path)) {
    return(body)
  }

  switch(
    resp_body_type(cached_resp),
    disk = file.copy(body, path, overwrite = TRUE),
    memory = writeBin(body, path),
    stream = cli::cli_abort("Invalid body type", .internal = TRUE)
  )

  new_path(path)
}

# https://www.rfc-editor.org/rfc/rfc7232#section-4.1
cache_headers <- function(cached_resp, resp) {
  check_response(cached_resp)

  headers <- modify_list(
    cached_resp$headers,
    !!!resp$headers,
    .ignore_case = TRUE
  )
  as_headers(headers)
}

# Caching headers ---------------------------------------------------------

resp_is_cacheable <- function(resp, control = NULL) {
  if (resp$method != "GET") {
    return(FALSE)
  }

  if (resp_status(resp) != 200L) {
    return(FALSE)
  }

  if (resp_body_type(resp) == "stream") {
    return(FALSE)
  }

  control <- control %||% resp_cache_control(resp)
  if ("no-store" %in% control$flags) {
    return(FALSE)
  }
  if (has_name(control, "max-age")) {
    return(TRUE)
  }

  if (!any(resp_header_exists(resp, c("Etag", "Last-Modified", "Expires")))) {
    return(FALSE)
  }

  TRUE
}

resp_cache_info <- function(resp, control = NULL) {
  list(
    expires = resp_cache_expires(resp, control),
    last_modified = resp_header(resp, "Last-Modified"),
    etag = resp_header(resp, "Etag")
  )
}

resp_cache_expires <- function(resp, control = NULL) {
  control <- control %||% resp_cache_control(resp)

  # Prefer max-age parameter if it exists, otherwise use Expires header
  if (has_name(control, "max-age") && resp_header_exists(resp, "Date")) {
    resp_date(resp) + as.integer(control[["max-age"]])
  } else if (resp_header_exists(resp, "Expires")) {
    parse_http_date(resp_header(resp, "Expires"))
  } else {
    NA
  }
}

# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control
resp_cache_control <- function(resp) {
  x <- resp_header(resp, "Cache-Control")
  if (is.null(x)) {
    return(NULL)
  }

  pieces <- strsplit(x, ",", fixed = TRUE)[[1]]
  pieces <- gsub("^\\s+|\\s+$", "", pieces)
  pieces <- tolower(pieces)

  is_value <- grepl("=", pieces, fixed = TRUE)
  flags <- pieces[!is_value]

  keyvalues <- strsplit(pieces[is_value], "\\s*=\\s*")
  keys <- c(rep("flags", length(flags)), lapply(keyvalues, "[[", 1))
  values <- c(flags, lapply(keyvalues, "[[", 2))

  stats::setNames(values, keys)
}