1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
#' write object out as nquads
#'
#' @param x an object that can be represented as nquads
#' @param file output filename
#' @param ... additional parameters, see examples
#'
#' @export
#'
#' @examples
#' tmp <- tempfile(fileext = ".nq")
#' library(datasets)
#' write_nquads(iris, tmp)
#' read_nquads(tmp)
write_nquads <- function(x, file, ...){
UseMethod("write_nquads")
}
#' read an nquads file
#' @param file path to nquads file
#' @param ... additional arguments to [rdf_parse()]
#' @return an rdf object. See [rdf_parse()]
#'
#' @examples
#' tmp <- tempfile(fileext = ".nq")
#' library(datasets)
#' write_nquads(iris, tmp)
#' read_nquads(tmp)
#'
#' @export
read_nquads <- function(file, ...){
rdf_parse(file, "nquads", ...)
}
#' @export
write_nquads.rdf <- function(x, file, ...){
rdf_serialize(rdf, file, "nquads", ...)
}
#' @export
write_nquads.data.frame <- function(x,
file,
...){
df <- normalize_table(x, ...)
poor_mans_nquads(df, file, ...)
}
#' @importFrom tidyr gather
#' @importFrom dplyr left_join
normalize_table <- function(df, key_column = NULL, ...){
## gather looses col-classes, so pre-compute them (with base R)
col_classes <- data.frame(datatype =
vapply(df,
xs_class,
character(1)))
col_classes$predicate <- rownames(col_classes)
rownames(col_classes) <- NULL
## Use row names as key (subject), unless a key column is specified
## Should we verify that requested key column is indeed a unique key first?
out <- df
if (is.null(key_column)) {
out$subject <- as.character(1:dim(out)[[1]])
} else {
names(out)[names(out) == key_column] <- "subject"
}
## FIXME consider taking an already-gathered table to avoid dependency?
suppressWarnings(# Possible warnings about mixed types
out <- tidyr::gather(out,
key = "predicate",
value = "object",
-"subject"))
## merge is Slow! ~ 5 seconds for 800K triples
## (almost as much time as rdf_parse)
# merge(out, col_classes, by = "predicate")
dplyr::left_join(out, col_classes, by = "predicate")
}
## x is a data.frame with columns: subject, predicate, object, & datatype
#' @importFrom utils write.table
poor_mans_nquads <- function(x, file, prefix = NULL, ...){
if (is.null(prefix)) {
prefix <- paste0(deparse(substitute(x)), ":")
warning(paste("prefix not declared, using", prefix))
}
prefix <- uri_prefix(prefix)
## Currently written to be base-R compatible,
## but a tidyverse implementation may speed serialization.
## However, this seems to be fast enough that it is rarely the bottleneck
## NOTE: paste0 is a little slow ~ 1 s on 800K triples
## No datatype on blank (missing) nodes
blank_object <-is.na(x$object)
blank_subject <- is.na(x$subject)
x$datatype[blank_object] <- as.character(NA)
## NA needs to become a unique blank node number, could do uuid or _:r<rownum>
x$object[blank_object] <- paste0("_:r", which(blank_object))
x$subject[blank_subject] <- paste0("_:r", which(blank_subject))
## strings and URIs do not get a datatype
needs_type <- !is.na(x$datatype)
## URIs that are not blank nodes need <>
x$subject[!blank_subject] <- paste0("<", prefix, x$subject[!blank_subject], ">")
## Predicate is always a URI
x$predicate <- paste0("<", prefix, x$predicate, ">")
## Strings should be quoted
is_string <- !grepl("\\w+:\\w.*", x$object) &
!needs_type & !blank_object
x$object[is_string] <- paste0('\"', x$object[is_string] , '\"')
## URIs should be <> instead, but not blanks!
x$object[!blank_object] <- gsub("(^\\w+:\\w.*$)", "<\\1>",
x$object[!blank_object])
## assumes datatype is not empty (e.g. string)
x$object[needs_type] <- paste0('\"', x$object[needs_type],
'\"^^<', x$datatype[needs_type], ">")
## quads needs a graph column
x$graph <- "."
## write table is a little slow, ~ 1s on 800K triples,
## but readr cannot write in nquads style
## drop datatype
x <- x[c("subject", "predicate", "object", "graph")]
utils::write.table(x, file, col.names = FALSE, quote = FALSE, row.names = FALSE)
}
|