File: write_nquads.R

package info (click to toggle)
r-cran-rdflib 0.2.9%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 596 kB
  • sloc: xml: 66; sh: 13; makefile: 2
file content (154 lines) | stat: -rw-r--r-- 4,540 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154


#' write object out as nquads
#'
#' @param x an object that can be represented as nquads
#' @param file output filename
#' @param ... additional parameters, see examples
#'
#' @export
#'
#' @examples
#' tmp <- tempfile(fileext = ".nq")
#' library(datasets)
#' 
#' ## convert data.frame to nquads
#' write_nquads(iris, tmp)
#' rdf <- read_nquads(tmp)
#' 
#' ## or starting a native rdf object
#' write_nquads(rdf, tempfile(fileext = ".nq"))
write_nquads <- function(x, file, ...){
  UseMethod("write_nquads")
}


#' read an nquads file
#' @param file path to nquads file
#' @param ... additional arguments to [rdf_parse()]
#' @return an rdf object.  See [rdf_parse()]
#' 
#' @examples
#' tmp <- tempfile(fileext = ".nq")
#' library(datasets)
#' write_nquads(iris, tmp)
#' read_nquads(tmp)
#' 
#' @export
read_nquads <- function(file, ...){
  rdf_parse(file, "nquads", ...)
}

#' @export
write_nquads.rdf <- function(x, file, ...){
  rdf_serialize(x, file, "nquads", ...)
}
  
#' @export
write_nquads.data.frame <- function(x, 
                         file,
                         ...){
  
  
  
  df <- normalize_table(x, ...)
  poor_mans_nquads(df, file, ...)
}


#' @importFrom tidyr gather
#' @importFrom dplyr left_join
normalize_table <- function(df, key_column = NULL, ...){
  ## gather looses col-classes, so pre-compute them (with base R)
  col_classes <- data.frame(datatype = 
                              vapply(df, 
                                     xs_class, 
                                     character(1)))
  col_classes$predicate <- rownames(col_classes)
  rownames(col_classes) <- NULL
  
  ## Use row names as key (subject), unless a key column is specified
  ## Should we verify that requested key column is indeed a unique key first?
  out <- df
  if (is.null(key_column)) {
    out$subject <- as.character(1:dim(out)[[1]])
  } else {
    names(out)[names(out) == key_column] <- "subject"
  }
  
  ## FIXME consider taking an already-gathered table to avoid dependency?
  
  suppressWarnings(# Possible warnings about mixed types
    out <- tidyr::gather(out, 
                       key = "predicate", 
                       value = "object", 
                       -"subject"))
  
  ## merge is Slow! ~ 5 seconds for 800K triples 
  ## (almost as much time as rdf_parse)
  # merge(out, col_classes, by = "predicate")
  
  dplyr::left_join(out, col_classes, by = "predicate")
  
}



## x is a data.frame with columns: subject, predicate, object, & datatype
#' @importFrom utils write.table
poor_mans_nquads <- function(x, file, prefix = NULL, ...){
  
  if (is.null(prefix)) {
    prefix <- paste0(deparse(substitute(x)), ":")
    warning(paste("prefix not declared, using", prefix))
  }
  
  prefix <- uri_prefix(prefix)
  ## Currently written to be base-R compatible, 
  ## but a tidyverse implementation may speed serialization.  
  ## However, this seems to be fast enough that it is rarely the bottleneck
  
  ## NOTE: paste0 is a little slow ~ 1 s on 800K triples
  ## No datatype on blank (missing) nodes
  
  blank_object <-is.na(x$object)
  blank_subject <- is.na(x$subject)
  
  x$datatype[blank_object] <- as.character(NA)
  ## NA needs to become a unique blank node number, could do uuid or _:r<rownum>
  x$object[blank_object] <- paste0("_:r", which(blank_object))
  x$subject[blank_subject] <- paste0("_:r", which(blank_subject))
  
  ## strings and URIs do not get a datatype
  needs_type <- !is.na(x$datatype)
  
  ## URIs that are not blank nodes need <>
  x$subject[!blank_subject] <- paste0("<", prefix, x$subject[!blank_subject], ">")
  ## Predicate is always a URI
  x$predicate <- paste0("<", prefix, x$predicate, ">")
  
  ## Strings should be quoted
  is_string <- !grepl("\\w+:\\w.*", x$object) &
    !needs_type & !blank_object
  x$object[is_string] <- paste0('\"', x$object[is_string] , '\"')
  
  ## URIs should be <> instead, but not blanks!
  x$object[!blank_object] <- gsub("(^\\w+:\\w.*$)", "<\\1>", 
                                  x$object[!blank_object])
  
  ## assumes datatype is not empty (e.g. string)
  x$object[needs_type] <- paste0('\"', x$object[needs_type], 
                                 '\"^^<', x$datatype[needs_type], ">")
  
  ## quads needs a graph column
  x$graph <- "."
  
  ## write table is a little slow, ~ 1s on 800K triples, 
  ## but readr cannot write in nquads style
  
  ## drop datatype
  x <- x[c("subject", "predicate", "object", "graph")]       
  utils::write.table(x, file, col.names = FALSE, quote = FALSE, row.names = FALSE)
}