File: rdf-wo-redland.R

package info (click to toggle)
r-cran-rdflib 0.2.9+dfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 596 kB
  • sloc: xml: 66; sh: 13; makefile: 2
file content (62 lines) | stat: -rw-r--r-- 1,861 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library(nycflights13)
library(dplyr)

# Size of the raw flights table before any reshaping.
dim(flights) # 336,776 rows x 19 columns

# Prefix each element of `x` with `base_uri` to form a URI-style string.
#
# x:        vector of identifiers (coerced to character by paste0()).
# base_uri: prefix prepended to every identifier; defaults to "x:".
#
# Returns a character vector. Missing identifiers stay NA rather than being
# pasted into the literal string "x:NA" (paste0() stringifies NA), and a
# zero-length `x` yields character(0) rather than the bare prefix.
as_uri <- function(x, base_uri = "x:") {
  if (length(x) == 0L) {
    return(character(0))
  }
  uri <- paste0(base_uri, x)
  # Keep missingness: an NA identifier must not turn into a real-looking URI.
  uri[is.na(x)] <- NA_character_
  uri
}
# Copy of flights whose tailnum and carrier values are rewritten by as_uri().
uri_flights <- mutate(
  flights,
  tailnum = as_uri(tailnum),
  carrier = as_uri(carrier)
)

  
# Reshape each source table with rdflib's internal (unexported) helper; the
# queries further down rely on the result having subject, predicate and
# object columns. airlines and planes are keyed on their id column, while
# the flights copy is passed with key = NULL.
df1 <- rdflib:::normalize_table(
  airlines,
  key = "carrier",
  prefix = "x:"
)
df2 <- rdflib:::normalize_table(
  planes,
  key = "tailnum",
  prefix = "x:"
)
df3 <- rdflib:::normalize_table(
  uri_flights,
  key = NULL,
  prefix = "x:"
)

# Stack the three reshaped tables into one data frame.
df <- bind_rows(df1, df2, df3)

# Size of the reshaped flights table alone: 6,398,744 x 4
dim(df3)
  
# Number of rows stored for each predicate of interest.
df %>%
  filter(
    predicate %in% c("carrier", "name", "manufacturer", "model", "dep_delay")
  ) %>%
  count(predicate)

## An individual table can be rebuilt from the long form: keep only its
## predicates, spread them back out into columns (one row per subject), and
## give the subject column its original name.
## Still missing: the datatypes would also need to be re-applied, and any
## join to the other tables has to be done manually.
df %>%
  filter(predicate %in% c("manufacturer", "model")) %>%
  select(subject, predicate, object) %>%
  tidyr::spread(key = predicate, value = object) %>%
  rename(tailnum = subject)


## Can the "joined" table be recovered the same way? Spread the predicates
## from all three source tables at once, then keep the rows that have a
## carrier value.
df %>%
  filter(
    predicate %in% c("carrier", "name", "manufacturer", "model", "dep_delay")
  ) %>%
  select(subject, predicate, object) %>%
  tidyr::spread(key = predicate, value = object) %>%
  filter(!is.na(carrier))


## For comparison: the same five columns obtained with ordinary joins on the
## original tables, reduced to unique rows.
flights %>%
  left_join(airlines, by = "carrier") %>%
  left_join(planes, by = "tailnum") %>%
  select(carrier, name, manufacturer, model, dep_delay) %>%
  distinct()

#### All fits in memory anyway
#library(MonetDBLite)
#library(DBI)
#triplestore <- rappdirs::user_data_dir("rdflib")
#con <- dbConnect(MonetDBLite::MonetDBLite(), triplestore)

# Could use append=TRUE instead to extend triplestore later
#DBI::dbWriteTable(con, "flights", df, overwrite = TRUE)  


## size as flat files on disk:
## Each call writes a tab-separated file to a relative path; the trailing
## figures are the file sizes recorded by the original author.
readr::write_tsv(flights, "flights.tsv")    # 29.6 MB (original flights table)
readr::write_tsv(df, "triplestore.tsv")     # 319 MB (combined df)
## NOTE(review): same data as the previous call, only the file name differs;
## presumably readr compresses based on the .bz2 extension, which would
## explain the much smaller size -- confirm.
readr::write_tsv(df, "triplestore.tsv.bz2") # 17 MB