"""
BerkeleyDB in use as a persistent Graph store.
Example 1: simple actions
* creating a ConjunctiveGraph using the BerkeleyDB Store
* adding triples to it
* counting them
* closing the store, emptying the graph
* re-opening the store using the same DB files
* getting the same count of triples as before
Example 2: larger data
* loads multiple graphs downloaded from GitHub into a BerkeleyDB-baked graph stored in the folder gsq_vocabs.
* does not delete the DB at the end so you can see it on disk
"""
import os
import tempfile
from rdflib import ConjunctiveGraph, Literal, Namespace
from rdflib.plugins.stores.berkeleydb import has_bsddb
from rdflib.store import NO_STORE, VALID_STORE
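
# has_bsddb is True only when rdflib's BerkeleyDB store plugin could import
# its underlying bindings (the `berkeleydb` Python package); the __main__
# block at the bottom uses it to skip both examples when they are missing.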

def example_1():
    """Creates a ConjunctiveGraph and performs some BerkeleyDB tasks with it"""
    # only the generated name is used as a scratch path: the temporary file
    # object itself is discarded immediately
    path = tempfile.NamedTemporaryFile().name

    # Declare we are using a BerkeleyDB Store
    graph = ConjunctiveGraph("BerkeleyDB")

    # Open a previously created store, or create it if it doesn't exist yet
    # (it will never exist in this example, since we use a temp-file location)
    rt = graph.open(path, create=False)
    if rt == NO_STORE:
        # There is no underlying BerkeleyDB infrastructure, so create it
        print("Creating new DB")
        graph.open(path, create=True)
    else:
        print("Using existing DB")
        assert rt == VALID_STORE, "The underlying store is corrupt"

    print("Triples in graph before add:", len(graph))
    print("(will always be 0 when using a temp file for the DB)")

    # Now we'll add some triples to the graph & commit the changes
    EG = Namespace("http://example.net/test/")  # noqa: N806
    graph.bind("eg", EG)

    graph.add((EG["pic:1"], EG.name, Literal("Jane & Bob")))
    graph.add((EG["pic:2"], EG.name, Literal("Squirrel in Tree")))
    graph.commit()

    print("Triples in graph after add:", len(graph))
    print("(should be 2)")

    # display the graph in Turtle
    print(graph.serialize())

    # close when done, otherwise BerkeleyDB will leak lock entries
    graph.close()
    graph = None

    # reopen the graph
    graph = ConjunctiveGraph("BerkeleyDB")
    graph.open(path, create=False)

    print("Triples still in graph:", len(graph))
    print("(should still be 2)")

    graph.close()

    # Clean up the temp folder to remove the BerkeleyDB database files...
    for f in os.listdir(path):
        os.unlink(os.path.join(path, f))
    os.rmdir(path)

def example_2():
    """Loads a number of SKOS vocabularies from GitHub into a BerkeleyDB-backed
    graph stored in the local folder 'gsq_vocabs'.

    Should print out the number of triples after each load, e.g.:

        177
        248
        289
        379
        421
        628
        764
        813
        965
        1381
        9666
        9719
        ...
    """
    import base64
    import json
    from urllib.error import HTTPError
    from urllib.request import Request, urlopen
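
    # The GitHub "git trees" API returns JSON with a "tree" array; each entry
    # carries a "path" and a blob "url", and fetching that blob URL returns
    # JSON whose "content" field is base64-encoded. The loop below therefore
    # filters for Turtle (.ttl) paths, decodes each blob and parses it into
    # the graph.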
    g = ConjunctiveGraph("BerkeleyDB")
    g.open("gsq_vocabs", create=True)

    # gsq_vocabs = "https://api.github.com/repos/geological-survey-of-queensland/vocabularies/git/trees/master"
    gsq_vocabs = "https://api.github.com/repos/geological-survey-of-queensland/vocabularies/git/trees/cd7244d39337c1f4ef164b1cf1ea1f540a7277db"
    try:
        res = urlopen(Request(gsq_vocabs, headers={"Accept": "application/json"}))
    except HTTPError as e:
        return e.code, str(e), None

    data = res.read()
    encoding = res.info().get_content_charset("utf-8")
    j = json.loads(data.decode(encoding))
    for v in j["tree"]:
        # process the element in the GitHub result if it's a Turtle file
        if v["path"].endswith(".ttl"):
            # for each file, fetch it by URL, decode it and parse it into the graph
            r = urlopen(v["url"])
            content = json.loads(r.read().decode())["content"]
            g.parse(data=base64.b64decode(content).decode(), format="turtle")
            print(len(g))

    print("loading complete")

if __name__ == "__main__":
    if has_bsddb:
        # Only run the examples if BerkeleyDB is available
        example_1()
        example_2()
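    else:
        # has_bsddb is False when the BerkeleyDB bindings are missing;
        # installing the `berkeleydb` Python package should provide them.
        print("BerkeleyDB not available, skipping examples")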