1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
|
"""
Query a SPARQL endpoint and return results as a Pandas dataframe.
"""
import io
from typing import TYPE_CHECKING, Any, Dict, List, Union
from SPARQLWrapper.SmartWrapper import Bindings, SPARQLWrapper2, Value
from SPARQLWrapper.Wrapper import CSV, SELECT, SPARQLWrapper
if TYPE_CHECKING:
import pandas as pd
class QueryException(Exception):
    """Raised when a query that is not a SPARQL SELECT is submitted."""
def get_sparql_dataframe_orig(
    endpoint: str, query: Union[str, bytes]
) -> "pd.DataFrame":
    """Run a SELECT query and load the endpoint's CSV response into a DataFrame.

    Copied from https://github.com/lawlesst/sparql-dataframe.

    Raises:
        QueryException: if ``query`` is not a SELECT query.
        TypeError: if the converted response is not ``bytes``.
    """
    # Imported lazily so that pandas is only needed when this is called.
    import pandas as pd

    wrapper = SPARQLWrapper(endpoint)
    wrapper.setQuery(query)
    if wrapper.queryType != SELECT:
        raise QueryException("Only SPARQL SELECT queries are supported.")

    wrapper.setReturnFormat(CSV)
    payload = wrapper.query().convert()
    if not isinstance(payload, bytes):
        raise TypeError(type(payload))

    text_buffer = io.StringIO(payload.decode("utf-8"))
    return pd.read_csv(text_buffer, sep=",")
def get_sparql_typed_dict(
    endpoint: str, query: Union[str, bytes]
) -> List[Dict[str, Any]]:
    """Run a SELECT query and return each result row as a dict of Python values.

    Modified from https://github.com/lawlesst/sparql-dataframe.

    Each bound value is wrapped in an ``rdflib.term.Literal`` built from its
    ``value`` and ``datatype`` and then converted with ``toPython()``, so the
    dict values are whatever Python objects rdflib produces for that datatype.

    Args:
        endpoint: Endpoint passed to ``SPARQLWrapper2``.
        query: SPARQL query text; must be a SELECT query.

    Returns:
        One ``{variable name: converted value}`` dict per result binding, in
        the order the bindings were returned.

    Raises:
        QueryException: if ``query`` is not a SELECT query.
        TypeError: if the query result is not a ``Bindings`` instance.
    """
    # rdflib is imported here rather than at module level: setup.py reads some
    # metadata from the package and Travis fails because rdflib is only
    # installed later.
    import rdflib.term

    sparql = SPARQLWrapper2(endpoint)
    sparql.setQuery(query)
    if sparql.queryType != SELECT:
        raise QueryException("Only SPARQL SELECT queries are supported.")

    # No setReturnFormat() call here: SPARQLWrapper2 handles the (JSON)
    # result format itself and hands back parsed bindings.
    results = sparql.query()
    if not isinstance(results, Bindings):
        raise TypeError(type(results))

    # Converts value by value; probably slow on large result sets, consider
    # optimising later.
    return [
        {
            name: rdflib.term.Literal(term.value, datatype=term.datatype).toPython()  # type: ignore[no-untyped-call]
            for name, term in binding.items()
        }
        for binding in results.bindings
    ]
def get_sparql_dataframe(endpoint: str, query: Union[str, bytes]) -> "pd.DataFrame":
    """Run a SELECT query and return its typed result rows as a DataFrame."""
    # Imported lazily so that pandas is only needed when this is called.
    import pandas as pd

    rows = get_sparql_typed_dict(endpoint, query)
    # TODO: missing values will be NaN-filled somehow; make this stricter if
    # there is a way of getting the NaN types from rdflib.
    return pd.DataFrame(rows)
|