1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
|
# mypy: disallow_untyped_defs=False
import json
import unittest
from pprint import pformat
from lxml.etree import XML, canonicalize
from extruct.rdfa import RDFaExtractor
from tests import get_testdata
def tupleize(d):
    """Convert nested JSON-like data into an order-insensitive, comparable form.

    Lists become sorted lists of their converted elements, and dicts become
    sorted lists of ``(key, converted_value)`` tuples. Any other value is
    returned unchanged. Two structures that differ only in list order or
    dict key order therefore convert to equal results.

    The input is never modified.
    """
    if isinstance(d, list):
        return sorted(tupleize(e) for e in d)
    if isinstance(d, dict):
        items = d
        # Workaround: canonicalize XML so that attribute re-ordering is ignored
        # See: https://github.com/scrapinghub/extruct/pull/161
        if d.get("@type") == "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral":
            # Work on a shallow copy so the caller's dict is left untouched.
            items = dict(d)
            items["@value"] = canonicalize(XML(d["@value"]))
        return sorted((k, tupleize(v)) for k, v in items.items())
    return d
class TestRDFa(unittest.TestCase):
    """Compare ``RDFaExtractor`` output against stored expanded JSON-LD fixtures."""

    maxDiff = None

    # Base URL used by every fixture that does not need a specific one.
    _BASE_URL = "http://www.example.com/index.html"

    def assertJsonLDEqual(self, a, b, normalize_bnode_ids=True):
        """Assert that two JSON-LD structures are equal, ignoring ordering.

        Both values are serialized with ``prettify`` (optionally relabelling
        blank-node ids), parsed back, and compared after ``tupleize`` so that
        list order and dict key order do not matter.
        """
        sa = self.prettify(a, normalize_bnode_ids=normalize_bnode_ids)
        sb = self.prettify(b, normalize_bnode_ids=normalize_bnode_ids)
        self.assertEqual(tupleize(json.loads(sa)), tupleize(json.loads(sb)))

    def normalize_bnode_ids(self, jsld):
        """Relabel blank-node ids in a serialized JSON-LD string.

        Every quoted ``"_:<id>"`` token is rewritten to ``"_:NNNNNN"``, where
        the six-digit number is assigned in order of the id's first appearance
        in ``jsld``. The numbering is therefore deterministic for a given
        string, and only whole blank-node tokens are rewritten: an id that
        merely occurs as a substring of some other value is left alone.
        """
        import re

        pattern = re.compile(r'"_:(\w+)"')
        numbering = {}
        for bnid in pattern.findall(jsld):
            if bnid not in numbering:
                numbering[bnid] = len(numbering) + 1
        # Single pass, so an already-inserted label can never be re-replaced.
        return pattern.sub(lambda m: '"_:%06d"' % numbering[m.group(1)], jsld)

    def prettify(self, a, normalize_bnode_ids=True):
        """Serialize ``a`` as indented, key-sorted, ASCII-only JSON.

        When ``normalize_bnode_ids`` is true, the result is additionally
        passed through ``self.normalize_bnode_ids``.
        """
        output = json.dumps(
            a, indent=2, separators=(",", ": "), sort_keys=True, ensure_ascii=True
        )
        if normalize_bnode_ids:
            output = self.normalize_bnode_ids(output)
        return output

    def _check_fixture(self, folder, html_name, json_name, base_url=_BASE_URL):
        """Extract RDFa from one HTML fixture and compare it with its JSON fixture.

        Returns ``(extractor, body, expected)`` so a test can run further
        checks against the same inputs.
        """
        body = get_testdata(folder, html_name)
        expected = json.loads(get_testdata(folder, json_name).decode("UTF-8"))
        rdfae = RDFaExtractor()
        data = rdfae.extract(body, base_url=base_url)
        self.assertJsonLDEqual(data, expected)
        return rdfae, body, expected

    def test_w3c_rdfalite(self):
        for i in [3, 4, 5]:
            fileprefix = "w3c.rdfalite.example{:03d}".format(i)
            # subTest names the failing fixture and lets the others still run.
            with self.subTest(fileprefix=fileprefix):
                self._check_fixture(
                    "w3crdfa", fileprefix + ".html", fileprefix + ".expanded.json"
                )

    def test_w3c_rdf11primer(self):
        for i in [14]:
            fileprefix = "w3c.rdf11primer.example{:03d}".format(i)
            with self.subTest(fileprefix=fileprefix):
                self._check_fixture(
                    "w3crdfa", fileprefix + ".html", fileprefix + ".expanded.json"
                )

    def test_w3c_rdfaprimer(self):
        for i in [5, 6, 7, 8, 9, 10, 11, 15]:
            fileprefix = "w3c.rdfaprimer.example{:03d}".format(i)
            with self.subTest(fileprefix=fileprefix):
                rdfae, body, expected = self._check_fixture(
                    "w3crdfa", fileprefix + ".html", fileprefix + ".expanded.json"
                )

                # This is for testing that the fix to issue 116 does not
                # severely affect the RDFa output even in the presence of a
                # bug in the code: the ordering fix is made to raise, and the
                # extraction result must still match the fixture.
                def mocked_fix_order(x, y, z):
                    raise Exception()

                rdfae._fix_order = mocked_fix_order  # type: ignore[assignment]
                data = rdfae.extract(body, base_url=self._BASE_URL)
                self.assertJsonLDEqual(data, expected)

    def test_wikipedia_xhtml_rdfa(self):
        fileprefix = "xhtml+rdfa"
        self._check_fixture(
            "wikipedia", fileprefix + ".html", fileprefix + ".expanded.json"
        )

    def test_wikipedia_xhtml_rdfa_no_prefix(self):
        self._check_fixture(
            "misc",
            "Portfolio_Niels_Lubberman.html",
            "Portfolio_Niels_Lubberman.json",
            base_url="http://nielslubberman.nl/drupal/",
        )

    def test_expanded_opengraph_support(self):
        self._check_fixture(
            "misc", "expanded_OG_support_test.html", "expanded_OG_support_test.json"
        )
|