File: test_rdfa.py

package info (click to toggle)
extruct 0.18.0-1
  • links: PTS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,048 kB
  • sloc: python: 2,106; makefile: 10
file content (134 lines) | stat: -rw-r--r-- 5,002 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# mypy: disallow_untyped_defs=False
import json
import unittest
from pprint import pformat

from lxml.etree import XML, canonicalize

from extruct.rdfa import RDFaExtractor
from tests import get_testdata


def tupleize(d):
    """Recursively convert JSON-like data into an order-insensitive form.

    Lists are returned as sorted lists of their converted elements, and
    dicts are returned as sorted lists of ``(key, converted_value)``
    tuples. Any other value is returned unchanged. Two structures that
    differ only in list order or key order therefore convert to equal
    results.

    The input is never modified: XML literals are canonicalized on a
    shallow copy of the dict, not in place.

    Raises:
        KeyError: if a dict is typed as an XML literal but has no
            ``"@value"`` entry.
    """
    if isinstance(d, list):
        return sorted(tupleize(e) for e in d)
    if isinstance(d, dict):
        # Workaround: canonicalize XML so that attribute re-ordering is ignored
        # See: https://github.com/scrapinghub/extruct/pull/161
        if d.get("@type") == "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral":
            # Rebind to a shallow copy so the caller's dict is left untouched.
            d = {**d, "@value": canonicalize(XML(d["@value"]))}
        return sorted((k, tupleize(v)) for k, v in d.items())
    return d


class TestRDFa(unittest.TestCase):
    """Compare ``RDFaExtractor`` output against stored expanded JSON-LD files."""

    # Do not truncate the diff that assertEqual prints on failure.
    maxDiff = None

    # Base URL shared by most of the fixtures below.
    _BASE_URL = "http://www.example.com/index.html"

    def assertJsonLDEqual(self, a, b, normalize_bnode_ids=True):
        """Assert that two JSON-LD structures are equal, ignoring ordering.

        Both values are serialized with ``prettify`` (optionally renumbering
        blank node ids), parsed back, and compared after ``tupleize`` has
        sorted their lists and dict items.

        Args:
            a: first JSON-serializable structure.
            b: second JSON-serializable structure.
            normalize_bnode_ids: when true, blank node ids are renumbered
                on both sides before the comparison.
        """
        sa = self.prettify(a, normalize_bnode_ids=normalize_bnode_ids)
        sb = self.prettify(b, normalize_bnode_ids=normalize_bnode_ids)
        self.assertEqual(tupleize(json.loads(sa)), tupleize(json.loads(sb)))

    def normalize_bnode_ids(self, jsld):
        """Renumber blank node ids in a JSON string.

        Every quoted ``"_:<id>"`` token is rewritten to ``"_:NNNNNN"``,
        where ``NNNNNN`` is a zero-padded counter assigned in order of the
        id's first appearance in ``jsld``.

        Args:
            jsld: JSON text, as produced by ``json.dumps``.

        Returns:
            The text with blank node ids replaced. Text without any blank
            node id is returned unchanged.
        """
        import re

        numbering = {}

        def renumber(match):
            # Numbering by first appearance keeps the result independent of
            # string hash randomization.
            number = numbering.setdefault(match.group(1), len(numbering) + 1)
            return '"_:%06d"' % number

        # One pass over whole tokens: an id that is a substring of another
        # id, IRI or literal is left alone, and already substituted numbers
        # are never rewritten again.
        return re.sub(r'"_:(\w+)"', renumber, jsld)

    def prettify(self, a, normalize_bnode_ids=True):
        """Serialize ``a`` as indented, key-sorted, ASCII-only JSON.

        Args:
            a: JSON-serializable structure.
            normalize_bnode_ids: when true, the output is passed through
                ``normalize_bnode_ids``.

        Returns:
            The JSON text.
        """
        output = json.dumps(
            a, indent=2, separators=(",", ": "), sort_keys=True, ensure_ascii=True
        )
        if normalize_bnode_ids:
            output = self.normalize_bnode_ids(output)
        return output

    def _load_case(self, folder, html_name, json_name):
        """Return ``(body, expected)`` for a pair of test data files.

        ``body`` is whatever ``get_testdata`` returns for the HTML file;
        ``expected`` is the JSON file decoded as UTF-8 and parsed.
        """
        body = get_testdata(folder, html_name)
        expected = json.loads(get_testdata(folder, json_name).decode("UTF-8"))
        return body, expected

    def test_w3c_rdfalite(self):
        for i in [3, 4, 5]:
            fileprefix = "w3c.rdfalite.example{:03d}".format(i)
            # subTest names the failing fixture and lets the others still run.
            with self.subTest(fileprefix=fileprefix):
                body, expected = self._load_case(
                    "w3crdfa", fileprefix + ".html", fileprefix + ".expanded.json"
                )

                rdfae = RDFaExtractor()
                data = rdfae.extract(body, base_url=self._BASE_URL)
                self.assertJsonLDEqual(data, expected)

    def test_w3c_rdf11primer(self):
        for i in [14]:
            fileprefix = "w3c.rdf11primer.example{:03d}".format(i)
            with self.subTest(fileprefix=fileprefix):
                body, expected = self._load_case(
                    "w3crdfa", fileprefix + ".html", fileprefix + ".expanded.json"
                )

                rdfae = RDFaExtractor()
                data = rdfae.extract(body, base_url=self._BASE_URL)
                self.assertJsonLDEqual(data, expected)

    def test_w3c_rdfaprimer(self):
        for i in [5, 6, 7, 8, 9, 10, 11, 15]:
            fileprefix = "w3c.rdfaprimer.example{:03d}".format(i)
            with self.subTest(fileprefix=fileprefix):
                body, expected = self._load_case(
                    "w3crdfa", fileprefix + ".html", fileprefix + ".expanded.json"
                )

                rdfae = RDFaExtractor()
                data = rdfae.extract(body, base_url=self._BASE_URL)
                self.assertJsonLDEqual(data, expected)

                # Check that the fix for issue 116 does not severely affect
                # the RDFa output even when the fix itself is buggy: the
                # extraction is repeated with _fix_order replaced by a
                # function that always raises.
                def mocked_fix_order(x, y, z):
                    raise Exception()

                rdfae._fix_order = mocked_fix_order  # type: ignore[assignment]
                data = rdfae.extract(body, base_url=self._BASE_URL)
                self.assertJsonLDEqual(data, expected)

    def test_wikipedia_xhtml_rdfa(self):
        fileprefix = "xhtml+rdfa"
        body, expected = self._load_case(
            "wikipedia", fileprefix + ".html", fileprefix + ".expanded.json"
        )

        rdfae = RDFaExtractor()
        data = rdfae.extract(body, base_url=self._BASE_URL)

        self.assertJsonLDEqual(data, expected)

    def test_wikipedia_xhtml_rdfa_no_prefix(self):
        body, expected = self._load_case(
            "misc", "Portfolio_Niels_Lubberman.html", "Portfolio_Niels_Lubberman.json"
        )

        rdfae = RDFaExtractor()
        data = rdfae.extract(body, base_url="http://nielslubberman.nl/drupal/")

        self.assertJsonLDEqual(data, expected)

    def test_expanded_opengraph_support(self):
        body, expected = self._load_case(
            "misc", "expanded_OG_support_test.html", "expanded_OG_support_test.json"
        )

        rdfae = RDFaExtractor()
        data = rdfae.extract(body, base_url=self._BASE_URL)

        self.assertJsonLDEqual(data, expected)