File: test_extruct_uniform.py

package info (click to toggle)
extruct 0.18.0-1
  • links: PTS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,048 kB
  • sloc: python: 2,106; makefile: 10
file content (71 lines) | stat: -rw-r--r-- 2,632 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# mypy: disallow_untyped_defs=False
import json
import unittest

import extruct
from extruct.utils import parse_html
from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id


class TestFlatten(unittest.TestCase):

    maxDiff = None

    def test_microdata(self):
        body, tree, expected = self._testdata_html_and_tree(
            "schema.org",
            "CreativeWork.001.html",
            "CreativeWork_flat.001.json",
        )
        syntax = "microdata"
        data = extruct.extract(body, uniform=True, syntaxes=[syntax])
        self.assertEqual(jsonize_dict(data[syntax]), expected[syntax])

        data = extruct.extract(tree, uniform=True, syntaxes=[syntax])
        self.assertEqual(jsonize_dict(data[syntax]), expected[syntax])

    def test_opengraph(self):
        body, tree, expected = self._testdata_html_and_tree(
            "misc",
            "opengraph_test.html",
            "opengraph_flat_test.json",
        )
        syntax = "opengraph"
        data = extruct.extract(body, uniform=True, syntaxes=[syntax])
        self.assertEqual(jsonize_dict(data[syntax]), expected)

        data = extruct.extract(tree, uniform=True, syntaxes=[syntax])
        self.assertEqual(jsonize_dict(data[syntax]), expected)

    def test_microdata_with_returning_node(self):
        body, tree, expected = self._testdata_html_and_tree(
            "schema.org",
            "CreativeWork.001.html",
            "CreativeWork_flat_with_node_id.001.json",
        )
        syntax = "microdata"
        data = extruct.extract(
            body, uniform=True, return_html_node=True, syntaxes=[syntax]
        )
        replace_node_ref_with_node_id(data[syntax])
        self.assertEqual(jsonize_dict(data[syntax]), expected[syntax])

        data = extruct.extract(
            tree, uniform=True, return_html_node=True, syntaxes=[syntax]
        )
        replace_node_ref_with_node_id(data[syntax])
        self.assertEqual(jsonize_dict(data[syntax]), expected[syntax])

    def test_microformat(self):
        body = get_testdata("misc", "microformat_test.html")
        expected = json.loads(
            get_testdata("misc", "microformat_flat_test.json").decode("UTF-8")
        )
        data = extruct.extract(body, uniform=True, syntaxes=["microformat"])
        self.assertEqual(jsonize_dict(data["microformat"]), expected)

    def _testdata_html_and_tree(self, root, path1, path2):
        body = get_testdata(root, path1)
        tree = parse_html(body, encoding="UTF-8")
        expected = json.loads(get_testdata(root, path2).decode("UTF-8"))
        return body, tree, expected