File: test_extruct.py

package info (click to toggle)
extruct 0.18.0-1
  • links: PTS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,048 kB
  • sloc: python: 2,106; makefile: 10
file content (94 lines) | stat: -rw-r--r-- 3,110 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# mypy: disallow_untyped_defs=False
import json
import unittest

import pytest

import extruct
from extruct.utils import parse_html
from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id


class TestGeneric(unittest.TestCase):
    """End-to-end checks of ``extruct.extract`` against stored fixtures."""

    # Let unittest print complete diffs; the compared payloads are large.
    maxDiff = None

    # Base URL shared by the songkick fixture tests.
    _SONGKICK_URL = "http://www.songkick.com/artists/236156-elysian-fields"

    @staticmethod
    def _load_json(directory, filename):
        """Return the decoded JSON fixture ``filename`` from ``directory``."""
        return json.loads(get_testdata(directory, filename).decode("UTF-8"))

    def test_all(self):
        """Extraction with default syntaxes matches the stored JSON fixture."""
        body = get_testdata("songkick", "elysianfields.html")
        expected = self._load_json("songkick", "elysianfields.json")
        data = extruct.extract(body, base_url=self._SONGKICK_URL)

        self.assertEqual(jsonize_dict(data), expected)

    def test_rdfa_is_preserving_order(self):
        """The ``rdfa`` part of the result matches the fixture exactly."""
        # See https://github.com/scrapinghub/extruct/issues/116
        body = get_testdata("songkick", "elysianfields_1.html")
        expected = self._load_json("songkick", "elysianfields_1.json")
        data = extruct.extract(body, base_url=self._SONGKICK_URL)

        self.assertEqual(jsonize_dict(data)["rdfa"], expected["rdfa"])

    def test_microdata_custom_url(self):
        """``extract`` accepts an already parsed tree instead of raw HTML."""
        body, expected = self._microdata_custom_url("product_custom_url.json")
        tree = parse_html(body, encoding="UTF-8")
        data = extruct.extract(
            tree, base_url="http://some-example.com", syntaxes=["microdata"]
        )
        self.assertEqual(data, expected)

    def test_microdata_with_returning_node(self):
        """``return_html_node=True`` output matches once nodes become ids."""
        body, expected = self._microdata_custom_url(
            "product_custom_url_and_node_id.json"
        )
        data = extruct.extract(
            body,
            base_url="http://some-example.com",
            syntaxes=["microdata"],
            return_html_node=True,
        )
        # Swap node references for ids in place so the result can be
        # compared with the JSON fixture.
        replace_node_ref_with_node_id(data)
        self.assertEqual(data, expected)

    def test_deprecated_url(self):
        """The ``url`` keyword warns as deprecated but still extracts."""
        body, expected = self._microdata_custom_url("product_custom_url.json")
        with pytest.warns(DeprecationWarning):
            data = extruct.extract(
                body, url="http://some-example.com", syntaxes=["microdata"]
            )
        self.assertEqual(data, expected)

    def test_extra_kwargs(self):
        """Unknown keyword arguments are rejected with ``TypeError``."""
        body, _ = self._microdata_custom_url("product_custom_url.json")
        with self.assertRaises(TypeError):
            extruct.extract(body, foo="bar")  # type: ignore[call-arg]

    def _microdata_custom_url(self, test_file):
        """Return the schema.org product page and its expected result.

        ``test_file`` names the JSON fixture in the ``schema.org`` test data
        directory; its content is wrapped under the ``"microdata"`` key so it
        can be compared directly with the ``extract`` output.
        """
        body = get_testdata("schema.org", "product.html")
        expected = {"microdata": self._load_json("schema.org", test_file)}
        return body, expected

    def test_errors(self):
        """An empty body raises by default and yields ``{}`` otherwise."""
        body = ""

        # Default error handling: the exception propagates to the caller.
        # NOTE(review): the concrete exception type is not visible from this
        # file, so the broad ``Exception`` check is kept as is.
        with self.assertRaises(Exception):
            extruct.extract(body)

        # "ignore" and "log" both suppress the exception; the test expects an
        # empty result from each.
        for mode in ("ignore", "log"):
            with self.subTest(errors=mode):
                self.assertEqual(extruct.extract(body, errors=mode), {})