1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
|
# mypy: disallow_untyped_defs=False
import json
import unittest
import pytest
import extruct
from extruct.utils import parse_html
from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id
class TestGeneric(unittest.TestCase):
    """Tests for the top-level ``extruct.extract`` entry point."""

    # Show complete diffs when an assertEqual on large structures fails.
    maxDiff = None

    def test_all(self):
        """Extract with default syntaxes and compare against the JSON fixture."""
        body = get_testdata("songkick", "elysianfields.html")
        expected = json.loads(
            get_testdata("songkick", "elysianfields.json").decode("UTF-8")
        )
        data = extruct.extract(
            body, base_url="http://www.songkick.com/artists/236156-elysian-fields"
        )
        self.assertEqual(jsonize_dict(data), expected)

    def test_rdfa_is_preserving_order(self):
        """Compare only the ``rdfa`` part of the output with the fixture."""
        # See https://github.com/scrapinghub/extruct/issues/116
        body = get_testdata("songkick", "elysianfields_1.html")
        expected = json.loads(
            get_testdata("songkick", "elysianfields_1.json").decode("UTF-8")
        )
        data = extruct.extract(
            body, base_url="http://www.songkick.com/artists/236156-elysian-fields"
        )
        self.assertEqual(jsonize_dict(data)["rdfa"], expected["rdfa"])

    def test_microdata_custom_url(self):
        """Extract microdata from a pre-parsed tree instead of the raw body."""
        body, expected = self._microdata_custom_url("product_custom_url.json")
        tree = parse_html(body, encoding="UTF-8")
        data = extruct.extract(
            tree, base_url="http://some-example.com", syntaxes=["microdata"]
        )
        self.assertEqual(data, expected)

    def test_microdata_with_returning_node(self):
        """Extract microdata with ``return_html_node=True``."""
        body, expected = self._microdata_custom_url(
            "product_custom_url_and_node_id.json"
        )
        data = extruct.extract(
            body,
            base_url="http://some-example.com",
            syntaxes=["microdata"],
            return_html_node=True,
        )
        # Applied in place before comparing with the JSON fixture.
        replace_node_ref_with_node_id(data)
        self.assertEqual(data, expected)

    def test_deprecated_url(self):
        """Passing ``url=`` must emit a DeprecationWarning and still extract."""
        body, expected = self._microdata_custom_url("product_custom_url.json")
        with pytest.warns(DeprecationWarning):
            data = extruct.extract(
                body, url="http://some-example.com", syntaxes=["microdata"]
            )
        self.assertEqual(data, expected)

    def test_extra_kwargs(self):
        """An unknown keyword argument must raise ``TypeError``."""
        body, _ = self._microdata_custom_url("product_custom_url.json")
        with self.assertRaises(TypeError):
            extruct.extract(body, foo="bar")  # type: ignore[call-arg]

    def _microdata_custom_url(self, test_file):
        """Return the schema.org product page and the expected microdata.

        ``test_file`` names the JSON fixture in the ``schema.org`` test data
        directory; its decoded content is wrapped under the ``"microdata"``
        key so it can be compared directly with the extraction result.
        """
        body = get_testdata("schema.org", "product.html")
        expected = {
            "microdata": json.loads(
                get_testdata("schema.org", test_file).decode("UTF-8")
            )
        }
        return body, expected

    def test_errors(self):
        """Check the outcome of extracting an empty body per ``errors`` mode."""
        body = ""

        # No ``errors`` argument: the call is expected to raise.
        with self.assertRaises(Exception):
            extruct.extract(body)

        # errors="ignore": no exception, empty result.
        data = extruct.extract(body, errors="ignore")
        self.assertEqual(data, {})

        # errors="log": no exception either, empty result.
        data = extruct.extract(body, errors="log")
        self.assertEqual(data, {})
|