1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
|
from parsel import Selector
from itemloaders import ItemLoader
class TestSubselectorLoader:
    """Tests for nested loaders derived from a parent ``ItemLoader``.

    Every test builds a fresh parent loader over the shared ``selector``
    fixture and then creates nested loaders through ``nested_xpath`` /
    ``nested_css``.
    """

    # Shared HTML fixture: a header holding a div and a paragraph, and a
    # footer holding a link and an image.
    selector = Selector(
        text="""
<html>
<body>
<header>
<div id="id">marta</div>
<p>paragraph</p>
</header>
<footer class="footer">
<a href="http://www.scrapy.org">homepage</a>
<img src="/images/logo.png" width="244" height="65" alt="Scrapy">
</footer>
</body>
</html>
"""
    )

    def _fill_and_check_header(self, parent, header_loader):
        """Populate *header_loader* and check the values seen via *parent*.

        Three fields are added through the nested loader (by XPath, by CSS
        and as a plain value taken from the nested selector); each must be
        readable from the parent and match what the nested loader reports.
        """
        header_loader.add_xpath("name", "div/text()")
        header_loader.add_css("name_div", "#id")
        assert header_loader.selector
        div_texts = header_loader.selector.xpath('div[@id = "id"]/text()').getall()
        header_loader.add_value("name_value", div_texts)

        expected = {
            "name": ["marta"],
            "name_div": ['<div id="id">marta</div>'],
            "name_value": ["marta"],
        }
        # The parent exposes the values collected through the nested loader...
        for field, value in expected.items():
            assert parent.get_output_value(field) == value
        # ...and both loaders agree on every field.
        for field in expected:
            assert parent.get_output_value(field) == header_loader.get_output_value(
                field
            )

    def test_nested_xpath(self):
        """A loader nested via XPath feeds its values into the parent."""
        parent = ItemLoader(selector=self.selector)
        self._fill_and_check_header(parent, parent.nested_xpath("//header"))

    def test_nested_css(self):
        """A loader nested via CSS feeds its values into the parent."""
        parent = ItemLoader(selector=self.selector)
        self._fill_and_check_header(parent, parent.nested_css("header"))

    def test_nested_replace(self):
        """``replace_xpath`` on a nested loader replaces the parent's value."""
        parent = ItemLoader(selector=self.selector)
        footer_loader = parent.nested_xpath("//footer")
        link_loader = footer_loader.nested_xpath("a")

        homepage = ["http://www.scrapy.org"]
        logo = ["/images/logo.png"]

        parent.add_xpath("url", "//footer/a/@href")
        assert parent.get_output_value("url") == homepage

        # Replacing one level down swaps the value seen from the top.
        footer_loader.replace_xpath("url", "img/@src")
        assert parent.get_output_value("url") == logo

        # Replacing two levels down does the same.
        link_loader.replace_xpath("url", "@href")
        assert parent.get_output_value("url") == homepage

    def test_nested_ordering(self):
        """Values keep the order of the ``add_xpath`` calls across loaders."""
        parent = ItemLoader(selector=self.selector)
        footer_loader = parent.nested_xpath("//footer")
        link_loader = footer_loader.nested_xpath("a")

        footer_loader.add_xpath("url", "img/@src")
        parent.add_xpath("url", "//footer/a/@href")
        link_loader.add_xpath("url", "text()")
        parent.add_xpath("url", "//footer/a/@href")

        collected = parent.get_output_value("url")
        assert collected == [
            "/images/logo.png",
            "http://www.scrapy.org",
            "homepage",
            "http://www.scrapy.org",
        ]

    def test_nested_load_item(self):
        """Parent and nested loaders all populate the very same item."""
        parent = ItemLoader(selector=self.selector)
        footer_loader = parent.nested_xpath("//footer")
        image_loader = footer_loader.nested_xpath("img")

        parent.add_xpath("name", "//header/div/text()")
        footer_loader.add_xpath("url", "a/@href")
        image_loader.add_xpath("image", "@src")

        item = parent.load_item()

        # One shared item object at every nesting level.
        for ldr in (parent, footer_loader, image_loader):
            assert item is ldr.item

        assert item["name"] == ["marta"]
        assert item["url"] == ["http://www.scrapy.org"]
        assert item["image"] == ["/images/logo.png"]

    def test_nested_empty_selector(self):
        """Nesting on a query that matches nothing still yields a loader."""
        parent = ItemLoader(selector=self.selector)

        # The fixture has no <bar> element, so both nested selectors are empty.
        via_xpath = parent.nested_xpath("//bar")
        assert isinstance(via_xpath, ItemLoader)
        via_xpath.add_xpath("foo", "./foo")

        via_css = parent.nested_css("bar")
        assert isinstance(via_css, ItemLoader)
        via_css.add_css("foo", "foo")
|