File: lxmlloader.py

package info (click to toggle)
python-scrapy 0.8-3
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 2,904 kB
  • ctags: 2,981
  • sloc: python: 15,349; xml: 199; makefile: 68; sql: 64; sh: 34
file content (34 lines) | stat: -rw-r--r-- 1,069 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from lxml import html, etree

from scrapy.contrib.loader import ItemLoader


class LxmlItemLoader(ItemLoader):
    """ItemLoader that collects field values from an lxml-parsed response.

    The response body is parsed once with ``lxml.html.fromstring`` when the
    loader is created. Values are then gathered by evaluating XPath
    expressions or CSS selectors against that tree and handing the results
    to ``add_value`` / ``replace_value``.
    """

    def __init__(self, response, item=None, **context):
        """Parse ``response`` and initialise the underlying loader.

        :param response: object exposing ``body_as_unicode()``; the returned
            text is parsed into ``self.tree``. The response itself is stored
            in the loader context under the ``response`` key.
        :param item: forwarded unchanged to ``ItemLoader.__init__``.
        :param context: extra context entries forwarded to
            ``ItemLoader.__init__`` as keyword arguments.
        """
        self.tree = html.fromstring(response.body_as_unicode())
        context.update(response=response)
        super(LxmlItemLoader, self).__init__(item, **context)

    def add_xpath(self, field_name, xpath):
        """Add the results of ``xpath`` to the values of ``field_name``."""
        self.add_value(field_name, self._get_xpath(xpath))

    def replace_xpath(self, field_name, xpath):
        """Replace the values of ``field_name`` with the results of ``xpath``."""
        self.replace_value(field_name, self._get_xpath(xpath))

    def _get_xpath(self, xpath):
        """Evaluate ``xpath`` on the parsed tree and normalise the results."""
        return self._get_values(self.tree.xpath(xpath))

    def add_css(self, field_name, css):
        """Add the matches of CSS selector ``css`` to ``field_name``."""
        self.add_value(field_name, self._get_css(css))

    def replace_css(self, field_name, css):
        """Replace the values of ``field_name`` with the matches of ``css``."""
        self.replace_value(field_name, self._get_css(css))

    def _get_css(self, css):
        """Evaluate CSS selector ``css`` on the parsed tree and normalise the
        results."""
        return self._get_values(self.tree.cssselect(css))

    def _get_values(self, elems):
        """Yield one plain value per selection result.

        Element-like nodes (elements, comments, processing instructions) are
        serialised with ``etree.tostring``; anything else (text or attribute
        results, numbers, booleans) is yielded unchanged.

        :param elems: the result of ``xpath()`` or ``cssselect()``. XPath
            only returns a list for node-set expressions; ``string(...)``,
            ``count(...)`` and ``boolean(...)`` return a bare scalar.
        """
        # A scalar XPath result must be treated as ONE value: iterating a
        # string would split it into characters, and iterating a float or
        # bool would raise TypeError.
        if not isinstance(elems, (list, tuple)):
            elems = [elems]
        for node in elems:
            if etree.iselement(node):
                # with_tail=False: by default tostring() appends the text
                # that follows the closing tag, which belongs to the parent
                # and is not part of the selected node.
                yield etree.tostring(node, with_tail=False)
            else:
                yield node