"""
Item Loader

See documentation in docs/topics/loaders.rst
"""
from collections import defaultdict
import re
from scrapy.item import Item
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.misc import arg_to_iter, extract_regex
from scrapy.utils.python import flatten
from .common import wrap_loader_context
from .processor import Identity
class ItemLoader(object):
    """Collect values for the fields of an item and populate it on demand.

    Values are run through a per-field input processor as they are added,
    stored per field, and run through a per-field output processor when the
    item is loaded.  A processor is looked up as a ``<field>_in`` /
    ``<field>_out`` attribute on the loader, then (only when the item is an
    ``Item``) under the ``input_processor`` / ``output_processor`` key of the
    field's metadata, and finally falls back to the loader defaults below.
    """

    # Class instantiated when no item is passed to the constructor.
    default_item_class = Item
    # Fallback processors used when neither the loader nor the item field
    # declares one.
    default_input_processor = Identity()
    default_output_processor = Identity()

    def __init__(self, item=None, **context):
        """Create a loader for ``item``.

        If ``item`` is None a new ``default_item_class`` instance is created.
        The remaining keyword arguments become the loader context, to which
        the item itself is added under the ``'item'`` key.
        """
        if item is None:
            item = self.default_item_class()
        self.item = context['item'] = item
        self.context = context
        # Collected (input-processed) values, keyed by field name.
        self._values = defaultdict(list)

    def add_value(self, field_name, value, *processors, **kw):
        """Process ``value`` and append the result to ``field_name``.

        ``value`` first goes through :meth:`get_value` (``processors`` and the
        optional ``re`` keyword).  A ``None`` result is ignored.  When
        ``field_name`` is falsy the result must be a mapping of field names to
        values; each entry is added to its own field.
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            # items() rather than iteritems(): same pairs, and it exists on
            # both Python 2 and Python 3 mappings.
            for k, v in value.items():
                self._add_value(k, v)
        else:
            self._add_value(field_name, value)

    def replace_value(self, field_name, value, *processors, **kw):
        """Like :meth:`add_value`, but discard previously collected values.

        A ``None`` result from :meth:`get_value` leaves the collected values
        untouched.
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            for k, v in value.items():
                self._replace_value(k, v)
        else:
            self._replace_value(field_name, value)

    def _add_value(self, field_name, value):
        """Run ``value`` through the input processor and store the result.

        Falsy processor results are not stored.
        """
        value = arg_to_iter(value)
        processed_value = self._process_input_value(field_name, value)
        if processed_value:
            self._values[field_name] += arg_to_iter(processed_value)

    def _replace_value(self, field_name, value):
        """Drop whatever was collected for ``field_name``, then add ``value``."""
        self._values.pop(field_name, None)
        self._add_value(field_name, value)

    def get_value(self, value, *processors, **kw):
        """Return ``value`` after the optional regex step and ``processors``.

        If a truthy ``re`` keyword is given, ``extract_regex`` is applied to
        every element of ``value`` and the results are flattened.  Each
        processor is then called in order, wrapped with the loader context;
        the chain stops as soon as the value becomes ``None``.
        """
        regex = kw.get('re', None)
        if regex:
            value = arg_to_iter(value)
            value = flatten([extract_regex(regex, x) for x in value])
        for proc in processors:
            if value is None:
                break
            proc = wrap_loader_context(proc, self.context)
            value = proc(value)
        return value

    def load_item(self):
        """Assign the output value of every collected field and return the item."""
        item = self.item
        # Iterate over a snapshot of the keys so that a processor touching
        # the loader cannot invalidate the iteration.
        for field_name in tuple(self._values):
            item[field_name] = self.get_output_value(field_name)
        return item

    def get_output_value(self, field_name):
        """Return the collected values of ``field_name`` after output processing.

        Raises ``ValueError`` describing the field, its collected values and
        the original error if the output processor fails.
        """
        proc = self.get_output_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        # .get() instead of indexing: indexing the defaultdict would insert an
        # empty entry that load_item() would later write into the item.
        values = self._values.get(field_name, [])
        try:
            return proc(values)
        except Exception as e:
            raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" %
                             (field_name, values, type(e).__name__, str(e)))

    def get_collected_values(self, field_name):
        """Return the values collected so far for ``field_name``.

        For a field with no collected values an empty list is returned and
        the loader state is left unchanged.
        """
        # .get() instead of indexing, so a read never registers a new field.
        return self._values.get(field_name, [])

    def get_input_processor(self, field_name):
        """Return the input processor to use for ``field_name``."""
        proc = getattr(self, '%s_in' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(field_name, 'input_processor',
                                             self.default_input_processor)
        return proc

    def get_output_processor(self, field_name):
        """Return the output processor to use for ``field_name``."""
        proc = getattr(self, '%s_out' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(field_name, 'output_processor',
                                             self.default_output_processor)
        return proc

    def _process_input_value(self, field_name, value):
        """Apply the context-wrapped input processor of ``field_name`` to ``value``."""
        proc = self.get_input_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        return proc(value)

    def _get_item_field_attr(self, field_name, key, default=None):
        """Return metadata entry ``key`` of the item's ``field_name`` field.

        ``default`` is returned when the entry is missing or when the item is
        not an ``Item`` instance.  For an ``Item``, a field name that is not
        in ``item.fields`` raises ``KeyError``.
        """
        if isinstance(self.item, Item):
            value = self.item.fields[field_name].get(key, default)
        else:
            value = default
        return value
class XPathItemLoader(ItemLoader):
    """Item loader whose values can be pulled from a selector via XPath.

    The loader is built from either a ready-made selector or a response; in
    the latter case the selector is created with ``default_selector_class``.
    """

    # Selector class used to wrap a response when no selector is supplied.
    default_selector_class = HtmlXPathSelector

    def __init__(self, item=None, selector=None, response=None, **context):
        """Set up the loader from ``selector`` or, failing that, ``response``.

        Raises ``RuntimeError`` when neither is given.  Both the selector and
        the response are added to the loader context.
        """
        if selector is None:
            if response is None:
                raise RuntimeError(
                    "%s must be instantiated with a selector or response"
                    % self.__class__.__name__)
            selector = self.default_selector_class(response)
        context['selector'] = selector
        context['response'] = response
        self.selector = selector
        super(XPathItemLoader, self).__init__(item, **context)

    def add_xpath(self, field_name, xpath, *processors, **kw):
        """Extract the data matched by ``xpath`` and add it to ``field_name``."""
        self.add_value(field_name, self._get_values(xpath, **kw),
                       *processors, **kw)

    def replace_xpath(self, field_name, xpath, *processors, **kw):
        """Extract the data matched by ``xpath`` and use it to replace the
        values collected for ``field_name``."""
        self.replace_value(field_name, self._get_values(xpath, **kw),
                           *processors, **kw)

    def get_xpath(self, xpath, *processors, **kw):
        """Return the data matched by ``xpath`` after :meth:`get_value`
        processing, without storing it."""
        return self.get_value(self._get_values(xpath, **kw),
                              *processors, **kw)

    def _get_values(self, xpaths, **kw):
        """Return the flattened extraction results of one or more XPaths.

        Keyword arguments are accepted but not used here.
        """
        extracted = []
        for expression in arg_to_iter(xpaths):
            extracted.append(self.selector.select(expression).extract())
        return flatten(extracted)
|