File: __init__.py

package info (click to toggle)
python-itemloaders 1.3.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 320 kB
  • sloc: python: 1,547; makefile: 78
file content (650 lines) | stat: -rw-r--r-- 23,058 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
"""
Item Loader

See documentation in docs/topics/loaders.rst
"""

from __future__ import annotations

from contextlib import suppress
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    MutableMapping,
    Optional,
    Pattern,
    Union,
)

from itemadapter import ItemAdapter
from parsel import Selector
from parsel.utils import extract_regex, flatten

from itemloaders.common import wrap_loader_context
from itemloaders.processors import Identity
from itemloaders.utils import arg_to_iter

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self


def unbound_method(method: Callable[..., Any]) -> Callable[..., Any]:
    """
    Return the plain function behind ``method`` when it looks like a
    module-level function that got bound through a class attribute, so that
    single-argument functions can serve as input or output processors
    without declaring an unused leading ``self`` parameter.

    Anything else (real methods, callables without ``__qualname__`` or
    without ``__func__``) is returned unchanged.
    """
    try:
        qualified_name = method.__qualname__
        # A dotted qualified name means the callable was defined inside a
        # class (or another scope); leave it bound.
        if "." in qualified_name:
            return method
        return method.__func__  # type: ignore[attr-defined, no-any-return]
    except AttributeError:
        # No ``__qualname__`` (e.g. a callable instance) or no ``__func__``
        # (e.g. a plain function): nothing to unwrap.
        return method


class ItemLoader:
    """
    Return a new Item Loader for populating the given item. If no item is
    given, one is instantiated automatically using the class in
    :attr:`default_item_class`.

    When instantiated with a ``selector`` parameter the :class:`ItemLoader` class
    provides convenient mechanisms for extracting data from web pages
    using parsel_ selectors.

    :param item: The item instance to populate using subsequent calls to
        :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
        :meth:`~ItemLoader.add_jmes` or :meth:`~ItemLoader.add_value`.
    :type item: :class:`dict` object

    :param selector: The selector to extract data from, when using the
        :meth:`add_xpath` (resp. :meth:`add_css`, :meth:`add_jmes`) or :meth:`replace_xpath`
        (resp. :meth:`replace_css`, :meth:`replace_jmes`) method.
    :type selector: :class:`~parsel.selector.Selector` object

    :param parent: The loader this one is nested in, if any. A nested loader
        reads and writes its parent's item and collected values instead of
        keeping its own. Normally set by :meth:`nested_xpath` and
        :meth:`nested_css` rather than passed directly.
    :type parent: :class:`ItemLoader` object

    The item, selector and the remaining keyword arguments are
    assigned to the Loader context (accessible through the :attr:`context` attribute).

    .. attribute:: item

        The item object being parsed by this Item Loader.
        This is mostly used as a property so when attempting to override this
        value, you may want to check out :attr:`default_item_class` first.

    .. attribute:: context

        The currently active :ref:`Context <loaders-context>` of this Item Loader.
        Refer to :ref:`loaders-context` for more information about the Loader Context.

    .. attribute:: default_item_class

        An Item class (or factory), used to instantiate items when not given in
        the ``__init__`` method.

        .. warning:: Currently, this factory/class needs to be
            callable/instantiated without any arguments.
            If you are using ``dataclasses``, please consider the following
            alternative::

                from dataclasses import dataclass, field
                from typing import Optional

                @dataclass
                class Product:
                    name: Optional[str] = field(default=None)
                    price: Optional[float] = field(default=None)

    .. attribute:: default_input_processor

        The default input processor to use for those fields which don't specify
        one.

    .. attribute:: default_output_processor

        The default output processor to use for those fields which don't specify
        one.

    .. attribute:: selector

        The :class:`~parsel.selector.Selector` object to extract data from.
        It's the selector given in the ``__init__`` method.
        This attribute is meant to be read-only.

    .. _parsel: https://parsel.readthedocs.io/en/latest/
    """

    default_item_class: type = dict
    default_input_processor: Callable[..., Any] = Identity()
    default_output_processor: Callable[..., Any] = Identity()

    def __init__(
        self,
        item: Any = None,
        selector: Optional[Selector] = None,
        parent: Optional[ItemLoader] = None,
        **context: Any,
    ):
        self.selector: Optional[Selector] = selector
        context.update(selector=selector)
        if item is None:
            item = self.default_item_class()
        self._local_item = item
        context["item"] = item
        self.context: MutableMapping[str, Any] = context
        self.parent: Optional[ItemLoader] = parent
        self._local_values: Dict[str, List[Any]] = {}
        # Seed the collected values from the initial item, but only for a
        # root loader. A nested loader's ``_values`` resolves to its parent's
        # store, which was already seeded when the parent was created;
        # seeding again here would append every pre-populated field a second
        # time each time a nested loader is created.
        if parent is None:
            for field_name, value in ItemAdapter(item).items():
                self._values.setdefault(field_name, [])
                self._values[field_name] += arg_to_iter(value)

    @property
    def _values(self) -> Dict[str, List[Any]]:
        """
        Mapping of field name to the list of values collected so far.

        Nested loaders delegate to their parent, so a whole chain of loaders
        shares a single store.
        """
        if self.parent is not None:
            return self.parent._values
        else:
            return self._local_values

    @property
    def item(self) -> Any:
        """
        The item being populated; nested loaders return their parent's item.
        """
        if self.parent is not None:
            return self.parent.item
        else:
            return self._local_item

    def nested_xpath(self, xpath: str, **context: Any) -> Self:
        """
        Create a nested loader with an xpath selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.

        :param xpath: the XPath, evaluated against this loader's selector
        :param context: extra entries for the nested loader's context

        :raises RuntimeError: if this loader has no selector
        """
        self._check_selector_method()
        assert self.selector is not None
        selector = self.selector.xpath(xpath)
        context.update(selector=selector)
        subloader = self.__class__(item=self.item, parent=self, **context)
        return subloader

    def nested_css(self, css: str, **context: Any) -> Self:
        """
        Create a nested loader with a css selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.

        :param css: the CSS selector, evaluated against this loader's selector
        :param context: extra entries for the nested loader's context

        :raises RuntimeError: if this loader has no selector
        """
        self._check_selector_method()
        assert self.selector is not None
        selector = self.selector.css(css)
        context.update(selector=selector)
        subloader = self.__class__(item=self.item, parent=self, **context)
        return subloader

    def add_value(
        self,
        field_name: Optional[str],
        value: Any,
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Process and then add the given ``value`` for the given field.

        The value is first passed through :meth:`get_value` by giving the
        ``processors`` and ``kwargs``, and then passed through the
        :ref:`field input processor <processors>` and its result
        appended to the data collected for that field. If the field already
        contains collected data, the new data is added.

        The given ``field_name`` can be ``None``, in which case values for
        multiple fields may be added. And the processed value should be a dict
        with field_name mapped to values.

        If :meth:`get_value` returns ``None``, nothing is added.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        Examples::

            loader.add_value('name', 'Color TV')
            loader.add_value('colours', ['white', 'blue'])
            loader.add_value('length', '100')
            loader.add_value('name', 'name: foo', TakeFirst(), re='name: (.+)')
            loader.add_value(None, {'name': 'foo', 'sex': 'male'})

        """
        value = self.get_value(value, *processors, re=re, **kw)
        if value is None:
            return self
        if not field_name:
            # No field name: ``value`` is expected to map field names to
            # values, each of which is added to its own field.
            for k, v in value.items():
                self._add_value(k, v)
        else:
            self._add_value(field_name, value)
        return self

    def replace_value(
        self,
        field_name: Optional[str],
        value: Any,
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`add_value` but replaces the collected data with the
        new value instead of adding it.

        If :meth:`get_value` returns ``None``, the collected data is left
        untouched.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader
        """
        value = self.get_value(value, *processors, re=re, **kw)
        if value is None:
            return self
        if not field_name:
            # No field name: ``value`` is expected to map field names to
            # values, each of which replaces the data of its own field.
            for k, v in value.items():
                self._replace_value(k, v)
        else:
            self._replace_value(field_name, value)
        return self

    def _add_value(self, field_name: str, value: Any) -> None:
        """
        Run ``value`` through the field's input processor and append the
        result to the values collected for ``field_name``.

        A falsy processed result is discarded and leaves the field untouched.
        """
        value = arg_to_iter(value)
        processed_value = self._process_input_value(field_name, value)
        if processed_value:
            self._values.setdefault(field_name, [])
            self._values[field_name] += arg_to_iter(processed_value)

    def _replace_value(self, field_name: str, value: Any) -> None:
        """
        Drop whatever was collected for ``field_name`` and then add ``value``
        through :meth:`_add_value`.
        """
        self._values.pop(field_name, None)
        self._add_value(field_name, value)

    def get_value(
        self,
        value: Any,
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Any:
        """
        Process the given ``value`` by the given ``processors`` and keyword
        arguments.

        Processors are applied in order, each receiving the previous result;
        processing stops early as soon as the value becomes ``None``.

        Available keyword arguments:

        :param re: a regular expression to use for extracting data from the
            given value using :func:`~parsel.utils.extract_regex` method,
            applied before processors
        :type re: str or typing.Pattern[str]

        :raises ValueError: if a processor raises; the original exception is
            chained as the cause

        Examples:

        >>> from itemloaders import ItemLoader
        >>> from itemloaders.processors import TakeFirst
        >>> loader = ItemLoader()
        >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
        'FOO'
        """
        if re:
            value = arg_to_iter(value)
            value = flatten(extract_regex(re, x) for x in value)

        for proc in processors:
            if value is None:
                break
            # Keep the unwrapped processor so the error message can name it.
            _proc = proc
            proc = wrap_loader_context(proc, self.context)
            try:
                value = proc(value)
            except Exception as e:
                raise ValueError(
                    "Error with processor %s value=%r error='%s: %s'"
                    % (_proc.__class__.__name__, value, type(e).__name__, str(e))
                ) from e
        return value

    def load_item(self) -> Any:
        """
        Populate the item with the data collected so far, and return it. The
        data collected is first passed through the :ref:`output processors
        <processors>` to get the final value to assign to each item field.

        Fields whose output value is ``None`` are not assigned.
        """
        adapter = ItemAdapter(self.item)
        # Iterate over a snapshot of the field names (presumably so the loop
        # is unaffected if the store changes while processors run).
        for field_name in tuple(self._values):
            value = self.get_output_value(field_name)
            if value is not None:
                adapter[field_name] = value

        return adapter.item

    def get_output_value(self, field_name: str) -> Any:
        """
        Return the collected values parsed using the output processor, for the
        given field. This method doesn't populate or modify the item at all.

        :raises ValueError: if the output processor raises; the original
            exception is chained as the cause
        """
        proc = self.get_output_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        value = self._values.get(field_name, [])
        try:
            return proc(value)
        except Exception as e:
            raise ValueError(
                "Error with output processor: field=%r value=%r error='%s: %s'"
                % (field_name, value, type(e).__name__, str(e))
            ) from e

    def get_collected_values(self, field_name: str) -> List[Any]:
        """Return the collected values for the given field."""
        return self._values.get(field_name, [])

    def get_input_processor(self, field_name: str) -> Callable[..., Any]:
        """
        Return the input processor for ``field_name``.

        Lookup order: the loader attribute ``<field_name>_in``, then the
        ``input_processor`` entry of the item's field metadata, then
        :attr:`default_input_processor`.
        """
        proc = getattr(self, "%s_in" % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name, "input_processor", self.default_input_processor
            )
        return unbound_method(proc)

    def get_output_processor(self, field_name: str) -> Callable[..., Any]:
        """
        Return the output processor for ``field_name``.

        Lookup order: the loader attribute ``<field_name>_out``, then the
        ``output_processor`` entry of the item's field metadata, then
        :attr:`default_output_processor`.
        """
        proc = getattr(self, "%s_out" % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name, "output_processor", self.default_output_processor
            )
        return unbound_method(proc)

    def _get_item_field_attr(
        self, field_name: str, key: Any, default: Any = None
    ) -> Any:
        """
        Return entry ``key`` of the item's metadata for ``field_name``, or
        ``default`` when the metadata has no such entry.
        """
        field_meta = ItemAdapter(self.item).get_field_meta(field_name)
        return field_meta.get(key, default)

    def _process_input_value(self, field_name: str, value: Any) -> Any:
        """
        Apply the input processor of ``field_name`` to ``value`` and return
        the result.

        :raises ValueError: if the input processor raises; the original
            exception is chained as the cause
        """
        proc = self.get_input_processor(field_name)
        # Keep the unwrapped processor so the error message can name it.
        _proc = proc
        proc = wrap_loader_context(proc, self.context)
        try:
            return proc(value)
        except Exception as e:
            raise ValueError(
                "Error with input processor %s: field=%r value=%r "
                "error='%s: %s'"
                % (
                    _proc.__class__.__name__,
                    field_name,
                    value,
                    type(e).__name__,
                    str(e),
                )
            ) from e

    def _check_selector_method(self) -> None:
        """
        Ensure this loader has a selector.

        Called by every selector-based helper (XPath, CSS and JMESPath).

        :raises RuntimeError: if the loader was created without a selector
        """
        if self.selector is None:
            raise RuntimeError(
                "To use XPath or CSS selectors, %s "
                "must be instantiated with a selector" % self.__class__.__name__
            )

    def add_xpath(
        self,
        field_name: Optional[str],
        xpath: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath (or an iterable of XPaths) to extract data from
        :type xpath: str

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.add_value(field_name, values, *processors, re=re, **kw)

    def replace_xpath(
        self,
        field_name: Optional[str],
        xpath: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`add_xpath` but replaces collected data instead of adding it.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.replace_value(field_name, values, *processors, re=re, **kw)

    def get_xpath(
        self,
        xpath: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Any:
        """
        Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
        value, which is used to extract a list of unicode strings from the
        selector associated with this :class:`ItemLoader`.

        The extra keyword arguments are forwarded to the selector's
        ``xpath()`` call.

        :param xpath: the XPath (or an iterable of XPaths) to extract data from
        :type xpath: str

        :param re: a regular expression to use for extracting data from the
            selected XPath region
        :type re: str or typing.Pattern[str]

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_xpath('//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.get_value(values, *processors, re=re, **kw)

    def _get_xpathvalues(
        self, xpaths: Union[str, Iterable[str]], **kw: Any
    ) -> List[Any]:
        """
        Evaluate every XPath in ``xpaths`` against the loader's selector
        (forwarding ``kw`` to each ``xpath()`` call) and return all extracted
        results as one flat list.

        :raises RuntimeError: if this loader has no selector
        """
        self._check_selector_method()
        assert self.selector is not None
        xpaths = arg_to_iter(xpaths)
        return flatten(self.selector.xpath(xpath, **kw).getall() for xpath in xpaths)

    def add_css(
        self,
        field_name: Optional[str],
        css: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        See :meth:`get_css` for ``kwargs``.

        :param css: the CSS selector (or an iterable of them) to extract data from
        :type css: str

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_css('name', 'p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_css('price', 'p#price', re='the price is (.*)')

        """
        values = self._get_cssvalues(css)
        return self.add_value(field_name, values, *processors, re=re, **kw)

    def replace_css(
        self,
        field_name: Optional[str],
        css: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`add_css` but replaces collected data instead of adding it.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        """
        values = self._get_cssvalues(css)
        return self.replace_value(field_name, values, *processors, re=re, **kw)

    def get_css(
        self,
        css: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Any:
        """
        Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        :param css: the CSS selector (or an iterable of them) to extract data from
        :type css: str

        :param re: a regular expression to use for extracting data from the
            selected CSS region
        :type re: str or typing.Pattern[str]

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_css('p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_cssvalues(css)
        return self.get_value(values, *processors, re=re, **kw)

    def _get_cssvalues(self, csss: Union[str, Iterable[str]]) -> List[Any]:
        """
        Evaluate every CSS selector in ``csss`` against the loader's selector
        and return all extracted results as one flat list.

        :raises RuntimeError: if this loader has no selector
        """
        self._check_selector_method()
        assert self.selector is not None
        csss = arg_to_iter(csss)
        return flatten(self.selector.css(css).getall() for css in csss)

    def add_jmes(
        self,
        field_name: Optional[str],
        jmes: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`ItemLoader.add_value` but receives a JMESPath selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        See :meth:`get_jmes` for ``kwargs``.

        :param jmes: the JMESPath selector (or an iterable of them) to extract
            data from
        :type jmes: str

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader

        Examples::

            # JSON snippet: {"name": "Color TV"}
            loader.add_jmes('name', 'name')
            # JSON snippet: {"price": "the price is $1200"}
            loader.add_jmes('price', 'price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_jmesvalues(jmes)
        return self.add_value(field_name, values, *processors, re=re, **kw)

    def replace_jmes(
        self,
        field_name: Optional[str],
        jmes: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Self:
        """
        Similar to :meth:`add_jmes` but replaces collected data instead of adding it.

        :returns: The current ItemLoader instance for method chaining.
        :rtype: ItemLoader
        """
        values = self._get_jmesvalues(jmes)
        return self.replace_value(field_name, values, *processors, re=re, **kw)

    def get_jmes(
        self,
        jmes: Union[str, Iterable[str]],
        *processors: Callable[..., Any],
        re: Union[str, Pattern[str], None] = None,
        **kw: Any,
    ) -> Any:
        """
        Similar to :meth:`ItemLoader.get_value` but receives a JMESPath selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        :param jmes: the JMESPath selector (or an iterable of them) to extract
            data from
        :type jmes: str

        :param re: a regular expression to use for extracting data from the
            selected JMESPath
        :type re: str or typing.Pattern

        Examples::

            # JSON snippet: {"name": "Color TV"}
            loader.get_jmes('name')
            # JSON snippet: {"price": "the price is $1200"}
            loader.get_jmes('price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_jmesvalues(jmes)
        return self.get_value(values, *processors, re=re, **kw)

    def _get_jmesvalues(self, jmess: Union[str, Iterable[str]]) -> List[Any]:
        """
        Evaluate every JMESPath expression in ``jmess`` against the loader's
        selector and return all extracted results as one flat list.

        :raises RuntimeError: if this loader has no selector
        :raises AttributeError: if the selector has no ``jmespath`` method
        """
        self._check_selector_method()
        assert self.selector is not None
        jmess = arg_to_iter(jmess)
        if not hasattr(self.selector, "jmespath"):
            raise AttributeError(
                "Please install parsel >= 1.8.1 to get jmespath support"
            )
        return flatten(self.selector.jmespath(jmes).getall() for jmes in jmess)