File: __init__.py

package info (click to toggle)
python-scrapy 2.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (106 lines) | stat: -rw-r--r-- 3,757 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
Item Loader

See documentation in docs/topics/loaders.rst
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

import itemloaders

from scrapy.item import Item
from scrapy.selector import Selector

if TYPE_CHECKING:
    from scrapy.http import TextResponse


class ItemLoader(itemloaders.ItemLoader):
    """
    A user-friendly abstraction to populate an :ref:`item <topics-items>` with data
    by applying :ref:`field processors <topics-loaders-processors>` to scraped data.
    When instantiated with a ``selector`` or a ``response`` it supports
    data extraction from web pages using :ref:`selectors <topics-selectors>`.

    :param item: The item instance to populate using subsequent calls to
        :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
        or :meth:`~ItemLoader.add_value`.
    :type item: scrapy.item.Item

    :param selector: The selector to extract data from, when using the
        :meth:`add_xpath`, :meth:`add_css`, :meth:`replace_xpath`, or
        :meth:`replace_css` method.
    :type selector: :class:`~scrapy.Selector` object

    :param response: The response used to construct the selector using the
        :attr:`default_selector_class`, unless the selector argument is given,
        in which case this argument is ignored.
    :type response: :class:`~scrapy.http.Response` object

    If no item is given, one is instantiated automatically using the class in
    :attr:`default_item_class`.

    The item, selector, response and remaining keyword arguments are
    assigned to the Loader context (accessible through the :attr:`context` attribute).

    .. attribute:: item

        The item object being parsed by this Item Loader.
        This is mostly used as a property so, when attempting to override this
        value, you may want to check out :attr:`default_item_class` first.

    .. attribute:: context

        The currently active :ref:`Context <loaders-context>` of this Item Loader.

    .. attribute:: default_item_class

        An :ref:`item <topics-items>` class (or factory), used to instantiate
        items when not given in the ``__init__`` method.

    .. attribute:: default_input_processor

        The default input processor to use for those fields which don't specify
        one.

    .. attribute:: default_output_processor

        The default output processor to use for those fields which don't specify
        one.

    .. attribute:: default_selector_class

        The class used to construct the :attr:`selector` of this
        :class:`ItemLoader`, if only a response is given in the ``__init__`` method.
        If a selector is given in the ``__init__`` method this attribute is ignored.
        This attribute is sometimes overridden in subclasses.

    .. attribute:: selector

        The :class:`~scrapy.Selector` object to extract data from.
        It's either the selector given in the ``__init__`` method or one created from
        the response given in the ``__init__`` method using the
        :attr:`default_selector_class`. This attribute is meant to be
        read-only.
    """

    default_item_class: type = Item
    default_selector_class = Selector

    def __init__(
        self,
        item: Any = None,
        selector: Selector | None = None,
        response: TextResponse | None = None,
        parent: itemloaders.ItemLoader | None = None,
        **context: Any,
    ):
        if selector is None and response is not None:
            try:
                selector = self.default_selector_class(response)
            except AttributeError:
                selector = None
        context.update(response=response)
        super().__init__(item=item, selector=selector, parent=parent, **context)