File: base.py

package info (click to toggle)
python-scrapy 2.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (110 lines) | stat: -rw-r--r-- 3,844 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy import Request, Spider

if TYPE_CHECKING:
    from collections.abc import AsyncIterator, Iterable

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.http import Response


class BaseSpiderMiddleware:
    """Optional base class for spider middlewares.

    .. versionadded:: 2.13

    The class implements ``process_start_requests()``, ``process_start()``,
    ``process_spider_output()`` and ``process_spider_output_async()`` on top
    of two per-object hooks, so a subclass only has to decide what happens to
    one request or one item at a time. A middleware that defines neither
    ``process_spider_output()`` nor ``process_start()`` gains nothing from
    inheriting from it.

    Override
    :meth:`~scrapy.spidermiddlewares.base.BaseSpiderMiddleware.get_processed_request`
    to handle requests and
    :meth:`~scrapy.spidermiddlewares.base.BaseSpiderMiddleware.get_processed_item`
    to handle items. Each hook receives a single object taken from the start
    seeds or from the spider output and returns the object to pass on (the
    same one or a replacement), or ``None`` to drop it.
    """

    def __init__(self, crawler: Crawler):
        # Kept on the instance so subclasses can reach the crawler.
        self.crawler: Crawler = crawler

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """Build the middleware from *crawler* by calling the constructor."""
        return cls(crawler)

    def process_start_requests(
        self, start: Iterable[Any], spider: Spider
    ) -> Iterable[Any]:
        """Lazily filter the start seeds through the per-object hooks.

        Start seeds have no associated response, so the hooks receive
        ``None`` as their ``response`` argument.
        """
        for seed in start:
            processed = self._get_processed(seed, None)
            if processed is None:
                # The hook asked for this object to be dropped.
                continue
            yield processed

    async def process_start(self, start: AsyncIterator[Any]) -> AsyncIterator[Any]:
        """Asynchronous counterpart of :meth:`process_start_requests`."""
        async for seed in start:
            processed = self._get_processed(seed, None)
            if processed is None:
                continue
            yield processed

    def process_spider_output(
        self, response: Response, result: Iterable[Any], spider: Spider
    ) -> Iterable[Any]:
        """Lazily filter the spider output for *response* through the hooks."""
        for entry in result:
            processed = self._get_processed(entry, response)
            if processed is None:
                # The hook asked for this object to be dropped.
                continue
            yield processed

    async def process_spider_output_async(
        self, response: Response, result: AsyncIterator[Any], spider: Spider
    ) -> AsyncIterator[Any]:
        """Asynchronous counterpart of :meth:`process_spider_output`."""
        async for entry in result:
            processed = self._get_processed(entry, response)
            if processed is None:
                continue
            yield processed

    def _get_processed(self, o: Any, response: Response | None) -> Any:
        """Send *o* to the request hook or the item hook and return the result.

        Instances of :class:`~scrapy.Request` go to
        :meth:`get_processed_request`; every other object is treated as an
        item and goes to :meth:`get_processed_item`.
        """
        hook = (
            self.get_processed_request
            if isinstance(o, Request)
            else self.get_processed_item
        )
        return hook(o, response)

    def get_processed_request(
        self, request: Request, response: Response | None
    ) -> Request | None:
        """Process one request coming from the start seeds or the spider output.

        The base implementation passes the request through untouched.
        Subclasses may return the same request, a different one, or ``None``
        to have it dropped.

        :param request: the input request
        :type request: :class:`~scrapy.Request` object

        :param response: the response being processed
        :type response: :class:`~scrapy.http.Response` object or ``None`` for
            start seeds

        :return: the processed request or ``None``
        """
        return request

    def get_processed_item(self, item: Any, response: Response | None) -> Any:
        """Process one item coming from the start seeds or the spider output.

        The base implementation passes the item through untouched. Subclasses
        may return the same item, a different one, or ``None`` to have it
        dropped.

        :param item: the input item
        :type item: item object

        :param response: the response being processed
        :type response: :class:`~scrapy.http.Response` object or ``None`` for
            start seeds

        :return: the processed item or ``None``
        """
        return item