"""
Base class for Scrapy spiders

See documentation in docs/topics/spiders.rst
"""

from __future__ import annotations

import logging
import warnings
from typing import TYPE_CHECKING, Any, cast

from scrapy import signals
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request, Response
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider

if TYPE_CHECKING:
    from collections.abc import AsyncIterator, Iterable

    from twisted.internet.defer import Deferred

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.http.request import CallbackT
    from scrapy.settings import BaseSettings, _SettingsKeyT
    from scrapy.utils.log import SpiderLoggerAdapter


class Spider(object_ref):
    """Base class that any spider must subclass.

    It provides a default :meth:`start` implementation that sends
    requests based on the :attr:`start_urls` class attribute and calls the
    :meth:`parse` method for each response.
    """

    name: str
    custom_settings: dict[_SettingsKeyT, Any] | None = None

    #: Start URLs. See :meth:`start`.
    start_urls: list[str]

    def __init__(self, name: str | None = None, **kwargs: Any):
        if name is not None:
            self.name: str = name
        elif not getattr(self, "name", None):
            raise ValueError(f"{type(self).__name__} must have a name")
        self.__dict__.update(kwargs)
        if not hasattr(self, "start_urls"):
            self.start_urls: list[str] = []
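
    # The ``logger`` property below returns a logger named after the spider,
    # wrapped in a SpiderLoggerAdapter so that every record carries the spider
    # instance in its ``extra`` data for Scrapy's log handling to use.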
    @property
    def logger(self) -> SpiderLoggerAdapter:
        from scrapy.utils.log import SpiderLoggerAdapter

        logger = logging.getLogger(self.name)
        return SpiderLoggerAdapter(logger, {"spider": self})

    def log(self, message: Any, level: int = logging.DEBUG, **kw: Any) -> None:
        """Log the given message at the given log level

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        """
        self.logger.log(level, message, **kw)
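
    # from_crawler() is the path Scrapy itself uses to build a spider: create
    # the instance, then bind the crawler via _set_crawler(). Subclasses that
    # need crawler or settings access at construction time usually override it
    # and call super(); a minimal sketch (the ``delay`` attribute is purely
    # illustrative):
    #
    #     @classmethod
    #     def from_crawler(cls, crawler, *args, **kwargs):
    #         spider = super().from_crawler(crawler, *args, **kwargs)
    #         spider.delay = crawler.settings.getfloat("DOWNLOAD_DELAY")
    #         return spider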
    @classmethod
    def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> Self:
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider
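
    # _set_crawler() stores the crawler and its settings on the spider and
    # connects close() to the spider_closed signal.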
    def _set_crawler(self, crawler: Crawler) -> None:
        self.crawler: Crawler = crawler
        self.settings: BaseSettings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)

    async def start(self) -> AsyncIterator[Any]:
        """Yield the initial :class:`~scrapy.Request` objects to send.

        .. versionadded:: 2.13

        For example:

        .. code-block:: python

            from scrapy import Request, Spider


            class MySpider(Spider):
                name = "myspider"

                async def start(self):
                    yield Request("https://toscrape.com/")

        The default implementation reads URLs from :attr:`start_urls` and
        yields a request for each with :attr:`~scrapy.Request.dont_filter`
        enabled. It is functionally equivalent to:

        .. code-block:: python

            async def start(self):
                for url in self.start_urls:
                    yield Request(url, dont_filter=True)

        You can also yield :ref:`items <topics-items>`. For example:

        .. code-block:: python

            async def start(self):
                yield {"foo": "bar"}

        To write spiders that work on Scrapy versions lower than 2.13,
        define also a synchronous ``start_requests()`` method that returns an
        iterable. For example:

        .. code-block:: python

            def start_requests(self):
                yield Request("https://toscrape.com/")

        .. seealso:: :ref:`start-requests`
        """
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", category=ScrapyDeprecationWarning, module=r"^scrapy\.spiders$"
            )
            for item_or_request in self.start_requests():
                yield item_or_request
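
    # Deprecated synchronous counterpart of start(), kept so that spiders
    # written for Scrapy < 2.13 keep working; the default start() above
    # delegates to it while filtering out the deprecation warning below.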
    def start_requests(self) -> Iterable[Any]:
        warnings.warn(
            (
                "The Spider.start_requests() method is deprecated, use "
                "Spider.start() instead. If you are calling "
                "super().start_requests() from a Spider.start() override, "
                "iterate super().start() instead."
            ),
            ScrapyDeprecationWarning,
            stacklevel=2,
        )
        if not self.start_urls and hasattr(self, "start_url"):
            raise AttributeError(
                "Crawling could not start: 'start_urls' not found "
                "or empty (but found 'start_url' attribute instead, "
                "did you miss an 's'?)"
            )
        for url in self.start_urls:
            yield Request(url, dont_filter=True)
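
    # _parse() is the internal wrapper used as the default request callback;
    # subclasses are expected to override parse() (or pass explicit callbacks
    # on their requests). Under TYPE_CHECKING, parse is only declared as a
    # CallbackT so overrides type-check; at runtime the fallback below raises.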
    def _parse(self, response: Response, **kwargs: Any) -> Any:
        return self.parse(response, **kwargs)

    if TYPE_CHECKING:
        parse: CallbackT
    else:

        def parse(self, response: Response, **kwargs: Any) -> Any:
            raise NotImplementedError(
                f"{self.__class__.__name__}.parse callback is not defined"
            )
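
    # update_settings() is called on the spider *class*, before any instance
    # exists, so that ``custom_settings`` can override project settings at
    # 'spider' priority. A minimal sketch of a subclass using it:
    #
    #     class MySpider(Spider):
    #         name = "myspider"
    #         custom_settings = {"DOWNLOAD_DELAY": 1.0}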
    @classmethod
    def update_settings(cls, settings: BaseSettings) -> None:
        settings.setdict(cls.custom_settings or {}, priority="spider")
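
    # handles_request() delegates to url_is_from_spider(), which matches the
    # request URL against the spider's name and its allowed_domains (if set).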
    @classmethod
    def handles_request(cls, request: Request) -> bool:
        return url_is_from_spider(request.url, cls)
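
    # close() is the handler connected to spider_closed in _set_crawler(); it
    # forwards the call to an optional ``closed(reason)`` method and may return
    # a Deferred so shutdown waits for it.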
    @staticmethod
    def close(spider: Spider, reason: str) -> Deferred[None] | None:
        closed = getattr(spider, "closed", None)
        if callable(closed):
            return cast("Deferred[None] | None", closed(reason))
        return None

    def __repr__(self) -> str:
        return f"<{type(self).__name__} {self.name!r} at 0x{id(self):0x}>"


# Top-level imports
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import CSVFeedSpider, XMLFeedSpider
from scrapy.spiders.sitemap import SitemapSpider

__all__ = [
"CSVFeedSpider",
"CrawlSpider",
"Rule",
"SitemapSpider",
"Spider",
"XMLFeedSpider",
]