"""Download handlers for different schemes"""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, Protocol, cast
from twisted.internet import defer
from scrapy import Request, Spider, signals
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import build_from_crawler, load_object
from scrapy.utils.python import without_none_values
if TYPE_CHECKING:
from collections.abc import Callable, Generator
from twisted.internet.defer import Deferred
from scrapy.crawler import Crawler
from scrapy.http import Response
logger = logging.getLogger(__name__)


class DownloadHandlerProtocol(Protocol):
    def download_request(
        self, request: Request, spider: Spider
    ) -> Deferred[Response]: ...
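
# A concrete handler only needs to satisfy DownloadHandlerProtocol. A minimal,
# hypothetical sketch (class and names are illustrative, not part of Scrapy):
#
#     class DummyDownloadHandler:
#         lazy = False  # opt out of lazy instantiation (see _load_handler below)
#
#         @classmethod
#         def from_crawler(cls, crawler):
#             return cls()
#
#         def download_request(self, request, spider):
#             ...  # must return a Deferred that fires with a Response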


class DownloadHandlers:
    def __init__(self, crawler: Crawler):
        self._crawler: Crawler = crawler
        # acceptable schemes, registered at instantiation time
        self._schemes: dict[str, str | Callable[..., Any]] = {}
        # handlers already instantiated, keyed by scheme
        self._handlers: dict[str, DownloadHandlerProtocol] = {}
        # schemes whose handler failed to load, mapped to the failure reason
        self._notconfigured: dict[str, str] = {}
        handlers: dict[str, str | Callable[..., Any]] = without_none_values(
            cast(
                "dict[str, str | Callable[..., Any]]",
                crawler.settings.getwithbase("DOWNLOAD_HANDLERS"),
            )
        )
        for scheme, clspath in handlers.items():
            self._schemes[scheme] = clspath
            self._load_handler(scheme, skip_lazy=True)

        crawler.signals.connect(self._close, signals.engine_stopped)
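
    # ``getwithbase("DOWNLOAD_HANDLERS")`` merges the project setting with
    # DOWNLOAD_HANDLERS_BASE; a hypothetical project override might look like:
    #
    #     DOWNLOAD_HANDLERS = {
    #         "ftp": None,  # disable a scheme; dropped by without_none_values()
    #         "s3": "myproject.handlers.CustomS3Handler",  # illustrative path
    #     }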

    def _get_handler(self, scheme: str) -> DownloadHandlerProtocol | None:
        """Lazy-load the download handler for a scheme
        only on the first request for that scheme.
        """
        if scheme in self._handlers:
            return self._handlers[scheme]
        if scheme in self._notconfigured:
            return None
        if scheme not in self._schemes:
            self._notconfigured[scheme] = "no handler available for that scheme"
            return None
        return self._load_handler(scheme)

    def _load_handler(
        self, scheme: str, skip_lazy: bool = False
    ) -> DownloadHandlerProtocol | None:
        path = self._schemes[scheme]
        try:
            dhcls: type[DownloadHandlerProtocol] = load_object(path)
            # handlers are lazy by default; skip instantiation here unless the
            # class explicitly opts out with ``lazy = False``
            if skip_lazy and getattr(dhcls, "lazy", True):
                return None
            dh = build_from_crawler(
                dhcls,
                self._crawler,
            )
        except NotConfigured as ex:
            self._notconfigured[scheme] = str(ex)
            return None
        except Exception as ex:
            logger.error(
                'Failed to load "%(clspath)s" for scheme "%(scheme)s"',
                {"clspath": path, "scheme": scheme},
                exc_info=True,
                extra={"crawler": self._crawler},
            )
            self._notconfigured[scheme] = str(ex)
            return None
        self._handlers[scheme] = dh
        return dh
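
    # ``build_from_crawler()`` gives the handler class access to the crawler
    # (typically via a ``from_crawler()`` classmethod if the class defines one),
    # so handlers can read settings, signals and stats at construction time.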

    def download_request(self, request: Request, spider: Spider) -> Deferred[Response]:
        scheme = urlparse_cached(request).scheme
        handler = self._get_handler(scheme)
        if not handler:
            raise NotSupported(
                f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}"
            )
        return handler.download_request(request, spider)
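
    # A request whose scheme has no registered handler (e.g. "foo://example.com"
    # with no "foo" entry) fails above with:
    #     NotSupported: Unsupported URL scheme 'foo': no handler available for that scheme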

    @defer.inlineCallbacks
    def _close(self, *_a: Any, **_kw: Any) -> Generator[Deferred[Any], Any, None]:
        for dh in self._handlers.values():
            if hasattr(dh, "close"):
                yield dh.close()
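
    # Handlers may optionally expose a ``close()`` method (typically returning a
    # Deferred) to release resources; it is awaited here when the engine stops.
    # Hypothetical example for a handler holding a connection pool:
    #
    #     def close(self):
    #         return self._pool.closeCachedConnections()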