"""Download handlers for different schemes"""
import logging
from twisted.internet import defer
from scrapy import signals
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import without_none_values
# Module-level logger named after this module; used to report handler loading failures.
logger = logging.getLogger(__name__)
class DownloadHandlers:
    """Registry that maps URL schemes to download handler instances.

    Handler class paths are read from the ``DOWNLOAD_HANDLERS`` setting.
    A handler is instantiated either while this object is being built (when
    its class has a falsy ``lazy`` attribute) or on the first request that
    uses its scheme. Schemes whose handler could not be obtained are
    remembered together with the reason, so they are not retried.
    """

    def __init__(self, crawler):
        """Record the configured schemes and instantiate non-lazy handlers.

        :param crawler: object providing ``settings`` and ``signals``; it is
            also handed to every handler that gets instantiated.
        """
        self._crawler = crawler
        # scheme -> handler class path, as found in the settings
        self._schemes = {}
        # scheme -> handler instance that was created successfully
        self._handlers = {}
        # scheme -> text explaining why no handler is available
        self._notconfigured = {}

        configured = without_none_values(
            crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
        )
        for scheme, clspath in configured.items():
            # Register the path first: _load_handler looks it up by scheme.
            self._schemes[scheme] = clspath
            self._load_handler(scheme, skip_lazy=True)

        crawler.signals.connect(self._close, signals.engine_stopped)

    def _get_handler(self, scheme):
        """Return the handler for ``scheme``, instantiating it if needed.

        Returns ``None`` when the scheme has already failed, when no handler
        path is registered for it, or when loading fails now. In the
        "not registered" case the reason is stored in ``_notconfigured``.
        """
        try:
            return self._handlers[scheme]
        except KeyError:
            pass

        if scheme in self._notconfigured:
            return None

        if scheme in self._schemes:
            return self._load_handler(scheme)

        self._notconfigured[scheme] = 'no handler available for that scheme'
        return None

    def _load_handler(self, scheme, skip_lazy=False):
        """Load and instantiate the handler class registered for ``scheme``.

        :param scheme: key into ``_schemes``; must already be registered.
        :param skip_lazy: when true, do not instantiate a class whose
            ``lazy`` attribute is truthy (a missing attribute counts as
            ``True``) and return ``None`` without recording a failure.
        :return: the new handler instance, or ``None`` if it was skipped or
            could not be created.
        """
        clspath = self._schemes[scheme]
        try:
            handler_cls = load_object(clspath)
            if skip_lazy and getattr(handler_cls, 'lazy', True):
                return None
            instance = create_instance(
                objcls=handler_cls,
                settings=self._crawler.settings,
                crawler=self._crawler,
            )
        except NotConfigured as exc:
            # NotConfigured is recorded without logging.
            self._notconfigured[scheme] = str(exc)
            return None
        except Exception as exc:
            # Any other failure is logged with its traceback and then
            # recorded, so the scheme is reported as unsupported afterwards.
            log_args = {"clspath": clspath, "scheme": scheme}
            logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
                         log_args,
                         exc_info=True, extra={'crawler': self._crawler})
            self._notconfigured[scheme] = str(exc)
            return None

        self._handlers[scheme] = instance
        return instance

    def download_request(self, request, spider):
        """Pass ``request`` to the handler registered for its URL scheme.

        :return: whatever the handler's ``download_request`` returns.
        :raises NotSupported: if no usable handler exists for the scheme;
            the message includes the recorded reason.
        """
        scheme = urlparse_cached(request).scheme
        handler = self._get_handler(scheme)
        if handler:
            return handler.download_request(request, spider)
        raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")

    @defer.inlineCallbacks
    def _close(self, *_a, **_kw):
        """Call ``close()`` on every instantiated handler that defines it.

        Connected to the ``engine_stopped`` signal; the signal arguments are
        accepted and ignored. Each ``close()`` result is yielded in turn.
        """
        for handler in self._handlers.values():
            if not hasattr(handler, 'close'):
                continue
            yield handler.close()
|