1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
from __future__ import annotations
import traceback
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Protocol, cast
from zope.interface import implementer
from zope.interface.verify import verifyClass
from scrapy.interfaces import ISpiderLoader
from scrapy.utils.misc import load_object, walk_modules
from scrapy.utils.spider import iter_spider_classes
if TYPE_CHECKING:
from types import ModuleType
# typing.Self requires Python 3.11
from typing_extensions import Self
from scrapy import Request, Spider
from scrapy.settings import BaseSettings
def get_spider_loader(settings: BaseSettings) -> SpiderLoaderProtocol:
    """Build the spider loader selected by the ``SPIDER_LOADER_CLASS`` setting.

    The configured path is resolved with ``load_object``, the resulting class
    is checked against ``ISpiderLoader`` with ``verifyClass``, and the loader
    is then created through its ``from_settings`` hook, which receives a
    frozen copy of ``settings``.
    """
    loader_cls = load_object(settings.get("SPIDER_LOADER_CLASS"))
    verifyClass(ISpiderLoader, loader_cls)
    # Look the factory up first, then freeze the settings it will receive.
    build_loader = loader_cls.from_settings
    loader = build_loader(settings.frozencopy())
    return cast("SpiderLoaderProtocol", loader)
class SpiderLoaderProtocol(Protocol):
    """Structural interface describing the methods of a spider loader."""

    @classmethod
    def from_settings(cls, settings: BaseSettings) -> Self:
        """Create and return an instance of the class from ``settings``."""
        ...

    def load(self, spider_name: str) -> type[Spider]:
        """Return the Spider class registered under ``spider_name``.

        Implementations must raise a KeyError when the name is unknown.
        """
        ...

    def list(self) -> list[str]:
        """Return the names of every spider available in the project."""
        ...

    # ``list`` is shadowed by the method above inside this class body, so the
    # builtin is spelled out explicitly in the annotation below.
    def find_by_request(self, request: Request) -> __builtins__.list[str]:
        """Return the names of the spiders able to handle ``request``."""
        ...
@implementer(ISpiderLoader)
class SpiderLoader:
    """
    SpiderLoader is a class which locates and loads spiders
    in a Scrapy project.

    Every module listed in the ``SPIDER_MODULES`` setting is walked when the
    loader is constructed, and the spider classes found are indexed by their
    ``name`` attribute.
    """

    def __init__(self, settings: BaseSettings):
        """Read the loader settings and load all spiders right away.

        ``settings`` must provide ``SPIDER_MODULES`` (list of module paths)
        and ``SPIDER_LOADER_WARN_ONLY`` (bool).
        """
        self.spider_modules: list[str] = settings.getlist("SPIDER_MODULES")
        self.warn_only: bool = settings.getbool("SPIDER_LOADER_WARN_ONLY")
        # Spider name -> spider class; a later class with the same name
        # replaces an earlier one.
        self._spiders: dict[str, type[Spider]] = {}
        # Spider name -> every (module name, class name) pair declaring it,
        # kept so that name clashes can be reported.
        self._found: defaultdict[str, list[tuple[str, str]]] = defaultdict(list)
        self._load_all_spiders()

    def _check_name_duplicates(self) -> None:
        """Emit a ``UserWarning`` listing spider names declared more than once."""
        # The "more than one location" test depends only on the name, so it
        # filters once per name rather than once per location.
        dupes = [
            f" {cls} named {name!r} (in {mod})"
            for name, locations in self._found.items()
            if len(locations) > 1
            for mod, cls in locations
        ]

        if dupes:
            dupes_string = "\n\n".join(dupes)
            warnings.warn(
                "There are several spiders with the same name:\n\n"
                f"{dupes_string}\n\n This can cause unexpected behavior.",
                category=UserWarning,
            )

    def _load_spiders(self, module: ModuleType) -> None:
        """Register every spider class that ``iter_spider_classes`` yields for ``module``."""
        for spcls in iter_spider_classes(module):
            self._found[spcls.name].append((module.__name__, spcls.__name__))
            self._spiders[spcls.name] = spcls

    def _load_all_spiders(self) -> None:
        """Load spiders from every configured module, then check for name clashes.

        An ``ImportError`` or ``SyntaxError`` raised while walking a module is
        turned into a ``RuntimeWarning`` when ``warn_only`` is set; otherwise
        it propagates to the caller.
        """
        for name in self.spider_modules:
            try:
                for module in walk_modules(name):
                    self._load_spiders(module)
            except (ImportError, SyntaxError):
                if self.warn_only:
                    warnings.warn(
                        f"\n{traceback.format_exc()}Could not load spiders "
                        f"from module '{name}'. "
                        "See above traceback for details.",
                        category=RuntimeWarning,
                    )
                else:
                    raise
        self._check_name_duplicates()

    @classmethod
    def from_settings(cls, settings: BaseSettings) -> Self:
        """Return a loader instance built from ``settings``."""
        return cls(settings)

    def load(self, spider_name: str) -> type[Spider]:
        """
        Return the Spider class for the given spider name. If the spider
        name is not found, raise a KeyError.
        """
        try:
            return self._spiders[spider_name]
        except KeyError:
            # ``from None`` hides the internal dict lookup failure, so the
            # traceback shows only the descriptive error below.
            raise KeyError(f"Spider not found: {spider_name}") from None

    def find_by_request(self, request: Request) -> list[str]:
        """
        Return the list of spider names that can handle the given request.
        """
        return [
            name for name, cls in self._spiders.items() if cls.handles_request(request)
        ]

    def list(self) -> list[str]:
        """
        Return a list with the names of all spiders available in the project.
        """
        # Inside the method body ``list`` is the builtin, not this method.
        return list(self._spiders)
@implementer(ISpiderLoader)
class DummySpiderLoader:
    """Spider loader stand-in that never loads or reports any spider."""

    @classmethod
    def from_settings(cls, settings: BaseSettings) -> Self:
        """Return a new instance; ``settings`` is accepted but not used."""
        return cls()

    def load(self, spider_name: str) -> type[Spider]:
        """Always raise a KeyError, whatever ``spider_name`` is."""
        message = "DummySpiderLoader doesn't load any spiders"
        raise KeyError(message)

    def list(self) -> list[str]:
        """Return an empty list of spider names."""
        no_names: list[str] = []
        return no_names

    def find_by_request(self, request: Request) -> __builtins__.list[str]:
        """Return an empty list, whatever ``request`` is."""
        no_matches: list[str] = []
        return no_matches
|