"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting.
"""
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

from twisted.internet.defer import Deferred, maybeDeferred

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object

if TYPE_CHECKING:
    from twisted.python.failure import Failure

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy import Spider
    from scrapy.crawler import Crawler
    from scrapy.robotstxt import RobotParser

logger = logging.getLogger(__name__)


class RobotsTxtMiddleware:
    # Priority assigned to the internal robots.txt download requests.
    DOWNLOAD_PRIORITY: int = 1000

    def __init__(self, crawler: Crawler):
        if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent: str = crawler.settings.get("USER_AGENT", "Scrapy")
        self._robotstxt_useragent: str | None = crawler.settings.get(
            "ROBOTSTXT_USER_AGENT", None
        )
        self.crawler: Crawler = crawler
        # Per-netloc cache: a RobotParser once fetched, a Deferred while the
        # robots.txt download is in flight, or None if it could not be fetched.
        self._parsers: dict[
            str, RobotParser | Deferred[RobotParser | None] | None
        ] = {}
        self._parserimpl: RobotParser = load_object(
            crawler.settings.get("ROBOTSTXT_PARSER")
        )

        # Check that the parser dependencies are met; this should raise an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls(crawler)

    def process_request(
        self, request: Request, spider: Spider
    ) -> Deferred[None] | None:
        # Requests that explicitly opt out (including our own robots.txt
        # requests) and non-HTTP schemes are passed through untouched.
        if request.meta.get("dont_obey_robotstxt"):
            return None
        if request.url.startswith("data:") or request.url.startswith("file:"):
            return None
        d: Deferred[RobotParser | None] = maybeDeferred(
            self.robot_parser,
            request,
            spider,  # type: ignore[call-overload]
        )
        d2: Deferred[None] = d.addCallback(self.process_request_2, request, spider)
        return d2

    def process_request_2(
        self, rp: RobotParser | None, request: Request, spider: Spider
    ) -> None:
        # rp is None when robots.txt could not be fetched; allow the request then.
        if rp is None:
            return

        useragent: str | bytes | None = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b"User-Agent", self._default_useragent)
            assert useragent is not None
        if not rp.allowed(request.url, useragent):
            logger.debug(
                "Forbidden by robots.txt: %(request)s",
                {"request": request},
                extra={"spider": spider},
            )
            assert self.crawler.stats
            self.crawler.stats.inc_value("robotstxt/forbidden")
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(
        self, request: Request, spider: Spider
    ) -> RobotParser | Deferred[RobotParser | None] | None:
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            # First request for this netloc: store a placeholder Deferred and
            # schedule the robots.txt download.
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={"dont_obey_robotstxt": True},
                callback=NO_CALLBACK,
            )
            assert self.crawler.engine
            assert self.crawler.stats
            dfd = self.crawler.engine.download(robotsreq)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")

        parser = self._parsers[netloc]

        if isinstance(parser, Deferred):
            # robots.txt is still being downloaded: return a fresh Deferred
            # chained to the placeholder so each waiting request gets the parser.
            d: Deferred[RobotParser | None] = Deferred()

            def cb(result: RobotParser | None) -> RobotParser | None:
                d.callback(result)
                return result

            parser.addCallback(cb)
            return d
        return parser

    def _logerror(self, failure: Failure, request: Request, spider: Spider) -> Failure:
        if failure.type is not IgnoreRequest:
            logger.error(
                "Error downloading %(request)s: %(f_exception)s",
                {"request": request, "f_exception": failure.value},
                exc_info=failure_to_exc_info(failure),
                extra={"spider": spider},
            )
        return failure

    def _parse_robots(self, response: Response, netloc: str, spider: Spider) -> None:
        assert self.crawler.stats
        self.crawler.stats.inc_value("robotstxt/response_count")
        self.crawler.stats.inc_value(
            f"robotstxt/response_status_count/{response.status}"
        )
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        assert isinstance(rp_dfd, Deferred)
        # Replace the placeholder Deferred with the parsed robots.txt and fire it.
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure: Failure, netloc: str) -> None:
        if failure.type is not IgnoreRequest:
            key = f"robotstxt/exception_count/{failure.type}"
            assert self.crawler.stats
            self.crawler.stats.inc_value(key)
        # Treat a missing or unreachable robots.txt as "no restrictions": cache
        # None for this netloc and release any requests waiting on the Deferred.
        rp_dfd = self._parsers[netloc]
        assert isinstance(rp_dfd, Deferred)
        self._parsers[netloc] = None
        rp_dfd.callback(None)