"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting.
"""

import robotparser

from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals, log
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached


class RobotsTxtMiddleware(object):
    # High priority so robots.txt is fetched ahead of regular requests.
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured
        self.crawler = crawler
        self._parsers = {}
        self._spider_netlocs = {}
        self._useragents = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # Drop the request if the site's robots.txt forbids it for this
        # spider's user agent.
        useragent = self._useragents[spider]
        rp = self.robot_parser(request, spider)
        if rp and not rp.can_fetch(useragent, request.url):
            log.msg("Forbidden by robots.txt: %s" % request, log.DEBUG)
            raise IgnoreRequest

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc
        if netloc not in self._parsers:
            # No parser for this netloc yet: schedule a download of its
            # robots.txt and leave a None placeholder, so requests issued
            # before the file arrives are not filtered.
            self._parsers[netloc] = None
            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
            robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots)
            self._spider_netlocs[spider].add(netloc)
        return self._parsers[netloc]

    def _parse_robots(self, response):
        rp = robotparser.RobotFileParser(response.url)
        rp.parse(response.body.splitlines())
        self._parsers[urlparse_cached(response).netloc] = rp

    def spider_opened(self, spider):
        self._spider_netlocs[spider] = set()
        self._useragents[spider] = spider.settings['USER_AGENT']

    def spider_closed(self, spider):
        # Drop the cached parsers for every netloc this spider visited.
        for netloc in self._spider_netlocs[spider]:
            del self._parsers[netloc]
        del self._spider_netlocs[spider]
        del self._useragents[spider]