1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
|
"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting.
"""
import logging
from six.moves.urllib import robotparser
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
logger = logging.getLogger(__name__)
class RobotsTxtMiddleware(object):
DOWNLOAD_PRIORITY = 1000
def __init__(self, crawler):
if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
raise NotConfigured
self.crawler = crawler
self._useragent = crawler.settings.get('USER_AGENT')
self._parsers = {}
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def process_request(self, request, spider):
if request.meta.get('dont_obey_robotstxt'):
return
rp = self.robot_parser(request, spider)
if rp and not rp.can_fetch(self._useragent, request.url):
logger.debug("Forbidden by robots.txt: %(request)s",
{'request': request}, extra={'spider': spider})
raise IgnoreRequest
def robot_parser(self, request, spider):
url = urlparse_cached(request)
netloc = url.netloc
if netloc not in self._parsers:
self._parsers[netloc] = None
robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
robotsreq = Request(
robotsurl,
priority=self.DOWNLOAD_PRIORITY,
meta={'dont_obey_robotstxt': True}
)
dfd = self.crawler.engine.download(robotsreq, spider)
dfd.addCallback(self._parse_robots)
dfd.addErrback(self._logerror, robotsreq, spider)
return self._parsers[netloc]
def _logerror(self, failure, request, spider):
if failure.type is not IgnoreRequest:
logger.error("Error downloading %(request)s: %(f_exception)s",
{'request': request, 'f_exception': failure.value},
exc_info=failure_to_exc_info(failure),
extra={'spider': spider})
def _parse_robots(self, response):
rp = robotparser.RobotFileParser(response.url)
rp.parse(response.body.splitlines())
self._parsers[urlparse_cached(response).netloc] = rp
|