import sys
import logging
from abc import ABCMeta, abstractmethod

from scrapy.utils.python import to_unicode

logger = logging.getLogger(__name__)

def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
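    """Decode a robots.txt body (bytes) to text.

    If the body cannot be decoded (garbage, or an encoding other than UTF-8),
    log a warning and return an empty string, which amounts to an
    "allow all" policy.
    """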
    try:
        if to_native_str_type:
            robotstxt_body = to_unicode(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8,
        # disregard it. Switch to the 'allow all' state.
        logger.warning(
            "Failure while parsing robots.txt. File either contains garbage or "
            "is in an encoding other than UTF-8, treating it as an empty file.",
            exc_info=sys.exc_info(),
            extra={'spider': spider},
        )
        robotstxt_body = ''
    return robotstxt_body

class RobotParser(metaclass=ABCMeta):
    @classmethod
    @abstractmethod
    def from_crawler(cls, crawler, robotstxt_body):
        """Parse the content of a robots.txt_ file as bytes. This must be a
        class method. It must return a new instance of the parser backend.

        :param crawler: crawler which made the request
        :type crawler: :class:`~scrapy.crawler.Crawler` instance

        :param robotstxt_body: content of a robots.txt_ file.
        :type robotstxt_body: bytes
        """
        pass

    @abstractmethod
    def allowed(self, url, user_agent):
        """Return ``True`` if ``user_agent`` is allowed to crawl ``url``,
        otherwise return ``False``.

        :param url: Absolute URL
        :type url: str

        :param user_agent: User agent
        :type user_agent: str
        """
        pass

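# The concrete backends below implement this interface on top of different
# robots.txt parsing libraries (the standard library's urllib.robotparser,
# reppy, robotexclusionrulesparser and protego). A minimal usage sketch
# (illustrative only; ``crawler`` may be ``None``, in which case no spider is
# attached to the log messages):
#
#     parser = PythonRobotParser.from_crawler(None, b"User-agent: *\nDisallow: /admin")
#     parser.allowed("https://example.com/admin", "mybot")       # -> False
#     parser.allowed("https://example.com/index.html", "mybot")  # -> True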
class PythonRobotParser(RobotParser):
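    """Robots.txt parser backend based on the standard library's
    :mod:`urllib.robotparser`.
    """
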
    def __init__(self, robotstxt_body, spider):
        from urllib.robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(user_agent, url)

class ReppyRobotParser(RobotParser):
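    """Robots.txt parser backend based on the ``reppy`` library."""
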
    def __init__(self, robotstxt_body, spider):
        from reppy.robots import Robots
        self.spider = spider
        self.rp = Robots.parse('', robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        return self.rp.allowed(url, user_agent)

class RerpRobotParser(RobotParser):
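    """Robots.txt parser backend based on the ``robotexclusionrulesparser``
    library.
    """
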
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)

class ProtegoRobotParser(RobotParser):
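    """Robots.txt parser backend based on the ``protego`` library (the default
    robots.txt parser in recent Scrapy versions).
    """
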
    def __init__(self, robotstxt_body, spider):
        from protego import Protego
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp = Protego.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(url, user_agent)
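

# These backends are normally selected through Scrapy's ROBOTSTXT_PARSER
# setting (an import path string) and used by the robots.txt downloader
# middleware when ROBOTSTXT_OBEY is enabled, e.g. in settings.py:
#
#     ROBOTSTXT_OBEY = True
#     ROBOTSTXT_PARSER = "scrapy.robotstxt.ProtegoRobotParser"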