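"""Tests for the robots.txt parser interface exposed by scrapy.robotstxt.

The shared assertions in BaseRobotParserTest run against each available
backend (PythonRobotParser, RerpRobotParser, ProtegoRobotParser); tests that
a given backend cannot satisfy are skipped in its concrete subclass.
"""
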
import pytest
from scrapy.robotstxt import decode_robotstxt


def rerp_available():
    # check if robotexclusionrulesparser is installed
    try:
        from robotexclusionrulesparser import RobotExclusionRulesParser  # noqa: F401
    except ImportError:
        return False
    return True


def protego_available():
    # check if protego parser is installed
    try:
        from protego import Protego  # noqa: F401
    except ImportError:
        return False
    return True
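

# The probes above import the optional backend and treat ImportError as "not
# installed". An equivalent check could rely on importlib instead of a
# try/except import (a minimal sketch; this suite itself uses the probes
# above):
#
#   import importlib.util
#
#   def _backend_available(module_name: str) -> bool:
#       # find_spec() returns None when the module cannot be located
#       return importlib.util.find_spec(module_name) is not None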


class BaseRobotParserTest:
    def _setUp(self, parser_cls):
        self.parser_cls = parser_cls

    def test_allowed(self):
        robotstxt_body = (
            b"User-agent: * \nDisallow: /disallowed \nAllow: /allowed \nCrawl-delay: 10"
        )
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://www.site.local/allowed", "*")
        assert not rp.allowed("https://www.site.local/disallowed", "*")

    def test_allowed_wildcards(self):
        robotstxt_body = b"""User-agent: first
Disallow: /disallowed/*/end$

User-agent: second
Allow: /*allowed
Disallow: /
"""
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://www.site.local/disallowed", "first")
        assert not rp.allowed("https://www.site.local/disallowed/xyz/end", "first")
        assert not rp.allowed("https://www.site.local/disallowed/abc/end", "first")
        assert rp.allowed("https://www.site.local/disallowed/xyz/endinglater", "first")
        assert rp.allowed("https://www.site.local/allowed", "second")
        assert rp.allowed("https://www.site.local/is_still_allowed", "second")
        assert rp.allowed("https://www.site.local/is_allowed_too", "second")

    def test_length_based_precedence(self):
        robotstxt_body = b"User-agent: * \nDisallow: / \nAllow: /page"
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://www.site.local/page", "*")

    def test_order_based_precedence(self):
        robotstxt_body = b"User-agent: * \nDisallow: / \nAllow: /page"
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert not rp.allowed("https://www.site.local/page", "*")
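
    # Worked example for the two precedence tests above, given "Disallow: /"
    # plus "Allow: /page" and the URL path "/page":
    #   - length-based precedence: the longer pattern "/page" wins -> allowed
    #   - order-based precedence: "Disallow: /" comes first -> disallowed
    # Which model applies is backend-specific; the concrete test classes below
    # skip whichever precedence test their parser does not implement.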

    def test_empty_response(self):
        """empty response should equal 'allow all'"""
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=b"")
        assert rp.allowed("https://site.local/", "*")
        assert rp.allowed("https://site.local/", "chrome")
        assert rp.allowed("https://site.local/index.html", "*")
        assert rp.allowed("https://site.local/disallowed", "*")

    def test_garbage_response(self):
        """garbage response should be discarded, equal 'allow all'"""
        robotstxt_body = b"GIF89a\xd3\x00\xfe\x00\xa2"
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://site.local/", "*")
        assert rp.allowed("https://site.local/", "chrome")
        assert rp.allowed("https://site.local/index.html", "*")
        assert rp.allowed("https://site.local/disallowed", "*")

    def test_unicode_url_and_useragent(self):
        robotstxt_body = """
User-Agent: *
Disallow: /admin/
Disallow: /static/
# taken from https://en.wikipedia.org/robots.txt
Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
Disallow: /wiki/Käyttäjä:

User-Agent: UnicödeBöt
Disallow: /some/random/page.html""".encode()
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://site.local/", "*")
        assert not rp.allowed("https://site.local/admin/", "*")
        assert not rp.allowed("https://site.local/static/", "*")
        assert rp.allowed("https://site.local/admin/", "UnicödeBöt")
        assert not rp.allowed("https://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:", "*")
        assert not rp.allowed("https://site.local/wiki/Käyttäjä:", "*")
        assert rp.allowed("https://site.local/some/random/page.html", "*")
        assert not rp.allowed("https://site.local/some/random/page.html", "UnicödeBöt")


class TestDecodeRobotsTxt:
    def test_native_string_conversion(self):
        robotstxt_body = b"User-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(
            robotstxt_body, spider=None, to_native_str_type=True
        )
        assert decoded_content == "User-agent: *\nDisallow: /\n"

    def test_decode_utf8(self):
        robotstxt_body = b"User-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        assert decoded_content == "User-agent: *\nDisallow: /\n"

    def test_decode_non_utf8(self):
        robotstxt_body = b"User-agent: *\n\xffDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        assert decoded_content == "User-agent: *\nDisallow: /\n"
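
    # A note on test_decode_non_utf8 above: decode_robotstxt() decodes
    # leniently, so the undecodable byte (b"\xff") is dropped rather than
    # raising. The spider argument appears to be used only for logging
    # context, which is why spider=None suffices in these tests.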


class TestPythonRobotParser(BaseRobotParserTest):
    def setup_method(self):
        from scrapy.robotstxt import PythonRobotParser

        super()._setUp(PythonRobotParser)

    def test_length_based_precedence(self):
        pytest.skip(
            "RobotFileParser does not support length based directives precedence."
        )

    def test_allowed_wildcards(self):
        pytest.skip("RobotFileParser does not support wildcards.")


@pytest.mark.skipif(not rerp_available(), reason="Rerp parser is not installed")
class TestRerpRobotParser(BaseRobotParserTest):
    def setup_method(self):
        from scrapy.robotstxt import RerpRobotParser

        super()._setUp(RerpRobotParser)

    def test_length_based_precedence(self):
        pytest.skip("Rerp does not support length based directives precedence.")


@pytest.mark.skipif(not protego_available(), reason="Protego parser is not installed")
class TestProtegoRobotParser(BaseRobotParserTest):
    def setup_method(self):
        from scrapy.robotstxt import ProtegoRobotParser

        super()._setUp(ProtegoRobotParser)

    def test_order_based_precedence(self):
        pytest.skip("Protego does not support order based directives precedence.")