File: test_robotstxt_interface.py (python-scrapy 2.14.0-1)

import pytest

from scrapy.robotstxt import (
    ProtegoRobotParser,
    PythonRobotParser,
    RerpRobotParser,
    decode_robotstxt,
)


def rerp_available():
    """Return True if the optional robotexclusionrulesparser package is installed."""
    try:
        from robotexclusionrulesparser import (  # noqa: PLC0415
            RobotExclusionRulesParser,  # noqa: F401
        )
    except ImportError:
        return False
    return True


class BaseRobotParserTest:
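    """Shared tests for the robots.txt parser interface (``from_crawler()`` and
    ``allowed()``); subclasses select the implementation by calling ``_setUp()``
    from their ``setup_method()``."""
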
    def _setUp(self, parser_cls):
        self.parser_cls = parser_cls

    def test_allowed(self):
        robotstxt_body = (
            b"User-agent: * \nDisallow: /disallowed \nAllow: /allowed \nCrawl-delay: 10"
        )
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://www.site.local/allowed", "*")
        assert not rp.allowed("https://www.site.local/disallowed", "*")

    def test_allowed_wildcards(self):
        robotstxt_body = b"""User-agent: first
                                Disallow: /disallowed/*/end$

                                User-agent: second
                                Allow: /*allowed
                                Disallow: /
                                """
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)

        assert rp.allowed("https://www.site.local/disallowed", "first")
        assert not rp.allowed("https://www.site.local/disallowed/xyz/end", "first")
        assert not rp.allowed("https://www.site.local/disallowed/abc/end", "first")
        assert rp.allowed("https://www.site.local/disallowed/xyz/endinglater", "first")

        assert rp.allowed("https://www.site.local/allowed", "second")
        assert rp.allowed("https://www.site.local/is_still_allowed", "second")
        assert rp.allowed("https://www.site.local/is_allowed_too", "second")

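    # The next two tests use the same robots.txt body but expect opposite
    # results: with length-based precedence (e.g. Protego) the longer
    # "Allow: /page" rule wins, while with order-based precedence (e.g.
    # RobotFileParser) the first matching rule, "Disallow: /", wins. Parser
    # classes that support only one model skip the other test below.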
    def test_length_based_precedence(self):
        robotstxt_body = b"User-agent: * \nDisallow: / \nAllow: /page"
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://www.site.local/page", "*")

    def test_order_based_precedence(self):
        robotstxt_body = b"User-agent: * \nDisallow: / \nAllow: /page"
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert not rp.allowed("https://www.site.local/page", "*")

    def test_empty_response(self):
        """An empty robots.txt body must be treated as 'allow all'."""
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=b"")
        assert rp.allowed("https://site.local/", "*")
        assert rp.allowed("https://site.local/", "chrome")
        assert rp.allowed("https://site.local/index.html", "*")
        assert rp.allowed("https://site.local/disallowed", "*")

    def test_garbage_response(self):
        """A garbage (non-robots.txt) response must be treated as 'allow all'."""
        robotstxt_body = b"GIF89a\xd3\x00\xfe\x00\xa2"
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://site.local/", "*")
        assert rp.allowed("https://site.local/", "chrome")
        assert rp.allowed("https://site.local/index.html", "*")
        assert rp.allowed("https://site.local/disallowed", "*")

    def test_unicode_url_and_useragent(self):
        robotstxt_body = """
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/
        # taken from https://en.wikipedia.org/robots.txt
        Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
        Disallow: /wiki/Käyttäjä:

        User-Agent: UnicödeBöt
        Disallow: /some/random/page.html""".encode()
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=robotstxt_body)
        assert rp.allowed("https://site.local/", "*")
        assert not rp.allowed("https://site.local/admin/", "*")
        assert not rp.allowed("https://site.local/static/", "*")
        assert rp.allowed("https://site.local/admin/", "UnicödeBöt")
        assert not rp.allowed("https://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:", "*")
        assert not rp.allowed("https://site.local/wiki/Käyttäjä:", "*")
        assert rp.allowed("https://site.local/some/random/page.html", "*")
        assert not rp.allowed("https://site.local/some/random/page.html", "UnicödeBöt")

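
# Illustrative sketch, not part of the upstream test suite: the interface
# exercised by BaseRobotParserTest is the one used by Scrapy's
# RobotsTxtMiddleware. A parser is built from the raw robots.txt bytes with
# ``from_crawler()`` and then queried per URL and user agent with ``allowed()``.
def _example_interface_usage():
    body = b"User-agent: *\nDisallow: /private\n"
    # crawler=None is enough here; these parsers only need the body.
    rp = ProtegoRobotParser.from_crawler(crawler=None, robotstxt_body=body)
    assert rp.allowed("https://example.com/public", "*")
    assert not rp.allowed("https://example.com/private", "*")
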

class TestDecodeRobotsTxt:
    def test_native_string_conversion(self):
        robotstxt_body = b"User-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(
            robotstxt_body, spider=None, to_native_str_type=True
        )
        assert decoded_content == "User-agent: *\nDisallow: /\n"

    def test_decode_utf8(self):
        robotstxt_body = b"User-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        assert decoded_content == "User-agent: *\nDisallow: /\n"

    def test_decode_non_utf8(self):
        robotstxt_body = b"User-agent: *\n\xffDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        assert decoded_content == "User-agent: *\nDisallow: /\n"

    # A UTF-8 BOM at the beginning of the body must be ignored.
    def test_decode_utf8_bom(self):
        robotstxt_body = b"\xef\xbb\xbfUser-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        assert decoded_content == "User-agent: *\nDisallow: /\n"

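
# Illustrative sketch, not part of the upstream test suite: decode_robotstxt()
# normalizes raw bytes (lossy UTF-8 decoding, BOM stripping) into text that any
# line-oriented parser can consume, e.g. the standard library's RobotFileParser,
# which is what PythonRobotParser wraps.
def _example_decode_and_parse():
    from urllib.robotparser import RobotFileParser  # noqa: PLC0415

    raw = b"\xef\xbb\xbfUser-agent: *\nDisallow: /private\n"
    decoded = decode_robotstxt(raw, spider=None)
    rfp = RobotFileParser()
    rfp.parse(decoded.splitlines())
    assert not rfp.can_fetch("*", "https://example.com/private")
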

class TestPythonRobotParser(BaseRobotParserTest):
    def setup_method(self):
        super()._setUp(PythonRobotParser)

    def test_length_based_precedence(self):
        pytest.skip(
            "RobotFileParser does not support length based directives precedence."
        )

    def test_allowed_wildcards(self):
        pytest.skip("RobotFileParser does not support wildcards.")


@pytest.mark.skipif(not rerp_available(), reason="Rerp parser is not installed")
class TestRerpRobotParser(BaseRobotParserTest):
    def setup_method(self):
        super()._setUp(RerpRobotParser)

    def test_length_based_precedence(self):
        pytest.skip("Rerp does not support length based directives precedence.")


class TestProtegoRobotParser(BaseRobotParserTest):
    def setup_method(self):
        super()._setUp(ProtegoRobotParser)

    def test_order_based_precedence(self):
        pytest.skip("Protego does not support order based directives precedence.")
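

# Note (illustrative; assumes a regular Scrapy project): the parser exercised
# by these tests is selected through the ROBOTSTXT_PARSER setting, with
# ProtegoRobotParser being the default, e.g. in a project's settings.py:
#
#     ROBOTSTXT_OBEY = True
#     ROBOTSTXT_PARSER = "scrapy.robotstxt.PythonRobotParser"
def _example_parser_selection():
    from scrapy.settings import Settings  # noqa: PLC0415
    from scrapy.utils.misc import load_object  # noqa: PLC0415

    settings = Settings({"ROBOTSTXT_PARSER": "scrapy.robotstxt.ProtegoRobotParser"})
    parser_cls = load_object(settings["ROBOTSTXT_PARSER"])
    assert parser_cls is ProtegoRobotParser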