File: test_robotstxt_interface.py

package info: python-scrapy 2.13.3-1
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (169 lines, 6,664 bytes):
import pytest

from scrapy.robotstxt import decode_robotstxt


def rerp_available():
    # check if robotexclusionrulesparser is installed
    try:
        from robotexclusionrulesparser import RobotExclusionRulesParser  # noqa: F401
    except ImportError:
        return False
    return True


def protego_available():
    # check if protego parser is installed
    try:
        from protego import Protego  # noqa: F401
    except ImportError:
        return False
    return True

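# For context, the optional third-party libraries probed above each expose their
# own API; the scrapy.robotstxt wrapper classes adapt them to a single interface.
# A minimal sketch of direct Protego usage (based on Protego's documented
# Protego.parse()/can_fetch() API; not part of the test suite):
#
#     from protego import Protego
#
#     rp = Protego.parse("User-agent: *\nDisallow: /disallowed\n")
#     rp.can_fetch("https://example.com/allowed", "*")     # True
#     rp.can_fetch("https://example.com/disallowed", "*")  # False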

class BaseRobotParserTest:
    def _setUp(self, parser_cls):
        self.parser_cls = parser_cls

    def test_allowed(self):
        robotstxt_body = (
            b"User-agent: * \nDisallow: /disallowed \nAllow: /allowed \nCrawl-delay: 10"
        )
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        assert rp.allowed("https://www.site.local/allowed", "*")
        assert not rp.allowed("https://www.site.local/disallowed", "*")

    def test_allowed_wildcards(self):
        robotstxt_body = b"""User-agent: first
                                Disallow: /disallowed/*/end$

                                User-agent: second
                                Allow: /*allowed
                                Disallow: /
                                """
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )

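        # In the rules above, "*" matches any sequence of characters and a
        # trailing "$" anchors the match at the end of the URL, so
        # "/disallowed/*/end$" blocks ".../xyz/end" but not ".../xyz/endinglater".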
        assert rp.allowed("https://www.site.local/disallowed", "first")
        assert not rp.allowed("https://www.site.local/disallowed/xyz/end", "first")
        assert not rp.allowed("https://www.site.local/disallowed/abc/end", "first")
        assert rp.allowed("https://www.site.local/disallowed/xyz/endinglater", "first")

        assert rp.allowed("https://www.site.local/allowed", "second")
        assert rp.allowed("https://www.site.local/is_still_allowed", "second")
        assert rp.allowed("https://www.site.local/is_allowed_too", "second")

    def test_length_based_precedence(self):
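        """With length-based precedence the most specific (longest) matching
        rule wins, so "Allow: /page" overrides the shorter "Disallow: /"."""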
        robotstxt_body = b"User-agent: * \nDisallow: / \nAllow: /page"
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        assert rp.allowed("https://www.site.local/page", "*")

    def test_order_based_precedence(self):
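        """With order-based precedence the first matching rule wins, so the
        earlier "Disallow: /" takes effect despite the later "Allow: /page"."""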
        robotstxt_body = b"User-agent: * \nDisallow: / \nAllow: /page"
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        assert not rp.allowed("https://www.site.local/page", "*")

    def test_empty_response(self):
        """empty response should equal 'allow all'"""
        rp = self.parser_cls.from_crawler(crawler=None, robotstxt_body=b"")
        assert rp.allowed("https://site.local/", "*")
        assert rp.allowed("https://site.local/", "chrome")
        assert rp.allowed("https://site.local/index.html", "*")
        assert rp.allowed("https://site.local/disallowed", "*")

    def test_garbage_response(self):
        """garbage response should be discarded, equal 'allow all'"""
        robotstxt_robotstxt_body = b"GIF89a\xd3\x00\xfe\x00\xa2"
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_robotstxt_body
        )
        assert rp.allowed("https://site.local/", "*")
        assert rp.allowed("https://site.local/", "chrome")
        assert rp.allowed("https://site.local/index.html", "*")
        assert rp.allowed("https://site.local/disallowed", "*")

    def test_unicode_url_and_useragent(self):
        robotstxt_body = """
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/
        # taken from https://en.wikipedia.org/robots.txt
        Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
        Disallow: /wiki/Käyttäjä:

        User-Agent: UnicödeBöt
        Disallow: /some/random/page.html""".encode()
        rp = self.parser_cls.from_crawler(
            crawler=None, robotstxt_body=robotstxt_body
        )
        assert rp.allowed("https://site.local/", "*")
        assert not rp.allowed("https://site.local/admin/", "*")
        assert not rp.allowed("https://site.local/static/", "*")
        assert rp.allowed("https://site.local/admin/", "UnicödeBöt")
        assert not rp.allowed("https://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:", "*")
        assert not rp.allowed("https://site.local/wiki/Käyttäjä:", "*")
        assert rp.allowed("https://site.local/some/random/page.html", "*")
        assert not rp.allowed("https://site.local/some/random/page.html", "UnicödeBöt")

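# All parser classes exercised by BaseRobotParserTest share the same small
# interface: a from_crawler() factory that takes the raw robots.txt bytes and an
# allowed(url, user_agent) predicate. A usage sketch outside the test harness
# (the ROBOTSTXT_PARSER setting name reflects Scrapy's documented configuration;
# treat the exact value as an assumption):
#
#     from scrapy.robotstxt import ProtegoRobotParser
#
#     rp = ProtegoRobotParser.from_crawler(
#         crawler=None, robotstxt_body=b"User-agent: *\nDisallow: /private\n"
#     )
#     rp.allowed("https://example.com/private", "mybot")  # False
#
# In a project, the parser implementation is selected with the ROBOTSTXT_PARSER
# setting, e.g. ROBOTSTXT_PARSER = "scrapy.robotstxt.ProtegoRobotParser".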

class TestDecodeRobotsTxt:
    def test_native_string_conversion(self):
        robotstxt_body = b"User-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(
            robotstxt_body, spider=None, to_native_str_type=True
        )
        assert decoded_content == "User-agent: *\nDisallow: /\n"

    def test_decode_utf8(self):
        robotstxt_body = b"User-agent: *\nDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        assert decoded_content == "User-agent: *\nDisallow: /\n"

    def test_decode_non_utf8(self):
        robotstxt_body = b"User-agent: *\n\xffDisallow: /\n"
        decoded_content = decode_robotstxt(robotstxt_body, spider=None)
        assert decoded_content == "User-agent: *\nDisallow: /\n"
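    # The non-UTF-8 case above shows that the undecodable byte is simply
    # dropped: the observable result is the same as a lenient decode along the
    # lines of robotstxt_body.decode("utf-8", errors="ignore") (a description
    # of the expected output, not a claim about decode_robotstxt's internals).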


class TestPythonRobotParser(BaseRobotParserTest):
    def setup_method(self):
        from scrapy.robotstxt import PythonRobotParser

        super()._setUp(PythonRobotParser)

    def test_length_based_precedence(self):
        pytest.skip(
            "RobotFileParser does not support length based directives precedence."
        )

    def test_allowed_wildcards(self):
        pytest.skip("RobotFileParser does not support wildcards.")

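# For comparison, PythonRobotParser wraps the standard library parser, which
# implements the original order-based, wildcard-free robots.txt matching; that
# is why the wildcard and length-precedence tests are skipped above. A minimal
# stdlib-only sketch (not part of the test suite):
#
#     from urllib.robotparser import RobotFileParser
#
#     rfp = RobotFileParser()
#     rfp.parse("User-agent: *\nDisallow: /disallowed\n".splitlines())
#     rfp.can_fetch("*", "https://example.com/disallowed")  # False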

@pytest.mark.skipif(not rerp_available(), reason="Rerp parser is not installed")
class TestRerpRobotParser(BaseRobotParserTest):
    def setup_method(self):
        from scrapy.robotstxt import RerpRobotParser

        super()._setUp(RerpRobotParser)

    def test_length_based_precedence(self):
        pytest.skip("Rerp does not support length based directives precedence.")


@pytest.mark.skipif(not protego_available(), reason="Protego parser is not installed")
class TestProtegoRobotParser(BaseRobotParserTest):
    def setup_method(self):
        from scrapy.robotstxt import ProtegoRobotParser

        super()._setUp(ProtegoRobotParser)

    def test_order_based_precedence(self):
        pytest.skip("Protego does not support order based directives precedence.")