File: test_on_google_spec.py

package info (click to toggle)
python-protego 0.5.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 30,052 kB
  • sloc: python: 1,579; perl: 190; cpp: 33; sh: 4; makefile: 3
file content (119 lines) | stat: -rw-r--r-- 3,997 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from __future__ import annotations

import pytest

from protego import Protego


@pytest.mark.parametrize(
    "path,user_agent",
    [
        ("/group1", "hedwig-news"),
        ("/group1", "hedwig-news2"),
        ("/group2", "hedwi"),
        ("/group2", "a"),
        ("/group3", "hedwig-new"),
        ("/group3", "hedwig-images"),
    ],
)
def test_user_agent_precedence(path, user_agent):
    """Check which rule group the parser applies to a given user agent.

    Each group in the robots.txt below disallows ``/`` and allows exactly
    one ``/groupN`` path, so the first candidate path that ``user_agent``
    may fetch identifies the group that was used for it. That path must
    equal the expected ``path``.
    """
    robotstxt_content = """
    user-agent: hedwig-news
    disallow: /
    allow: /group1
    user-agent: *
    disallow: /
    allow: /group2
    user-agent: hedwig
    disallow: /
    allow: /group3
    """
    parser = Protego.parse(content=robotstxt_content)
    candidates = ("/group1", "/group2", "/group3")
    # Lazily probe the candidates in order; None means no candidate was fetchable.
    first_allowed: str | None = next(
        (
            candidate
            for candidate in candidates
            if parser.can_fetch(candidate, user_agent)
        ),
        None,
    )
    assert first_allowed == path


@pytest.mark.parametrize(
    "pattern,path,match",
    [
        ("/", "/harry", True),
        ("/", "/device/time-turner", True),
        ("/", "/hogwards.html", True),
        ("/*", "/harry", True),
        ("/*", "/device/time-turner", True),
        ("/*", "/hogwards.html", True),
        ("/phoenix", "/phoenix", True),
        ("/phoenix", "/phoenix.html", True),
        ("/phoenix", "/phoenix/sparky.html", True),
        ("/phoenix", "/phoenixheads", True),
        ("/phoenix", "/phoenixheads/yummy.html", True),
        ("/phoenix", "/phoenix.php?id=anything", True),
        ("/phoenix", "/Phoenix.asp", False),
        ("/phoenix", "/redphoenix", False),
        ("/phoenix", "/?id=phoenix", False),
        ("/phoenix*", "/phoenix", True),
        ("/phoenix*", "/phoenix.html", True),
        ("/phoenix*", "/phoenix/sparky.html", True),
        ("/phoenix*", "/phoenixheads", True),
        ("/phoenix*", "/phoenixheads/yummy.html", True),
        ("/phoenix*", "/phoenix.php?id=anything", True),
        ("/phoenix*", "/Phoenix.asp", False),
        ("/phoenix*", "/redphoenix", False),
        ("/phoenix*", "/?id=phoenix", False),
        ("/phoenix/", "/phoenix/", True),
        ("/phoenix/", "/phoenix/?id=anything", True),
        ("/phoenix/", "/phoenix/sparky.htm", True),
        ("/phoenix/", "/phoenix", False),
        ("/phoenix/", "/phoenix.html", False),
        ("/phoenix/", "/Phoenix/Sparky.asp", False),
        ("/*.php", "/filename.php", True),
        ("/*.php", "/folder/filename.php", True),
        ("/*.php", "/folder/filename.php?parameters", True),
        ("/*.php", "/folder/any.php.file.html", True),
        ("/*.php", "/filename.php/", True),
        ("/*.php", "/windows.PHP", False),
        ("/*.php", "/", False),
        ("/*.php", "/index?f=filename.php/", True),
        ("/*.php", "/index?php", False),
        ("/*.php$", "/filename.php", True),
        ("/*.php$", "/folder/filename.php", True),
        ("/*.php$", "/filename.php?parameters", False),
        ("/*.php$", "/filename.php/", False),
        ("/*.php$", "/filename.php5", False),
        ("/*.php$", "/windows.PHP", False),
        ("/*.php$", "/filename?php", False),
        ("/fish*.php", "/fish.php", True),
        ("/fish*.php", "/fishheads/catfish.php?parameters", True),
        ("/fish*.php", "/Fish.PHP", False),
    ],
)
def test_path_matching(pattern, path, match):
    """Check whether a lone ``disallow`` pattern covers ``path``.

    The robots.txt under test contains a single ``disallow: {pattern}`` rule
    for all user agents. ``match`` is the expected outcome: True when the
    path must be refused, False when it must remain fetchable.
    """
    robots_txt = f"""
    User-Agent: *
    disallow: {pattern}
    """
    parser = Protego.parse(robots_txt)
    # A path counts as matched by the pattern exactly when fetching is denied.
    is_blocked = not parser.can_fetch(path, "*")
    assert is_blocked == match


@pytest.mark.parametrize(
    "rules,url,allowed",
    [
        ("allow: /p \n disallow: /", "http://example.com/page", True),
        ("allow: /folder \n disallow: /folder", "http://example.com/folder/page", True),
        ("allow: /$ \n disallow: /", "http://example.com/", True),
        ("allow: /$ \n disallow: /", "http://example.com/page.htm", False),
    ],
)
def test_record_precedence(rules, url, allowed):
    """Check the fetch verdict for ``url`` under mixed allow/disallow rules.

    ``rules`` is inserted verbatim beneath a ``User-Agent: *`` line, and the
    parser's answer for ``url`` must equal ``allowed``.
    """
    robots_txt = f"""
    User-Agent: *
    {rules}
    """
    parser = Protego.parse(robots_txt)
    verdict = parser.can_fetch(url, "*")
    assert verdict == allowed