# Tests for Protego: user-agent precedence, path matching, and record precedence.
from __future__ import annotations
import pytest
from protego import Protego
@pytest.mark.parametrize(
    "path,user_agent",
    [
        ("/group1", "hedwig-news"),
        ("/group1", "hedwig-news2"),
        ("/group2", "hedwi"),
        ("/group2", "a"),
        ("/group3", "hedwig-new"),
        ("/group3", "hedwig-images"),
    ],
)
def test_user_agent_precedence(path, user_agent):
    """Check which user-agent record is applied to ``user_agent``.

    The robots.txt below has three records (``hedwig-news``, ``*`` and
    ``hedwig``). Each one disallows everything and allows exactly one
    distinct ``/groupN`` path, so the first path reported as fetchable
    shows which record the parser applied. That path must equal ``path``.
    """
    robots_txt = (
        "\n"
        "user-agent: hedwig-news\n"
        "disallow: /\n"
        "allow: /group1\n"
        "user-agent: *\n"
        "disallow: /\n"
        "allow: /group2\n"
        "user-agent: hedwig\n"
        "disallow: /\n"
        "allow: /group3\n"
    )
    parser = Protego.parse(content=robots_txt)

    # Probe the candidate paths in order and stop at the first fetchable one;
    # None means no candidate was fetchable for this user agent.
    candidates = ("/group1", "/group2", "/group3")
    first_allowed: str | None = next(
        (
            candidate
            for candidate in candidates
            if parser.can_fetch(candidate, user_agent)
        ),
        None,
    )
    assert first_allowed == path
@pytest.mark.parametrize(
    "pattern,path,match",
    [
        ("/", "/harry", True),
        ("/", "/device/time-turner", True),
        ("/", "/hogwards.html", True),
        ("/*", "/harry", True),
        ("/*", "/device/time-turner", True),
        ("/*", "/hogwards.html", True),
        ("/phoenix", "/phoenix", True),
        ("/phoenix", "/phoenix.html", True),
        ("/phoenix", "/phoenix/sparky.html", True),
        ("/phoenix", "/phoenixheads", True),
        ("/phoenix", "/phoenixheads/yummy.html", True),
        ("/phoenix", "/phoenix.php?id=anything", True),
        ("/phoenix", "/Phoenix.asp", False),
        ("/phoenix", "/redphoenix", False),
        ("/phoenix", "/?id=phoenix", False),
        ("/phoenix*", "/phoenix", True),
        ("/phoenix*", "/phoenix.html", True),
        ("/phoenix*", "/phoenix/sparky.html", True),
        ("/phoenix*", "/phoenixheads", True),
        ("/phoenix*", "/phoenixheads/yummy.html", True),
        ("/phoenix*", "/phoenix.php?id=anything", True),
        ("/phoenix*", "/Phoenix.asp", False),
        ("/phoenix*", "/redphoenix", False),
        ("/phoenix*", "/?id=phoenix", False),
        ("/phoenix/", "/phoenix/", True),
        ("/phoenix/", "/phoenix/?id=anything", True),
        ("/phoenix/", "/phoenix/sparky.htm", True),
        ("/phoenix/", "/phoenix", False),
        ("/phoenix/", "/phoenix.html", False),
        ("/phoenix/", "/Phoenix/Sparky.asp", False),
        ("/*.php", "/filename.php", True),
        ("/*.php", "/folder/filename.php", True),
        ("/*.php", "/folder/filename.php?parameters", True),
        ("/*.php", "/folder/any.php.file.html", True),
        ("/*.php", "/filename.php/", True),
        ("/*.php", "/windows.PHP", False),
        ("/*.php", "/", False),
        ("/*.php", "/index?f=filename.php/", True),
        ("/*.php", "/index?php", False),
        ("/*.php$", "/filename.php", True),
        ("/*.php$", "/folder/filename.php", True),
        ("/*.php$", "/filename.php?parameters", False),
        ("/*.php$", "/filename.php/", False),
        ("/*.php$", "/filename.php5", False),
        ("/*.php$", "/windows.PHP", False),
        ("/*.php$", "/filename?php", False),
        ("/fish*.php", "/fish.php", True),
        ("/fish*.php", "/fishheads/catfish.php?parameters", True),
        ("/fish*.php", "/Fish.PHP", False),
    ],
)
def test_path_matching(pattern, path, match):
    """Check whether a single ``disallow`` pattern matches ``path``.

    A robots.txt with one ``disallow: {pattern}`` rule for every user agent
    is parsed. ``match`` is True when the pattern is expected to match
    ``path``, i.e. when the path must NOT be fetchable.
    """
    robots_txt = f"\nUser-Agent: *\ndisallow: {pattern}\n"
    parser = Protego.parse(robots_txt)

    # The only rule is a disallow, so "blocked" is the same as "matched".
    is_blocked = not parser.can_fetch(path, "*")
    assert is_blocked == match
@pytest.mark.parametrize(
    "rules,url,allowed",
    [
        ("allow: /p \n disallow: /", "http://example.com/page", True),
        (
            "allow: /folder \n disallow: /folder",
            "http://example.com/folder/page",
            True,
        ),
        ("allow: /$ \n disallow: /", "http://example.com/", True),
        ("allow: /$ \n disallow: /", "http://example.com/page.htm", False),
    ],
)
def test_record_precedence(rules, url, allowed):
    """Check the outcome when ``allow`` and ``disallow`` rules compete.

    ``rules`` holds an allow/disallow pair that is placed under a
    ``User-Agent: *`` record. ``allowed`` is the expected result of asking
    whether ``url`` may be fetched.
    """
    robots_txt = f"\nUser-Agent: *\n{rules}\n"
    parser = Protego.parse(robots_txt)

    verdict = parser.can_fetch(url, "*")
    assert verdict == allowed
|