1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
|
import pytest
from protego import Protego
@pytest.mark.parametrize(
    'path,user_agent',
    [
        ('/group1', 'hedwig-news'),
        ('/group1', 'hedwig-news2'),
        ('/group2', 'hedwi'),
        ('/group2', 'a'),
        ('/group3', 'hedwig-new'),
        ('/group3', 'hedwig-images'),
    ]
)
def test_user_agent_precedence(path, user_agent):
    """Check which user-agent group is applied to a given crawler name.

    Every group in the robots.txt below disallows ``/`` and re-allows one
    distinctive ``/groupN`` path, so the first candidate path the crawler
    may fetch reveals which group was selected for ``user_agent``.
    """
    robots_txt = (
        u"\n"
        u"user-agent: hedwig-news\n"
        u"disallow: /\n"
        u"allow: /group1\n"
        u"user-agent: *\n"
        u"disallow: /\n"
        u"allow: /group2\n"
        u"user-agent: hedwig\n"
        u"disallow: /\n"
        u"allow: /group3\n"
    )
    parser = Protego.parse(content=robots_txt)

    candidates = ('/group1', '/group2', '/group3')
    # First candidate the agent may fetch, or None when all are refused.
    first_allowed = next(
        (candidate for candidate in candidates
         if parser.can_fetch(candidate, user_agent)),
        None
    )
    assert first_allowed == path
@pytest.mark.parametrize(
    'pattern,path,match',
    [
        # A bare slash is expected to cover every path.
        ('/', '/harry', True),
        ('/', '/device/time-turner', True),
        ('/', '/hogwards.html', True),
        # The explicit wildcard form is expected to behave the same way.
        ('/*', '/harry', True),
        ('/*', '/device/time-turner', True),
        ('/*', '/hogwards.html', True),
        # Plain prefix: case-sensitive and anchored at the start of the path.
        ('/phoenix', '/phoenix', True),
        ('/phoenix', '/phoenix.html', True),
        ('/phoenix', '/phoenix/sparky.html', True),
        ('/phoenix', '/phoenixheads', True),
        ('/phoenix', '/phoenixheads/yummy.html', True),
        ('/phoenix', '/phoenix.php?id=anything', True),
        ('/phoenix', '/Phoenix.asp', False),
        ('/phoenix', '/redphoenix', False),
        ('/phoenix', '/?id=phoenix', False),
        # A trailing wildcard gives the same expectations as the prefix.
        ('/phoenix*', '/phoenix', True),
        ('/phoenix*', '/phoenix.html', True),
        ('/phoenix*', '/phoenix/sparky.html', True),
        ('/phoenix*', '/phoenixheads', True),
        ('/phoenix*', '/phoenixheads/yummy.html', True),
        ('/phoenix*', '/phoenix.php?id=anything', True),
        ('/phoenix*', '/Phoenix.asp', False),
        ('/phoenix*', '/redphoenix', False),
        ('/phoenix*', '/?id=phoenix', False),
        # Trailing slash: only paths underneath the directory are covered.
        ('/phoenix/', '/phoenix/', True),
        ('/phoenix/', '/phoenix/?id=anything', True),
        ('/phoenix/', '/phoenix/sparky.htm', True),
        ('/phoenix/', '/phoenix', False),
        ('/phoenix/', '/phoenix.html', False),
        ('/phoenix/', '/Phoenix/Sparky.asp', False),
        # Leading wildcard: '.php' may occur anywhere after the first slash.
        ('/*.php', '/filename.php', True),
        ('/*.php', '/folder/filename.php', True),
        ('/*.php', '/folder/filename.php?parameters', True),
        ('/*.php', '/folder/any.php.file.html', True),
        ('/*.php', '/filename.php/', True),
        ('/*.php', '/windows.PHP', False),
        ('/*.php', '/', False),
        ('/*.php', '/index?f=filename.php/', True),
        ('/*.php', '/index?php', False),
        # '$' pins the pattern to the very end of the path.
        ('/*.php$', '/filename.php', True),
        ('/*.php$', '/folder/filename.php', True),
        ('/*.php$', '/filename.php?parameters', False),
        ('/*.php$', '/filename.php/', False),
        ('/*.php$', '/filename.php5', False),
        ('/*.php$', '/windows.PHP', False),
        ('/*.php$', '/filename?php', False),
        # Wildcard sandwiched between two literal parts.
        ('/fish*.php', '/fish.php', True),
        ('/fish*.php', '/fishheads/catfish.php?parameters', True),
        ('/fish*.php', '/Fish.PHP', False),
    ]
)
def test_path_matching(pattern, path, match):
    """Check whether a single ``disallow`` pattern applies to ``path``.

    ``match`` is True when ``pattern`` is expected to cover ``path``, i.e.
    when the ``*`` user agent must be refused that path.
    """
    robots_txt = (
        "\n"
        "User-Agent: *\n"
        "disallow: {}\n"
    ).format(pattern)
    parser = Protego.parse(robots_txt)

    # The only rule is a disallow, so "covered by the pattern" == "blocked".
    is_blocked = not parser.can_fetch(path, '*')
    assert is_blocked == match
@pytest.mark.parametrize(
    'rules,url,allowed',
    [
        ("allow: /p \n disallow: /", "http://example.com/page", True),
        ("allow: /folder \n disallow: /folder",
         "http://example.com/folder/page", True),
        ("allow: /$ \n disallow: /", "http://example.com/", True),
        ("allow: /$ \n disallow: /", "http://example.com/page.htm", False),
    ]
)
def test_record_precedence(rules, url, allowed):
    """Check the fetch verdict for ``url`` under an allow/disallow pair.

    ``rules`` is spliced into a group for the ``*`` user agent, and the
    parser's answer for ``url`` is compared with ``allowed``.
    """
    robots_txt = (
        "\n"
        "User-Agent: *\n"
        "{}\n"
    ).format(rules)
    parser = Protego.parse(robots_txt)

    verdict = parser.can_fetch(url, '*')
    assert verdict == allowed
|