# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2008 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Test robots.txt parsing.
"""
import unittest
import linkcheck.robotparser2


class TestRobotParser (unittest.TestCase):
"""
Test robots.txt parser (needs internet access).
"""
needed_resources = ['network']
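    # Assumption: the 'network' entry in needed_resources is consulted by the
    # project's test runner to skip this test case when no internet connection
    # is available; that mechanism lives outside this module.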

    def setUp (self):
        """
        Initialize self.rp as a robots.txt parser.
        """
        self.rp = linkcheck.robotparser2.RobotFileParser()

    def check (self, a, b):
        """
        Helper comparing the actual result a with the expected result b.
        """
        if not b:
            ac = "access denied"
        else:
            ac = "access allowed"
        if a != b:
            self.fail("%s != %s (%s)" % (a, b, ac))

    def test_existing_robots (self):
        """
        Test parsing and access of an existing robots.txt file.
        """
        # robots.txt that exists (use web archive to be sure to have the
        # same robots.txt every time).
        self.rp.set_url('http://web.archive.org/web/20050312093828/http://www.musi-cal.com/robots.txt')
        self.rp.read()
        # test for re.escape
        self.check(self.rp.can_fetch('*', 'http://www.musi-cal.com/'), True)
        # this should match the first rule, which is a disallow
        self.check(self.rp.can_fetch('', 'http://www.musi-cal.com/'), False)
        # various cherry pickers
        self.check(self.rp.can_fetch('CherryPickerSE',
                   'http://www.musi-cal.com/cgi-bin/event-search'
                   '?city=San+Francisco'), False)
        self.check(self.rp.can_fetch('CherryPickerSE/1.0',
                   'http://www.musi-cal.com/cgi-bin/event-search'
                   '?city=San+Francisco'), False)
        self.check(self.rp.can_fetch('CherryPickerSE/1.5',
                   'http://www.musi-cal.com/cgi-bin/event-search'
                   '?city=San+Francisco'), False)
        # case sensitivity
        self.check(self.rp.can_fetch('ExtractorPro',
                   'http://www.musi-cal.com/blubba'), False)
        self.check(self.rp.can_fetch('extractorpro',
                   'http://www.musi-cal.com/blubba'), False)
        # substring test
        self.check(self.rp.can_fetch('toolpak/1.1',
                   'http://www.musi-cal.com/blubba'), False)
        # tests for catch-all * agent
        self.check(self.rp.can_fetch('spam',
                   'http://www.musi-cal.com/vsearch'), False)
        self.check(self.rp.can_fetch('spam',
                   'http://www.musi-cal.com/Musician/me'), True)
        self.check(self.rp.can_fetch('spam',
                   'http://www.musi-cal.com/'), True)
        self.check(self.rp.can_fetch('spam',
                   'http://www.musi-cal.com/'), True)

    def test_nonexisting_robots (self):
        """
        Test access of a non-existing robots.txt file.
        """
        # robots.txt that does not exist; a missing robots.txt is treated
        # as allowing access to everything
        self.rp.set_url('http://www.lycos.com/robots.txt')
        self.rp.read()
        self.check(self.rp.can_fetch('Mozilla',
                   'http://www.lycos.com/search'), True)

    def test_password_robots (self):
        """
        Test access when the robots.txt URL is password-protected.
        """
        # The whole site is password-protected; the parser is expected to
        # treat this as disallowing all access.
        self.rp.set_url('http://mueblesmoraleda.com/robots.txt')
        self.rp.read()
        self.check(self.rp.can_fetch("*",
                   "http://mueblesmoraleda.com/"), False)


def test_suite ():
    """
    Build and return a TestSuite.
    """
    return unittest.makeSuite(TestRobotParser)
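

# Usage sketch: running this file directly (filename assumed) invokes
# unittest.main() below, e.g. "python test_robotparser.py -v" for verbose
# output, provided the linkcheck package is on the Python path and network
# access is available.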
if __name__ == '__main__':
    unittest.main()