# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2008 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Test robots.txt parsing.
"""
import unittest
import linkcheck.robotparser2


class TestRobotParser (unittest.TestCase):
"""
Test robots.txt parser (needs internet access).
"""
needed_resources = ['network']
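    # Assumption: the 'network' entry in needed_resources is consulted by the
    # project's test runner to skip this test case when no internet connection
    # is available; that mechanism lives outside this module.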

    def setUp (self):
        """
        Initialize self.rp as a robots.txt parser.
        """
        self.rp = linkcheck.robotparser2.RobotFileParser()

    def check (self, a, b):
        """
        Helper comparing the actual result a with the expected result b.
        """
        if not b:
            ac = "access denied"
        else:
            ac = "access allowed"
        if a != b:
            self.fail("%s != %s (%s)" % (a, b, ac))

    def test_existing_robots (self):
        """
        Test parsing and access of an existing robots.txt file.
        """
        # robots.txt that exists (use web archive to be sure to have the
        # same robots.txt every time).
        self.rp.set_url('http://web.archive.org/web/20050312093828/http://www.musi-cal.com/robots.txt')
        self.rp.read()
        # test for re.escape
        self.check(self.rp.can_fetch('*', 'http://www.musi-cal.com/'), True)
        # this should match the first rule, which is a disallow
        self.check(self.rp.can_fetch('', 'http://www.musi-cal.com/'), False)
        # various cherry pickers
        self.check(self.rp.can_fetch('CherryPickerSE',
                   'http://www.musi-cal.com/cgi-bin/event-search'
                   '?city=San+Francisco'), False)
        self.check(self.rp.can_fetch('CherryPickerSE/1.0',
                   'http://www.musi-cal.com/cgi-bin/event-search'
                   '?city=San+Francisco'), False)
        self.check(self.rp.can_fetch('CherryPickerSE/1.5',
                   'http://www.musi-cal.com/cgi-bin/event-search'
                   '?city=San+Francisco'), False)
        # case sensitivity
        self.check(self.rp.can_fetch('ExtractorPro',
                   'http://www.musi-cal.com/blubba'), False)
        self.check(self.rp.can_fetch('extractorpro',
                   'http://www.musi-cal.com/blubba'), False)
        # substring test
        self.check(self.rp.can_fetch('toolpak/1.1',
                   'http://www.musi-cal.com/blubba'), False)
        # tests for catch-all * agent
        self.check(self.rp.can_fetch('spam',
                   'http://www.musi-cal.com/vsearch'), False)
        self.check(self.rp.can_fetch('spam',
                   'http://www.musi-cal.com/Musician/me'), True)
        self.check(self.rp.can_fetch('spam',
                   'http://www.musi-cal.com/'), True)
        self.check(self.rp.can_fetch('spam',
                   'http://www.musi-cal.com/'), True)

    def test_nonexisting_robots (self):
        """
        Test access of a non-existing robots.txt file.
        """
        # robots.txt that does not exist; a missing robots.txt is treated
        # as allowing access to everything
        self.rp.set_url('http://www.lycos.com/robots.txt')
        self.rp.read()
        self.check(self.rp.can_fetch('Mozilla',
                   'http://www.lycos.com/search'), True)

    def test_password_robots (self):
        """
        Test access when the robots.txt URL is password-protected.
        """
        # The whole site is password-protected; the parser is expected to
        # treat this as disallowing all access.
        self.rp.set_url('http://mueblesmoraleda.com/robots.txt')
        self.rp.read()
        self.check(self.rp.can_fetch("*",
                   "http://mueblesmoraleda.com/"), False)


def test_suite ():
    """
    Build and return a TestSuite.
    """
    return unittest.makeSuite(TestRobotParser)
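

# Usage sketch: running this file directly (filename assumed) invokes
# unittest.main() below, e.g. "python test_robotparser.py -v" for verbose
# output, provided the linkcheck package is on the Python path and network
# access is available.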
if __name__ == '__main__':
    unittest.main()