File: test_downloadermiddleware_robotstxt.py

Package: python-scrapy 1.5.1-1+deb10u1 (Debian buster)
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
from twisted.internet import reactor, error
from twisted.internet.defer import Deferred, DeferredList, maybeDeferred
from twisted.python import failure
from twisted.trial import unittest
from scrapy.downloadermiddlewares.robotstxt import (RobotsTxtMiddleware,
                                                    logger as mw_module_logger)
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response, TextResponse
from scrapy.settings import Settings
from tests import mock


class RobotsTxtMiddlewareTest(unittest.TestCase):

    def setUp(self):
        self.crawler = mock.MagicMock()
        self.crawler.settings = Settings()
        self.crawler.engine.download = mock.MagicMock()

    def tearDown(self):
        del self.crawler

    def test_robotstxt_settings(self):
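        # ROBOTSTXT_OBEY is left unset, so the middleware must refuse to
        # initialize and raise NotConfigured.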
        self.crawler.settings = Settings()
        self.crawler.settings.set('USER_AGENT', 'CustomAgent')
        self.assertRaises(NotConfigured, RobotsTxtMiddleware, self.crawler)

    def _get_successful_crawler(self):
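        # Mock the engine so that the robots.txt download resolves
        # asynchronously (callback fired from the reactor thread) with a
        # well-formed file disallowing /admin/, /static/ and both the
        # percent-encoded and raw UTF-8 spellings of a non-ASCII path.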
        crawler = self.crawler
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        ROBOTS = re.sub(br'(?m)^\s+', b'', u'''
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/

        # taken from https://en.wikipedia.org/robots.txt
        Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
        Disallow: /wiki/Käyttäjä:

        User-Agent: UnicödeBöt
        Disallow: /some/randome/page.html
        '''.encode('utf-8'))
        response = TextResponse('http://site.local/robots.txt', body=ROBOTS)
        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred
        crawler.engine.download.side_effect = return_response
        return crawler

    def test_robotstxt(self):
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertIgnored(Request('http://site.local/static/'), middleware),
            self.assertIgnored(Request('http://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:'), middleware),
            self.assertIgnored(Request(u'http://site.local/wiki/Käyttäjä:'), middleware)
        ], fireOnOneErrback=True)

    def test_robotstxt_ready_parser(self):
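        # The second check for the same netloc should be answered by the
        # parser that was already fetched for the first request.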
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertNotIgnored(Request('http://site.local/allowed'), middleware))
        return d

    def test_robotstxt_meta(self):
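        # Requests carrying dont_obey_robotstxt in their meta bypass the
        # robots.txt rules, even for otherwise disallowed paths.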
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        meta = {'dont_obey_robotstxt': True}
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed', meta=meta), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main', meta=meta), middleware),
            self.assertNotIgnored(Request('http://site.local/static/', meta=meta), middleware)
        ], fireOnOneErrback=True)

    def _get_garbage_crawler(self):
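        # The robots.txt body is binary garbage (a truncated GIF header)
        # rather than a parseable rules file.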
        crawler = self.crawler
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        response = Response('http://site.local/robots.txt', body=b'GIF89a\xd3\x00\xfe\x00\xa2')
        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred
        crawler.engine.download.side_effect = return_response
        return crawler

    def test_robotstxt_garbage(self):
        # garbage response should be discarded, equal 'allow all'
        middleware = RobotsTxtMiddleware(self._get_garbage_crawler())
        deferred = DeferredList([
            self.assertNotIgnored(Request('http://site.local'), middleware),
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertNotIgnored(Request('http://site.local/static/'), middleware)
        ], fireOnOneErrback=True)
        return deferred

    def _get_emptybody_crawler(self):
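        # The robots.txt response has no body at all.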
        crawler = self.crawler
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        response = Response('http://site.local/robots.txt')
        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred
        crawler.engine.download.side_effect = return_response
        return crawler

    def test_robotstxt_empty_response(self):
        # empty response should equal 'allow all'
        middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertNotIgnored(Request('http://site.local/static/'), middleware)
        ], fireOnOneErrback=True)

    def test_robotstxt_error(self):
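        # The robots.txt download fails asynchronously with a DNS error;
        # the middleware must invoke its _logerror hook.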
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')
        def return_failure(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(err))
            return deferred
        self.crawler.engine.download.side_effect = return_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        middleware._logerror = mock.MagicMock(side_effect=middleware._logerror)
        deferred = middleware.process_request(Request('http://site.local'), None)
        deferred.addCallback(lambda _: self.assertTrue(middleware._logerror.called))
        return deferred

    def test_robotstxt_immediate_error(self):
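        # The download deferred has already failed (synchronous errback)
        # by the time the middleware gets it; the request must still be
        # allowed through.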
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')
        def immediate_failure(request, spider):
            deferred = Deferred()
            deferred.errback(failure.Failure(err))
            return deferred
        self.crawler.engine.download.side_effect = immediate_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        return self.assertNotIgnored(Request('http://site.local'), middleware)

    def test_ignore_robotstxt_request(self):
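        # An IgnoreRequest raised while fetching robots.txt must not be
        # logged as an error, and the original request must not be blocked.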
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        def ignore_request(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
            return deferred
        self.crawler.engine.download.side_effect = ignore_request

        middleware = RobotsTxtMiddleware(self.crawler)
        mw_module_logger.error = mock.MagicMock()

        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertFalse(mw_module_logger.error.called))
        return d

    def assertNotIgnored(self, request, middleware):
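        # The request is allowed: process_request must (eventually) yield None.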
        spider = None  # not actually used
        dfd = maybeDeferred(middleware.process_request, request, spider)
        dfd.addCallback(self.assertIsNone)
        return dfd

    def assertIgnored(self, request, middleware):
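        # The request is blocked: process_request must fail with IgnoreRequest.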
        spider = None  # not actually used
        return self.assertFailure(maybeDeferred(middleware.process_request, request, spider),
                                  IgnoreRequest)