# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
from twisted.internet import reactor, error
from twisted.internet.defer import Deferred, DeferredList, maybeDeferred
from twisted.python import failure
from twisted.trial import unittest
from scrapy.downloadermiddlewares.robotstxt import (RobotsTxtMiddleware,
logger as mw_module_logger)
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response, TextResponse
from scrapy.settings import Settings
from tests import mock
class RobotsTxtMiddlewareTest(unittest.TestCase):
    """Tests for ``RobotsTxtMiddleware`` driven by a mocked crawler.

    ``crawler.engine.download`` is a mock; each test installs a side effect
    on it that returns a Deferred fired with a canned robots.txt response
    or with a failure.
    """

    def setUp(self):
        """Create a mock crawler with real ``Settings`` and a mock downloader."""
        self.crawler = mock.MagicMock()
        self.crawler.settings = Settings()
        self.crawler.engine.download = mock.MagicMock()

    def tearDown(self):
        """Drop the per-test crawler."""
        del self.crawler

    def test_robotstxt_settings(self):
        """Constructing the middleware with only USER_AGENT set raises NotConfigured."""
        self.crawler.settings = Settings()
        self.crawler.settings.set('USER_AGENT', 'CustomAgent')
        self.assertRaises(NotConfigured, RobotsTxtMiddleware, self.crawler)

    def _get_crawler_returning(self, response):
        """Return ``self.crawler`` set up to obey robots.txt and serve *response*.

        ``ROBOTSTXT_OBEY`` is switched on and ``engine.download`` is stubbed
        to return a Deferred that is fired with *response* through
        ``reactor.callFromThread`` (i.e. not synchronously inside the call).
        """
        crawler = self.crawler
        crawler.settings.set('ROBOTSTXT_OBEY', True)

        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred

        crawler.engine.download.side_effect = return_response
        return crawler

    def _get_successful_crawler(self):
        """Return a crawler whose robots.txt download yields real rules."""
        # The inline flag must lead the pattern: a trailing ``(?m)`` is
        # deprecated since Python 3.6 and rejected by Python 3.11+. The raw
        # literal keeps ``\s`` from being treated as a string escape.
        # The substitution strips the leading whitespace of every line, so
        # the indentation of the literal below does not reach the body.
        ROBOTS = re.sub(br'(?m)^\s+', b'', u'''
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/
        # taken from https://en.wikipedia.org/robots.txt
        Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
        Disallow: /wiki/Käyttäjä:
        User-Agent: UnicödeBöt
        Disallow: /some/randome/page.html
        '''.encode('utf-8'))
        response = TextResponse('http://site.local/robots.txt', body=ROBOTS)
        return self._get_crawler_returning(response)

    def test_robotstxt(self):
        """Disallowed paths (ASCII, percent-encoded, unicode) are ignored."""
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertIgnored(Request('http://site.local/static/'), middleware),
            self.assertIgnored(Request('http://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:'), middleware),
            self.assertIgnored(Request(u'http://site.local/wiki/Käyttäjä:'), middleware),
        ], fireOnOneErrback=True)

    def test_robotstxt_ready_parser(self):
        """A second request to the same site, made after the first finished, is allowed too."""
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertNotIgnored(Request('http://site.local/allowed'), middleware))
        return d

    def test_robotstxt_meta(self):
        """Requests carrying ``dont_obey_robotstxt`` pass even on disallowed paths."""
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        meta = {'dont_obey_robotstxt': True}
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed', meta=meta), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main', meta=meta), middleware),
            self.assertNotIgnored(Request('http://site.local/static/', meta=meta), middleware),
        ], fireOnOneErrback=True)

    def _get_garbage_crawler(self):
        """Return a crawler whose robots.txt download yields binary garbage."""
        response = Response('http://site.local/robots.txt', body=b'GIF89a\xd3\x00\xfe\x00\xa2')
        return self._get_crawler_returning(response)

    def test_robotstxt_garbage(self):
        """A garbage robots.txt body is discarded, which equals 'allow all'."""
        middleware = RobotsTxtMiddleware(self._get_garbage_crawler())
        deferred = DeferredList([
            self.assertNotIgnored(Request('http://site.local'), middleware),
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertNotIgnored(Request('http://site.local/static/'), middleware),
        ], fireOnOneErrback=True)
        return deferred

    def _get_emptybody_crawler(self):
        """Return a crawler whose robots.txt download yields an empty body."""
        response = Response('http://site.local/robots.txt')
        return self._get_crawler_returning(response)

    def test_robotstxt_empty_response(self):
        """An empty robots.txt response equals 'allow all'."""
        middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
        return DeferredList([
            self.assertNotIgnored(Request('http://site.local/allowed'), middleware),
            self.assertNotIgnored(Request('http://site.local/admin/main'), middleware),
            self.assertNotIgnored(Request('http://site.local/static/'), middleware),
        ], fireOnOneErrback=True)

    def test_robotstxt_error(self):
        """A failed robots.txt download goes through ``middleware._logerror``."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')

        def return_failure(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(err))
            return deferred

        self.crawler.engine.download.side_effect = return_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        # Wrap the real handler so it still runs while its calls are recorded.
        middleware._logerror = mock.MagicMock(side_effect=middleware._logerror)
        deferred = middleware.process_request(Request('http://site.local'), None)
        deferred.addCallback(lambda _: self.assertTrue(middleware._logerror.called))
        return deferred

    def test_robotstxt_immediate_error(self):
        """A download Deferred that has already failed when returned is handled."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')

        def immediate_failure(request, spider):
            deferred = Deferred()
            # Fired synchronously, before the middleware receives the Deferred.
            deferred.errback(failure.Failure(err))
            return deferred

        self.crawler.engine.download.side_effect = immediate_failure

        middleware = RobotsTxtMiddleware(self.crawler)
        return self.assertNotIgnored(Request('http://site.local'), middleware)

    def test_ignore_robotstxt_request(self):
        """An IgnoreRequest failure for robots.txt is not reported via ``logger.error``."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)

        def ignore_request(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.errback, failure.Failure(IgnoreRequest()))
            return deferred

        self.crawler.engine.download.side_effect = ignore_request

        middleware = RobotsTxtMiddleware(self.crawler)
        # Patch with a cleanup instead of assigning to the logger directly:
        # the module logger is shared, so a bare assignment would leave the
        # mock installed for every test that runs afterwards.
        patcher = mock.patch.object(mw_module_logger, 'error')
        mocked_error = patcher.start()
        self.addCleanup(patcher.stop)

        d = self.assertNotIgnored(Request('http://site.local/allowed'), middleware)
        d.addCallback(lambda _: self.assertFalse(mocked_error.called))
        return d

    def assertNotIgnored(self, request, middleware):
        """Return a Deferred asserting that ``process_request`` results in None."""
        spider = None  # not actually used
        dfd = maybeDeferred(middleware.process_request, request, spider)
        dfd.addCallback(self.assertIsNone)
        return dfd

    def assertIgnored(self, request, middleware):
        """Return a Deferred asserting that ``process_request`` fails with IgnoreRequest."""
        spider = None  # not actually used
        return self.assertFailure(maybeDeferred(middleware.process_request, request, spider),
                                  IgnoreRequest)
|