1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
# -*- coding: utf-8 -*-
import unittest
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware, MetaRefreshMiddleware
from scrapy.spiders import Spider
from scrapy.exceptions import IgnoreRequest
from scrapy.http import Request, Response, HtmlResponse
from scrapy.utils.test import get_crawler
class RedirectMiddlewareTest(unittest.TestCase):
def setUp(self):
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider('foo')
self.mw = RedirectMiddleware.from_crawler(self.crawler)
def test_priority_adjust(self):
req = Request('http://a.com')
rsp = Response('http://a.com', headers={'Location': 'http://a.com/redirected'}, status=301)
req2 = self.mw.process_response(req, rsp, self.spider)
assert req2.priority > req.priority
def test_redirect_3xx_permanent(self):
def _test(method, status=301):
url = 'http://www.example.com/{}'.format(status)
url2 = 'http://www.example.com/redirected'
req = Request(url, method=method)
rsp = Response(url, headers={'Location': url2}, status=status)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, method)
# response without Location header but with status code is 3XX should be ignored
del rsp.headers['Location']
assert self.mw.process_response(req, rsp, self.spider) is rsp
_test('GET')
_test('POST')
_test('HEAD')
_test('GET', status=307)
_test('POST', status=307)
_test('HEAD', status=307)
_test('GET', status=308)
_test('POST', status=308)
_test('HEAD', status=308)
def test_dont_redirect(self):
url = 'http://www.example.com/301'
url2 = 'http://www.example.com/redirected'
req = Request(url, meta={'dont_redirect': True})
rsp = Response(url, headers={'Location': url2}, status=301)
r = self.mw.process_response(req, rsp, self.spider)
assert isinstance(r, Response)
assert r is rsp
# Test that it redirects when dont_redirect is False
req = Request(url, meta={'dont_redirect': False})
rsp = Response(url2, status=200)
r = self.mw.process_response(req, rsp, self.spider)
assert isinstance(r, Response)
assert r is rsp
def test_redirect_302(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = Response(url, headers={'Location': url2}, status=302)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, 'GET')
assert 'Content-Type' not in req2.headers, \
"Content-Type header must not be present in redirected request"
assert 'Content-Length' not in req2.headers, \
"Content-Length header must not be present in redirected request"
assert not req2.body, \
"Redirected body must be empty, not '%s'" % req2.body
# response without Location header but with status code is 3XX should be ignored
del rsp.headers['Location']
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_redirect_302_head(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='HEAD')
rsp = Response(url, headers={'Location': url2}, status=302)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, 'HEAD')
# response without Location header but with status code is 3XX should be ignored
del rsp.headers['Location']
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request('http://scrapytest.org/302')
rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
assert 'redirect_times' in req.meta
self.assertEqual(req.meta['redirect_times'], 1)
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
def test_ttl(self):
self.mw.max_redirect_times = 100
req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
rsp = Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
def test_redirect_urls(self):
req1 = Request('http://scrapytest.org/first')
rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
req2 = self.mw.process_response(req1, rsp1, self.spider)
rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
req3 = self.mw.process_response(req2, rsp2, self.spider)
self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected'])
def test_spider_handling(self):
smartspider = self.crawler._create_spider('smarty')
smartspider.handle_httpstatus_list = [404, 301, 302]
url = 'http://www.example.com/301'
url2 = 'http://www.example.com/redirected'
req = Request(url)
rsp = Response(url, headers={'Location': url2}, status=301)
r = self.mw.process_response(req, rsp, smartspider)
self.assertIs(r, rsp)
def test_request_meta_handling(self):
url = 'http://www.example.com/301'
url2 = 'http://www.example.com/redirected'
def _test_passthrough(req):
rsp = Response(url, headers={'Location': url2}, status=301, request=req)
r = self.mw.process_response(req, rsp, self.spider)
self.assertIs(r, rsp)
_test_passthrough(Request(url, meta={'handle_httpstatus_list':
[404, 301, 302]}))
_test_passthrough(Request(url, meta={'handle_httpstatus_all': True}))
def test_latin1_location(self):
req = Request('http://scrapytest.org/first')
latin1_location = u'/ação'.encode('latin1') # HTTP historically supports latin1
resp = Response('http://scrapytest.org/first', headers={'Location': latin1_location}, status=302)
req_result = self.mw.process_response(req, resp, self.spider)
perc_encoded_utf8_url = 'http://scrapytest.org/a%E7%E3o'
self.assertEqual(perc_encoded_utf8_url, req_result.url)
def test_utf8_location(self):
req = Request('http://scrapytest.org/first')
utf8_location = u'/ação'.encode('utf-8') # header using UTF-8 encoding
resp = Response('http://scrapytest.org/first', headers={'Location': utf8_location}, status=302)
req_result = self.mw.process_response(req, resp, self.spider)
perc_encoded_utf8_url = 'http://scrapytest.org/a%C3%A7%C3%A3o'
self.assertEqual(perc_encoded_utf8_url, req_result.url)
class MetaRefreshMiddlewareTest(unittest.TestCase):
def setUp(self):
crawler = get_crawler(Spider)
self.spider = crawler._create_spider('foo')
self.mw = MetaRefreshMiddleware.from_crawler(crawler)
def _body(self, interval=5, url='http://example.org/newpage'):
html = u"""<html><head><meta http-equiv="refresh" content="{0};url={1}"/></head></html>"""
return html.format(interval, url).encode('utf-8')
def test_priority_adjust(self):
req = Request('http://a.com')
rsp = HtmlResponse(req.url, body=self._body())
req2 = self.mw.process_response(req, rsp, self.spider)
assert req2.priority > req.priority
def test_meta_refresh(self):
req = Request(url='http://example.org')
rsp = HtmlResponse(req.url, body=self._body())
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, 'http://example.org/newpage')
def test_meta_refresh_with_high_interval(self):
# meta-refresh with high intervals don't trigger redirects
req = Request(url='http://example.org')
rsp = HtmlResponse(url='http://example.org',
body=self._body(interval=1000),
encoding='utf-8')
rsp2 = self.mw.process_response(req, rsp, self.spider)
assert rsp is rsp2
def test_meta_refresh_trough_posted_request(self):
req = Request(url='http://example.org', method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = HtmlResponse(req.url, body=self._body())
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, 'http://example.org/newpage')
self.assertEqual(req2.method, 'GET')
assert 'Content-Type' not in req2.headers, \
"Content-Type header must not be present in redirected request"
assert 'Content-Length' not in req2.headers, \
"Content-Length header must not be present in redirected request"
assert not req2.body, \
"Redirected body must be empty, not '%s'" % req2.body
def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request('http://scrapytest.org/max')
rsp = HtmlResponse(req.url, body=self._body())
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
assert 'redirect_times' in req.meta
self.assertEqual(req.meta['redirect_times'], 1)
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
def test_ttl(self):
self.mw.max_redirect_times = 100
req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
rsp = HtmlResponse(req.url, body=self._body())
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
def test_redirect_urls(self):
req1 = Request('http://scrapytest.org/first')
rsp1 = HtmlResponse(req1.url, body=self._body(url='/redirected'))
req2 = self.mw.process_response(req1, rsp1, self.spider)
assert isinstance(req2, Request), req2
rsp2 = HtmlResponse(req2.url, body=self._body(url='/redirected2'))
req3 = self.mw.process_response(req2, rsp2, self.spider)
assert isinstance(req3, Request), req3
self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected'])
if __name__ == "__main__":
unittest.main()
|