1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
|
import unittest
from scrapy.contrib.downloadermiddleware.redirect import RedirectMiddleware
from scrapy.spider import BaseSpider
from scrapy.exceptions import IgnoreRequest
from scrapy.http import Request, Response, HtmlResponse, Headers
class RedirectMiddlewareTest(unittest.TestCase):
def setUp(self):
self.spider = BaseSpider('foo')
self.mw = RedirectMiddleware()
def test_priority_adjust(self):
req = Request('http://a.com')
rsp = Response('http://a.com', headers={'Location': 'http://a.com/redirected'}, status=301)
req2 = self.mw.process_response(req, rsp, self.spider)
assert req2.priority > req.priority
def test_redirect_301(self):
def _test(method):
url = 'http://www.example.com/301'
url2 = 'http://www.example.com/redirected'
req = Request(url, method=method)
rsp = Response(url, headers={'Location': url2}, status=301)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, method)
# response without Location header but with status code is 3XX should be ignored
del rsp.headers['Location']
assert self.mw.process_response(req, rsp, self.spider) is rsp
_test('GET')
_test('POST')
_test('HEAD')
def test_dont_redirect(self):
url = 'http://www.example.com/301'
url2 = 'http://www.example.com/redirected'
req = Request(url, meta={'dont_redirect': True})
rsp = Response(url, headers={'Location': url2}, status=301)
r = self.mw.process_response(req, rsp, self.spider)
assert isinstance(r, Response)
assert r is rsp
def test_redirect_302(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = Response(url, headers={'Location': url2}, status=302)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, 'GET')
assert 'Content-Type' not in req2.headers, \
"Content-Type header must not be present in redirected request"
assert 'Content-Length' not in req2.headers, \
"Content-Length header must not be present in redirected request"
assert not req2.body, \
"Redirected body must be empty, not '%s'" % req2.body
# response without Location header but with status code is 3XX should be ignored
del rsp.headers['Location']
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_redirect_302_head(self):
url = 'http://www.example.com/302'
url2 = 'http://www.example.com/redirected2'
req = Request(url, method='HEAD')
rsp = Response(url, headers={'Location': url2}, status=302)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, url2)
self.assertEqual(req2.method, 'HEAD')
# response without Location header but with status code is 3XX should be ignored
del rsp.headers['Location']
assert self.mw.process_response(req, rsp, self.spider) is rsp
def test_meta_refresh(self):
body = """<html>
<head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
</html>"""
req = Request(url='http://example.org')
rsp = HtmlResponse(url='http://example.org', body=body)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, 'http://example.org/newpage')
def test_meta_refresh_with_high_interval(self):
# meta-refresh with high intervals don't trigger redirects
body = """<html>
<head><meta http-equiv="refresh" content="1000;url=http://example.org/newpage" /></head>
</html>"""
req = Request(url='http://example.org')
rsp = HtmlResponse(url='http://example.org', body=body)
rsp2 = self.mw.process_response(req, rsp, self.spider)
assert rsp is rsp2
def test_meta_refresh_trough_posted_request(self):
body = """<html>
<head><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
</html>"""
req = Request(url='http://example.org', method='POST', body='test',
headers={'Content-Type': 'text/plain', 'Content-length': '4'})
rsp = HtmlResponse(url='http://example.org', body=body)
req2 = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req2, Request)
self.assertEqual(req2.url, 'http://example.org/newpage')
self.assertEqual(req2.method, 'GET')
assert 'Content-Type' not in req2.headers, \
"Content-Type header must not be present in redirected request"
assert 'Content-Length' not in req2.headers, \
"Content-Length header must not be present in redirected request"
assert not req2.body, \
"Redirected body must be empty, not '%s'" % req2.body
def test_max_redirect_times(self):
self.mw.max_redirect_times = 1
req = Request('http://scrapytest.org/302')
rsp = Response('http://scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
assert 'redirect_times' in req.meta
self.assertEqual(req.meta['redirect_times'], 1)
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
def test_ttl(self):
self.mw.max_redirect_times = 100
req = Request('http://scrapytest.org/302', meta={'redirect_ttl': 1})
rsp = Response('http://www.scrapytest.org/302', headers={'Location': '/redirected'}, status=302)
req = self.mw.process_response(req, rsp, self.spider)
assert isinstance(req, Request)
self.assertRaises(IgnoreRequest, self.mw.process_response, req, rsp, self.spider)
def test_redirect_urls(self):
req1 = Request('http://scrapytest.org/first')
rsp1 = Response('http://scrapytest.org/first', headers={'Location': '/redirected'}, status=302)
req2 = self.mw.process_response(req1, rsp1, self.spider)
rsp2 = Response('http://scrapytest.org/redirected', headers={'Location': '/redirected2'}, status=302)
req3 = self.mw.process_response(req2, rsp2, self.spider)
self.assertEqual(req2.url, 'http://scrapytest.org/redirected')
self.assertEqual(req2.meta['redirect_urls'], ['http://scrapytest.org/first'])
self.assertEqual(req3.url, 'http://scrapytest.org/redirected2')
self.assertEqual(req3.meta['redirect_urls'], ['http://scrapytest.org/first', 'http://scrapytest.org/redirected'])
if __name__ == "__main__":
unittest.main()
|