# Tests for the HTTP cache downloader middleware (HttpCacheMiddleware) and
# its FilesystemCacheStorage backend.
import unittest, tempfile, shutil, time
from scrapy.http import Response, HtmlResponse, Request
from scrapy.spider import BaseSpider
from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware
from scrapy.settings import Settings
from scrapy.exceptions import IgnoreRequest
class HttpCacheMiddlewareTest(unittest.TestCase):
    """Exercise ``HttpCacheMiddleware`` and its cache storage backend.

    ``storage_class`` names the storage implementation under test. Each test
    works against a fresh temporary cache directory that ``setUp`` creates
    and ``tearDown`` removes.
    """

    storage_class = FilesystemCacheStorage

    def setUp(self):
        """Create the spider, the cache directory and the shared fixtures."""
        self.spider = BaseSpider('example.com')
        self.tmpdir = tempfile.mkdtemp()
        self.request = Request(
            'http://www.example.com',
            headers={'User-Agent': 'test'},
        )
        self.response = Response(
            'http://www.example.com',
            headers={'Content-Type': 'text/html'},
            body='test body',
            status=202,
        )

    def tearDown(self):
        """Remove the temporary cache directory created in ``setUp``."""
        shutil.rmtree(self.tmpdir)

    def _get_settings(self, **new_settings):
        """Return a ``Settings`` object for the cache under test.

        The defaults enable the cache, point it at the temporary directory,
        expire entries after one second and ignore no HTTP status codes.
        Keyword arguments override or extend those defaults.
        """
        settings = {
            'HTTPCACHE_ENABLED': True,
            'HTTPCACHE_DIR': self.tmpdir,
            'HTTPCACHE_EXPIRATION_SECS': 1,
            'HTTPCACHE_IGNORE_HTTP_CODES': [],
        }
        settings.update(new_settings)
        return Settings(settings)

    def _get_storage(self, **new_settings):
        """Build a ``storage_class`` instance from the (overridden) settings."""
        return self.storage_class(self._get_settings(**new_settings))

    def _get_middleware(self, **new_settings):
        """Build an ``HttpCacheMiddleware`` from the (overridden) settings."""
        return HttpCacheMiddleware(self._get_settings(**new_settings))

    def test_storage(self):
        """Storage misses, then hits after a store, then misses after expiry."""
        storage = self._get_storage()
        request2 = self.request.copy()
        assert storage.retrieve_response(self.spider, request2) is None
        storage.store_response(self.spider, self.request, self.response)
        response2 = storage.retrieve_response(self.spider, request2)
        assert isinstance(response2, HtmlResponse)  # inferred from content-type header
        self.assertEqualResponse(self.response, response2)
        time.sleep(2)  # wait for cache to expire
        assert storage.retrieve_response(self.spider, request2) is None

    def test_storage_never_expire(self):
        """With ``HTTPCACHE_EXPIRATION_SECS=0`` a stored entry stays retrievable."""
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
        assert storage.retrieve_response(self.spider, self.request) is None
        storage.store_response(self.spider, self.request, self.response)
        time.sleep(0.5)  # give the chance to expire
        # Explicit None check mirrors the miss assertion above instead of
        # relying on the truthiness of the returned response object.
        assert storage.retrieve_response(self.spider, self.request) is not None

    def test_middleware(self):
        """A processed response is served back, flagged as cached."""
        mw = self._get_middleware()
        assert mw.process_request(self.request, self.spider) is None
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        assert isinstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        assert 'cached' in response.flags

    def test_different_request_response_urls(self):
        """Caching works when the response URL differs from the request URL."""
        mw = self._get_middleware()
        req = Request('http://host.com/path')
        res = Response('http://host2.net/test.html')
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        assert isinstance(cached, Response)
        self.assertEqualResponse(res, cached)
        assert 'cached' in cached.flags

    def test_middleware_ignore_missing(self):
        """With ``HTTPCACHE_IGNORE_MISSING`` an uncached request is ignored."""
        mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
        self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        assert isinstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        assert 'cached' in response.flags

    def test_middleware_ignore_schemes(self):
        """Check which request URL schemes are cached and which are skipped."""
        # http responses are cached by default
        req, res = Request('http://test.com/'), Response('http://test.com/')
        mw = self._get_middleware()
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        assert isinstance(cached, Response), type(cached)
        self.assertEqualResponse(res, cached)
        assert 'cached' in cached.flags

        # file response is not cached by default
        req, res = Request('file:///tmp/t.txt'), Response('file:///tmp/t.txt')
        mw = self._get_middleware()
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)
        assert mw.storage.retrieve_response(self.spider, req) is None
        assert mw.process_request(req, self.spider) is None

        # s3 scheme response is cached by default
        req, res = Request('s3://bucket/key'), Response('http://bucket/key')
        mw = self._get_middleware()
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)
        cached = mw.process_request(req, self.spider)
        assert isinstance(cached, Response), type(cached)
        self.assertEqualResponse(res, cached)
        assert 'cached' in cached.flags

        # ignore s3 scheme
        req, res = Request('s3://bucket/key2'), Response('http://bucket/key2')
        mw = self._get_middleware(HTTPCACHE_IGNORE_SCHEMES=['s3'])
        assert mw.process_request(req, self.spider) is None
        mw.process_response(req, res, self.spider)
        assert mw.storage.retrieve_response(self.spider, req) is None
        assert mw.process_request(req, self.spider) is None

    def test_middleware_ignore_http_codes(self):
        """Responses whose status is listed in the ignore setting are not cached."""
        # test response is not cached
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
        assert mw.process_request(self.request, self.spider) is None
        mw.process_response(self.request, self.response, self.spider)
        assert mw.storage.retrieve_response(self.spider, self.request) is None
        assert mw.process_request(self.request, self.spider) is None

        # test response is cached
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
        mw.process_response(self.request, self.response, self.spider)
        response = mw.process_request(self.request, self.spider)
        assert isinstance(response, HtmlResponse)
        self.assertEqualResponse(self.response, response)
        assert 'cached' in response.flags

    def assertEqualResponse(self, response1, response2):
        """Assert that two responses agree on url, status, headers and body."""
        self.assertEqual(response1.url, response2.url)
        self.assertEqual(response1.status, response2.status)
        self.assertEqual(response1.headers, response2.headers)
        self.assertEqual(response1.body, response2.body)
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|