File: test_downloadermiddleware_httpcache.py

package info (click to toggle)
python-scrapy 0.14.4-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 3,064 kB
  • sloc: python: 19,468; xml: 199; sh: 134; makefile: 67
file content (147 lines) | stat: -rw-r--r-- 6,733 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import unittest, tempfile, shutil, time

from scrapy.http import Response, HtmlResponse, Request
from scrapy.spider import BaseSpider
from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage, HttpCacheMiddleware
from scrapy.settings import Settings
from scrapy.exceptions import IgnoreRequest


class HttpCacheMiddlewareTest(unittest.TestCase):
    """Tests for ``HttpCacheMiddleware`` and the cache storage backend.

    Every test runs against a fresh temporary cache directory that is
    created in ``setUp`` and removed in ``tearDown``.

    Checks go through ``self.assertTrue``/``self.assertEqual`` rather than
    bare ``assert`` statements: ``assert`` is compiled away under
    ``python -O``, which would let every test pass without checking
    anything.  Only ``assertTrue``/``assertEqual``/``assertRaises`` are used
    because the richer helpers (``assertIsNone``, ``assertIn``, ...) need
    Python 2.7 or newer.
    """

    # Storage backend instantiated by _get_storage().
    storage_class = FilesystemCacheStorage

    def setUp(self):
        """Build the spider/request/response fixtures and the cache dir."""
        self.spider = BaseSpider('example.com')
        self.request = Request('http://www.example.com', headers={'User-Agent': 'test'})
        self.response = Response('http://www.example.com', headers={'Content-Type': 'text/html'}, body='test body', status=202)
        # Created last: unittest does not call tearDown() when setUp()
        # raises, so nothing that can fail should run once the directory
        # exists, otherwise it would be left behind.
        self.tmpdir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the temporary cache directory."""
        shutil.rmtree(self.tmpdir)

    def _get_settings(self, **new_settings):
        """Return a ``Settings`` object holding the test defaults.

        Keyword arguments override or extend the defaults below.
        """
        settings = {
            'HTTPCACHE_ENABLED': True,
            'HTTPCACHE_DIR': self.tmpdir,
            'HTTPCACHE_EXPIRATION_SECS': 1,
            'HTTPCACHE_IGNORE_HTTP_CODES': [],
        }
        settings.update(new_settings)
        return Settings(settings)

    def _get_storage(self, **new_settings):
        """Instantiate ``storage_class`` with the test settings."""
        return self.storage_class(self._get_settings(**new_settings))

    def _get_middleware(self, **new_settings):
        """Instantiate ``HttpCacheMiddleware`` with the test settings."""
        return HttpCacheMiddleware(self._get_settings(**new_settings))

    def _assert_cached(self, mw, request, expected, response_class=Response):
        """Check that ``mw`` answers ``request`` from its cache.

        The response returned by ``process_request`` must be an instance of
        ``response_class``, be equal to ``expected`` (see
        ``assertEqualResponse``) and carry the ``'cached'`` flag.
        """
        cached = mw.process_request(request, self.spider)
        self.assertTrue(isinstance(cached, response_class), type(cached))
        self.assertEqualResponse(expected, cached)
        self.assertTrue('cached' in cached.flags)

    def _assert_not_cached(self, mw, request):
        """Check that nothing is stored for ``request`` in ``mw``.

        Both the storage lookup and ``process_request`` must return None.
        """
        self.assertTrue(mw.storage.retrieve_response(self.spider, request) is None)
        self.assertTrue(mw.process_request(request, self.spider) is None)

    def test_storage(self):
        """A stored response can be retrieved until the cache expires."""
        storage = self._get_storage()
        request2 = self.request.copy()
        self.assertTrue(storage.retrieve_response(self.spider, request2) is None)
        storage.store_response(self.spider, self.request, self.response)
        response2 = storage.retrieve_response(self.spider, request2)
        # HtmlResponse is inferred from the content-type header
        self.assertTrue(isinstance(response2, HtmlResponse), type(response2))
        self.assertEqualResponse(self.response, response2)
        time.sleep(2) # wait for cache to expire
        self.assertTrue(storage.retrieve_response(self.spider, request2) is None)

    def test_storage_never_expire(self):
        """With HTTPCACHE_EXPIRATION_SECS=0 a stored response stays available."""
        storage = self._get_storage(HTTPCACHE_EXPIRATION_SECS=0)
        self.assertTrue(storage.retrieve_response(self.spider, self.request) is None)
        storage.store_response(self.spider, self.request, self.response)
        time.sleep(0.5) # give the chance to expire
        self.assertTrue(storage.retrieve_response(self.spider, self.request))

    def test_middleware(self):
        """A processed response is served from cache on the next request."""
        mw = self._get_middleware()
        self.assertTrue(mw.process_request(self.request, self.spider) is None)
        mw.process_response(self.request, self.response, self.spider)
        self._assert_cached(mw, self.request, self.response, HtmlResponse)

    def test_different_request_response_urls(self):
        """Caching works when the response URL differs from the request URL."""
        mw = self._get_middleware()
        req = Request('http://host.com/path')
        res = Response('http://host2.net/test.html')
        self.assertTrue(mw.process_request(req, self.spider) is None)
        mw.process_response(req, res, self.spider)
        self._assert_cached(mw, req, res)

    def test_middleware_ignore_missing(self):
        """HTTPCACHE_IGNORE_MISSING raises IgnoreRequest for uncached requests."""
        mw = self._get_middleware(HTTPCACHE_IGNORE_MISSING=True)
        self.assertRaises(IgnoreRequest, mw.process_request, self.request, self.spider)
        mw.process_response(self.request, self.response, self.spider)
        self._assert_cached(mw, self.request, self.response, HtmlResponse)

    def test_middleware_ignore_schemes(self):
        """Responses are cached or skipped depending on the request scheme."""
        # http responses are cached by default
        req, res = Request('http://test.com/'), Response('http://test.com/')
        mw = self._get_middleware()
        self.assertTrue(mw.process_request(req, self.spider) is None)
        mw.process_response(req, res, self.spider)
        self._assert_cached(mw, req, res)

        # file response is not cached by default
        req, res = Request('file:///tmp/t.txt'), Response('file:///tmp/t.txt')
        mw = self._get_middleware()
        self.assertTrue(mw.process_request(req, self.spider) is None)
        mw.process_response(req, res, self.spider)
        self._assert_not_cached(mw, req)

        # s3 scheme response is cached by default
        req, res = Request('s3://bucket/key'), Response('http://bucket/key')
        mw = self._get_middleware()
        self.assertTrue(mw.process_request(req, self.spider) is None)
        mw.process_response(req, res, self.spider)
        self._assert_cached(mw, req, res)

        # ignore s3 scheme
        req, res = Request('s3://bucket/key2'), Response('http://bucket/key2')
        mw = self._get_middleware(HTTPCACHE_IGNORE_SCHEMES=['s3'])
        self.assertTrue(mw.process_request(req, self.spider) is None)
        mw.process_response(req, res, self.spider)
        self._assert_not_cached(mw, req)

    def test_middleware_ignore_http_codes(self):
        """Responses whose status is in HTTPCACHE_IGNORE_HTTP_CODES are skipped."""
        # test response is not cached (fixture response has status 202)
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[202])
        self.assertTrue(mw.process_request(self.request, self.spider) is None)
        mw.process_response(self.request, self.response, self.spider)
        self._assert_not_cached(mw, self.request)

        # test response is cached
        mw = self._get_middleware(HTTPCACHE_IGNORE_HTTP_CODES=[203])
        mw.process_response(self.request, self.response, self.spider)
        self._assert_cached(mw, self.request, self.response, HtmlResponse)

    def assertEqualResponse(self, response1, response2):
        """Assert both responses have equal url, status, headers and body."""
        self.assertEqual(response1.url, response2.url)
        self.assertEqual(response1.status, response2.status)
        self.assertEqual(response1.headers, response2.headers)
        self.assertEqual(response1.body, response2.body)

# Run the tests with unittest's runner when this module is executed
# directly as a script; does nothing on import.
if __name__ == '__main__':
    unittest.main()