File: httpcache.py

package info
python-scrapy 0.8-3
  • area: main
  • in suites: squeeze
  • size: 2,904 kB
  • ctags: 2,981
  • sloc: python: 15,349; xml: 199; makefile: 68; sql: 64; sh: 34
file content (122 lines) | stat: -rw-r--r-- 4,604 bytes
from __future__ import with_statement

import os
from os.path import join, exists
from time import time
import cPickle as pickle

from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from scrapy.http import Headers
from scrapy.core.exceptions import NotConfigured, IgnoreRequest
from scrapy.core.downloader.responsetypes import responsetypes
from scrapy.utils.request import request_fingerprint
from scrapy.utils.http import headers_dict_to_raw, headers_raw_to_dict
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object
from scrapy import conf


class HttpCacheMiddleware(object):
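    """Downloader middleware that serves responses from a cache storage
    backend (configured via the HTTPCACHE_STORAGE setting) and stores
    freshly downloaded responses back into it."""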

    def __init__(self, settings=conf.settings):
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.storage.open_spider(spider)

    def spider_closed(self, spider):
        self.storage.close_spider(spider)

    def process_request(self, request, spider):
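        # Try to serve the response from the cache; on a miss the download
        # proceeds normally unless HTTPCACHE_IGNORE_MISSING is enabled.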
        if not self.is_cacheable(request):
            return
        response = self.storage.retrieve_response(spider, request)
        if response:
            response.flags.append('cached')
            return response
        elif self.ignore_missing:
            raise IgnoreRequest("Ignored request not in cache: %s" % request)

    def process_response(self, request, response, spider):
        if self.is_cacheable(request):
            self.storage.store_response(spider, request, response)
        return response

    def is_cacheable(self, request):
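        # Only plain http/https requests are cached; other schemes are skipped.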
        return urlparse_cached(request).scheme in ['http', 'https']


class FilesystemCacheStorage(object):
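    """Storage backend that keeps each cached request/response pair as a
    set of flat files in a directory tree under HTTPCACHE_DIR."""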

    def __init__(self, settings=conf.settings):
        cachedir = settings['HTTPCACHE_DIR']
        if not cachedir:
            raise NotConfigured
        self.cachedir = cachedir
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return # not cached
        rpath = self._get_request_path(spider, request)
        with open(join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with open(join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata['url']
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'timestamp': time(),
        }
        with open(join(rpath, 'meta'), 'wb') as f:
            f.write(repr(metadata))
        with open(join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)
        with open(join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with open(join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with open(join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with open(join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)

    def _get_request_path(self, spider, request):
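        # One directory per request:
        # <HTTPCACHE_DIR>/<spider domain>/<first two chars of fingerprint>/<fingerprint>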
        key = request_fingerprint(request)
        return join(self.cachedir, spider.domain_name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = join(rpath, 'pickled_meta')
        if not exists(metapath):
            return # not found
        mtime = os.stat(rpath).st_mtime
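        # The entry is considered expired when a non-negative
        # HTTPCACHE_EXPIRATION_SECS has elapsed since the entry directory's mtime.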
        if 0 <= self.expiration_secs < time() - mtime:
            return # expired
        with open(metapath, 'rb') as f:
            return pickle.load(f)
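
The settings this file reads are HTTPCACHE_STORAGE, HTTPCACHE_IGNORE_MISSING, HTTPCACHE_DIR and HTTPCACHE_EXPIRATION_SECS. A minimal settings.py sketch using them is shown below; the directory path, the numeric values and the dotted path to FilesystemCacheStorage are illustrative assumptions, not values taken from this release.

# settings.py -- illustrative sketch, not verbatim from python-scrapy 0.8-3
HTTPCACHE_DIR = '/tmp/scrapy-httpcache'   # required: FilesystemCacheStorage raises NotConfigured without it
HTTPCACHE_EXPIRATION_SECS = 86400         # entries older than this are treated as expired
HTTPCACHE_IGNORE_MISSING = False          # if True, requests not found in the cache raise IgnoreRequest
# The dotted path below is an assumption about where this module lives in the 0.8 tree:
HTTPCACHE_STORAGE = 'scrapy.contrib.downloadermiddleware.httpcache.FilesystemCacheStorage'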