from email.utils import formatdate
from scrapy import signals
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.utils.misc import load_object


class HttpCacheMiddleware(object):

    def __init__(self, settings, stats):
        if not settings.getbool('HTTPCACHE_ENABLED'):
            raise NotConfigured
        self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        self.storage.open_spider(spider)

    def spider_closed(self, spider):
        self.storage.close_spider(spider)
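
    # The storage backend is duck-typed: besides the open_spider/close_spider
    # hooks above, the middleware expects it to provide
    # retrieve_response(spider, request) -> Response or None (used in
    # process_request) and store_response(spider, request, response)
    # (used in _cache_response).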

    def process_request(self, request, spider):
        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse

    def process_response(self, request, response, spider):
        # Skip cached responses and uncacheable requests
        if 'cached' in response.flags or '_dont_cache' in request.meta:
            request.meta.pop('_dont_cache', None)
            return response

        # RFC2616 requires origin server to set Date header,
        # http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
        if 'Date' not in response.headers:
            response.headers['Date'] = formatdate(usegmt=1)

        # Do not validate first-hand responses
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/firsthand', spider=spider)
            self._cache_response(spider, response, request, cachedresponse)
            return response

        if self.policy.is_cached_response_valid(cachedresponse, response, request):
            self.stats.inc_value('httpcache/revalidate', spider=spider)
            return cachedresponse

        self.stats.inc_value('httpcache/invalidate', spider=spider)
        self._cache_response(spider, response, request, cachedresponse)
        return response

    def _cache_response(self, spider, response, request, cachedresponse):
        if self.policy.should_cache_response(response, request):
            self.stats.inc_value('httpcache/store', spider=spider)
            self.storage.store_response(spider, request, response)
        else:
            self.stats.inc_value('httpcache/uncacheable', spider=spider)
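

# A minimal, illustrative cache policy -- a sketch, not part of this module.
# It spells out the four-method interface that HttpCacheMiddleware calls on
# self.policy above; the class name is an assumption for illustration. This
# permissive variant caches every request and treats every cached response
# as fresh. A class like this is plugged in via the HTTPCACHE_POLICY setting
# (see the example at the end of this file).
class AlwaysCachePolicy(object):

    def __init__(self, settings):
        # A real policy would read its configuration from settings here.
        pass

    def should_cache_request(self, request):
        # Checked in process_request; returning False flags the request
        # with meta['_dont_cache'] so it bypasses the cache entirely.
        return True

    def should_cache_response(self, response, request):
        # Checked in _cache_response before a downloaded response is stored.
        return True

    def is_cached_response_fresh(self, cachedresponse, request):
        # Returning True serves the cached response without a download.
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        # Checked after a revalidation download; True keeps the cached copy
        # and discards the freshly downloaded response.
        return True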


from scrapy.contrib.httpcache import FilesystemCacheStorage as _FilesystemCacheStorage


class FilesystemCacheStorage(_FilesystemCacheStorage):

    def __init__(self, *args, **kwargs):
        import warnings
        from scrapy.exceptions import ScrapyDeprecationWarning
        warnings.warn('Importing FilesystemCacheStorage from '
                      'scrapy.contrib.downloadermiddleware.httpcache is '
                      'deprecated, use scrapy.contrib.httpcache instead.',
                      category=ScrapyDeprecationWarning, stacklevel=2)
        super(FilesystemCacheStorage, self).__init__(*args, **kwargs)
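

# Example configuration (a sketch): the middleware is driven entirely by the
# HTTPCACHE_* settings read in HttpCacheMiddleware.__init__. In a project's
# settings.py this might look like the following; the dotted paths are
# illustrative and should match the policy/storage classes you actually use:
#
#     HTTPCACHE_ENABLED = True
#     HTTPCACHE_POLICY = 'scrapy.contrib.httpcache.DummyPolicy'
#     HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.FilesystemCacheStorage'
#     HTTPCACHE_IGNORE_MISSING = False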