File: httpcache.py

package info (click to toggle)
python-scrapy 2.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (157 lines) | stat: -rw-r--r-- 5,704 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from __future__ import annotations

from email.utils import formatdate
from typing import TYPE_CHECKING

from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed

from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.utils.misc import load_object

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.http.request import Request
    from scrapy.http.response import Response
    from scrapy.settings import Settings
    from scrapy.spiders import Spider
    from scrapy.statscollectors import StatsCollector


class HttpCacheMiddleware:
    DOWNLOAD_EXCEPTIONS = (
        defer.TimeoutError,
        TimeoutError,
        DNSLookupError,
        ConnectionRefusedError,
        ConnectionDone,
        ConnectError,
        ConnectionLost,
        TCPTimedOutError,
        ResponseFailed,
        OSError,
    )

    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        if not settings.getbool("HTTPCACHE_ENABLED"):
            raise NotConfigured
        self.policy = load_object(settings["HTTPCACHE_POLICY"])(settings)
        self.storage = load_object(settings["HTTPCACHE_STORAGE"])(settings)
        self.ignore_missing = settings.getbool("HTTPCACHE_IGNORE_MISSING")
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        assert crawler.stats
        o = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider: Spider) -> None:
        self.storage.open_spider(spider)

    def spider_closed(self, spider: Spider) -> None:
        self.storage.close_spider(spider)

    def process_request(
        self, request: Request, spider: Spider
    ) -> Request | Response | None:
        if request.meta.get("dont_cache", False):
            return None

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta["_dont_cache"] = True  # flag as uncacheable
            return None

        # Look for cached response and check if expired
        cachedresponse: Response | None = self.storage.retrieve_response(
            spider, request
        )
        if cachedresponse is None:
            self.stats.inc_value("httpcache/miss", spider=spider)
            if self.ignore_missing:
                self.stats.inc_value("httpcache/ignore", spider=spider)
                raise IgnoreRequest(f"Ignored request not in cache: {request}")
            return None  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append("cached")
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value("httpcache/hit", spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta["cached_response"] = cachedresponse

        return None

    def process_response(
        self, request: Request, response: Response, spider: Spider
    ) -> Request | Response:
        if request.meta.get("dont_cache", False):
            return response

        # Skip cached responses and uncacheable requests
        if "cached" in response.flags or "_dont_cache" in request.meta:
            request.meta.pop("_dont_cache", None)
            return response

        # RFC2616 requires origin server to set Date header,
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
        if "Date" not in response.headers:
            response.headers["Date"] = formatdate(usegmt=True)

        # Do not validate first-hand responses
        cachedresponse: Response | None = request.meta.pop("cached_response", None)
        if cachedresponse is None:
            self.stats.inc_value("httpcache/firsthand", spider=spider)
            self._cache_response(spider, response, request, cachedresponse)
            return response

        if self.policy.is_cached_response_valid(cachedresponse, response, request):
            self.stats.inc_value("httpcache/revalidate", spider=spider)
            return cachedresponse

        self.stats.inc_value("httpcache/invalidate", spider=spider)
        self._cache_response(spider, response, request, cachedresponse)
        return response

    def process_exception(
        self, request: Request, exception: Exception, spider: Spider
    ) -> Request | Response | None:
        cachedresponse: Response | None = request.meta.pop("cached_response", None)
        if cachedresponse is not None and isinstance(
            exception, self.DOWNLOAD_EXCEPTIONS
        ):
            self.stats.inc_value("httpcache/errorrecovery", spider=spider)
            return cachedresponse
        return None

    def _cache_response(
        self,
        spider: Spider,
        response: Response,
        request: Request,
        cachedresponse: Response | None,
    ) -> None:
        if self.policy.should_cache_response(response, request):
            self.stats.inc_value("httpcache/store", spider=spider)
            self.storage.store_response(spider, request, response)
        else:
            self.stats.inc_value("httpcache/uncacheable", spider=spider)