File: referer.py

package info (click to toggle)
python-scrapy 2.14.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,332 kB
  • sloc: python: 55,629; xml: 199; makefile: 25; sh: 7
file content (436 lines) | stat: -rw-r--r-- 16,486 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
"""
RefererMiddleware: populates Request referer field, based on the Response which
originated it.
"""

from __future__ import annotations

import warnings
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, cast
from urllib.parse import urlparse
from warnings import warn

from scrapy.exceptions import NotConfigured
from scrapy.http import Request, Response
from scrapy.spidermiddlewares.base import BaseSpiderMiddleware
from scrapy.utils.misc import load_object
from scrapy.utils.python import _looks_like_import_path, to_unicode
from scrapy.utils.url import strip_url

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.settings import BaseSettings


# URL schemes considered "local". ReferrerPolicy uses this tuple as its
# default NOREFERRER_SCHEMES, i.e. the schemes of originating URLs for which
# no referrer value is produced.
LOCAL_SCHEMES: tuple[str, ...] = (
    "about",
    "blob",
    "data",
    "filesystem",
)

# Names of the referrer policies implemented in this module. Each policy
# class exposes one of these as its ``name`` attribute, and RefererMiddleware
# uses those names as keys of its policy registry.
POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"
# Name of the Scrapy-specific DefaultReferrerPolicy.
POLICY_SCRAPY_DEFAULT = "scrapy-default"


class ReferrerPolicy(ABC):
    """Base class that all referrer policies derive from.

    Subclasses implement :meth:`referrer`; the other methods are shared
    helpers for stripping URLs and classifying their schemes.
    """

    # Schemes of originating URLs for which the ``*_referrer`` helpers return
    # ``None`` (no referrer).
    NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES
    # Policy name; assigned by each concrete subclass.
    name: str

    @abstractmethod
    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the referrer value to use for a request to *request_url*
        that originated from *response_url*, or ``None`` for no referrer."""
        raise NotImplementedError

    def stripped_referrer(self, url: str) -> str | None:
        """Return *url* stripped for use as a referrer, or ``None`` if its
        scheme is listed in :attr:`NOREFERRER_SCHEMES`."""
        if urlparse(url).scheme in self.NOREFERRER_SCHEMES:
            return None
        return self.strip_url(url)

    def origin_referrer(self, url: str) -> str | None:
        """Return the origin of *url* for use as a referrer, or ``None`` if
        its scheme is listed in :attr:`NOREFERRER_SCHEMES`."""
        if urlparse(url).scheme in self.NOREFERRER_SCHEMES:
            return None
        return self.origin(url)

    def strip_url(self, url: str, origin_only: bool = False) -> str | None:
        """Strip *url* as described in
        https://www.w3.org/TR/referrer-policy/#strip-url

        The steps listed by the specification are:

        If url is null, return no referrer.
        If url's scheme is a local scheme, then return no referrer.
        Set url's username to the empty string.
        Set url's password to null.
        Set url's fragment to null.
        If the origin-only flag is true, then:
            Set url's path to null.
            Set url's query to null.
        Return url.

        This method itself only handles the empty-URL case (returning
        ``None``) and otherwise delegates to :func:`scrapy.utils.url.strip_url`
        with credentials, fragment and default port stripped.
        """
        if url:
            return strip_url(
                url,
                strip_credentials=True,
                strip_fragment=True,
                strip_default_port=True,
                origin_only=origin_only,
            )
        return None

    def origin(self, url: str) -> str | None:
        """Return the serialized origin of a request or response URL, i.e.
        *url* stripped with the origin-only flag (no path and no query)."""
        return self.strip_url(url, origin_only=True)

    def potentially_trustworthy(self, url: str) -> bool:
        """Return whether *url* counts as potentially trustworthy: never for
        ``data`` URLs, otherwise whatever :meth:`tls_protected` says."""
        # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
        scheme = urlparse(url).scheme
        if scheme == "data":
            return False
        return self.tls_protected(url)

    def tls_protected(self, url: str) -> bool:
        """Return whether the scheme of *url* is ``https`` or ``ftps``."""
        scheme = urlparse(url).scheme
        return scheme == "https" or scheme == "ftps"


class NoReferrerPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer

    "no-referrer" is the simplest policy: no referrer information is sent
    along with requests made from a particular request client, whatever the
    target origin. The header is omitted entirely.
    """

    name: str = POLICY_NO_REFERRER

    def referrer(self, response_url: str, request_url: str) -> str | None:
        # Both URLs are irrelevant: this policy never yields a referrer.
        return None


class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade

    With "no-referrer-when-downgrade", a full URL is sent along with requests
    from a TLS-protected environment settings object to a potentially
    trustworthy URL, and with requests from clients which are not
    TLS-protected to any origin.

    Requests from TLS-protected clients to non-potentially trustworthy URLs
    carry no referrer information: no Referer HTTP header is sent.

    This is a user agent's default behavior, if no policy is otherwise specified.
    """

    name: str = POLICY_NO_REFERRER_WHEN_DOWNGRADE

    def referrer(self, response_url: str, request_url: str) -> str | None:
        # A downgrade is a TLS-protected response leading to a request that
        # is not TLS-protected; only that case suppresses the referrer.
        is_downgrade = self.tls_protected(response_url) and not self.tls_protected(
            request_url
        )
        if is_downgrade:
            return None
        return self.stripped_referrer(response_url)


class SameOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin

    With "same-origin", a full URL, stripped for use as a referrer, is sent
    as referrer information when making same-origin requests from a
    particular request client.

    Cross-origin requests carry no referrer information: no Referer HTTP
    header is sent.
    """

    name: str = POLICY_SAME_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        # Cross-origin request: nothing is sent.
        if self.origin(response_url) != self.origin(request_url):
            return None
        return self.stripped_referrer(response_url)


class OriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin

    With "origin", only the ASCII serialization of the origin of the request
    client is sent as referrer information, for same-origin and cross-origin
    requests alike.
    """

    name: str = POLICY_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        # The target URL plays no role; only the originating URL's origin is used.
        return self.origin_referrer(response_url)


class StrictOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin

    With "strict-origin", the ASCII serialization of the origin of the
    request client is sent when making requests:
    - from a TLS-protected environment settings object to a potentially trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected request clients to non-potentially
    trustworthy URLs carry no referrer information: no Referer HTTP header
    is sent.
    """

    name: str = POLICY_STRICT_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        # The only case without a referrer: a TLS-protected response leading
        # to a request URL that is not potentially trustworthy.
        if self.tls_protected(response_url) and not self.potentially_trustworthy(
            request_url
        ):
            return None
        return self.origin_referrer(response_url)


class OriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin

    The "origin-when-cross-origin" policy specifies that a full URL,
    stripped for use as a referrer, is sent as referrer information
    when making same-origin requests from a particular request client,
    and only the ASCII serialization of the origin of the request client
    is sent as referrer information when making cross-origin requests
    from a particular request client.

    In both cases no referrer is produced when the scheme of the originating
    URL is listed in ``NOREFERRER_SCHEMES``.
    """

    name: str = POLICY_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url* for same-origin requests and
        only its origin for cross-origin ones; ``None`` if the scheme of
        *response_url* is listed in ``NOREFERRER_SCHEMES``."""
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)
        # Use origin_referrer() rather than the bare origin so that the
        # NOREFERRER_SCHEMES check also applies to cross-origin requests,
        # as it does in the same-origin branch and in the other
        # origin-based policies.
        return self.origin_referrer(response_url)


class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin

    With "strict-origin-when-cross-origin", a full URL, stripped for use as
    a referrer, is sent as referrer information when making same-origin
    requests from a particular request client, and only the ASCII
    serialization of the origin of the request client when making
    cross-origin requests:

    - from a TLS-protected environment settings object to a potentially trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected clients to non-potentially trustworthy URLs
    carry no referrer information: no Referer HTTP header is sent.
    """

    name: str = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        # Same origin: the full (stripped) URL may be sent.
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)
        # Cross origin: send only the origin, unless a TLS-protected response
        # leads to a request URL that is not potentially trustworthy.
        is_downgrade = self.tls_protected(
            response_url
        ) and not self.potentially_trustworthy(request_url)
        return None if is_downgrade else self.origin_referrer(response_url)


class UnsafeUrlPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url

    With "unsafe-url", a full URL, stripped for use as a referrer, is sent
    along with both cross-origin requests and same-origin requests made from
    a particular request client.

    Note: The policy's name doesn't lie; it is unsafe.
    This policy will leak origins and paths from TLS-protected resources
    to insecure origins.
    Carefully consider the impact of setting such a policy for potentially sensitive documents.
    """

    name: str = POLICY_UNSAFE_URL

    def referrer(self, response_url: str, request_url: str) -> str | None:
        # The target URL is never consulted: the stripped originating URL is
        # used regardless of where the request goes.
        return self.stripped_referrer(response_url)


class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
    """
    Scrapy's variant of "no-referrer-when-downgrade": in addition to that
    policy's rules, "Referer" is not sent if the parent request was
    using ``file://`` or ``s3://`` scheme.
    """

    # The local schemes plus the two extra schemes that never yield a referrer.
    NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES + ("file", "s3")
    name: str = POLICY_SCRAPY_DEFAULT


class RefererMiddleware(BaseSpiderMiddleware):
    """Spider middleware that populates the ``Referer`` header of requests.

    The header value is computed from the URL of the response that
    originated the request, according to the referrer policy selected by
    :meth:`policy`.
    """

    def __init__(self, settings: BaseSettings | None = None):  # pylint: disable=super-init-not-called
        """Build the policy registry and select the default policy.

        Without *settings*, only the built-in policies are registered and
        :class:`DefaultReferrerPolicy` is the default.

        With *settings*, the ``REFERRER_POLICIES`` setting (a dict) adds or
        replaces registry entries by name, or removes them when the value is
        ``None``, and the ``REFERRER_POLICY`` setting selects the default
        policy.

        :raises RuntimeError: if ``REFERRER_POLICY`` cannot be resolved to a
            policy class by :meth:`_load_policy_class`.
        """
        self.default_policy: type[ReferrerPolicy] = DefaultReferrerPolicy
        self.policies: dict[str, type[ReferrerPolicy]] = {
            p.name: p
            for p in (
                NoReferrerPolicy,
                NoReferrerWhenDowngradePolicy,
                SameOriginPolicy,
                OriginPolicy,
                StrictOriginPolicy,
                OriginWhenCrossOriginPolicy,
                StrictOriginWhenCrossOriginPolicy,
                UnsafeUrlPolicy,
                DefaultReferrerPolicy,
            )
        }
        # Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
        self.policies[""] = NoReferrerWhenDowngradePolicy
        if settings is None:
            return
        setting_policies = settings.getdict("REFERRER_POLICIES")
        for policy_name, policy_class_import_path in setting_policies.items():
            if policy_class_import_path is None:
                # pop() instead of del: disabling a name that is not
                # registered is a no-op rather than a KeyError.
                self.policies.pop(policy_name, None)
            else:
                self.policies[policy_name] = load_object(policy_class_import_path)
        settings_policy = self._load_policy_class(
            settings.get("REFERRER_POLICY"), allow_import_path=True
        )
        # _load_policy_class() can only return None when warning_only=True,
        # so this merely narrows the type for static checkers.
        assert settings_policy
        self.default_policy = settings_policy

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """Create the middleware from *crawler* settings.

        :raises NotConfigured: if the ``REFERER_ENABLED`` setting is false.
        """
        if not crawler.settings.getbool("REFERER_ENABLED"):
            raise NotConfigured
        return cls(crawler.settings)

    def policy(
        self,
        response: Response | str | None = None,
        request: Request | None = None,
        **kwargs,
    ) -> ReferrerPolicy:
        """Return the referrer policy to use for *request* based on *request*
        meta, *response* and settings.

        - if a valid policy is set in Request meta, it is used.
        - if the policy is set in meta but is wrong (e.g. a typo error), the
          policy from settings is used
        - if the policy is not set in Request meta, but there is a
          Referrer-Policy header in the parent response, it is used if valid
        - otherwise, the policy from settings is used.

        *response* may also be passed through the deprecated ``resp_or_url``
        keyword argument, and may be a URL string (deprecated), in which
        case no response header is consulted.

        :raises TypeError: if *response* or *request* is missing, if both
            ``response`` and ``resp_or_url`` are passed, or if any other
            unexpected keyword argument is passed.
        """
        if "resp_or_url" in kwargs:
            if response is not None:
                raise TypeError("Cannot pass both 'response' and 'resp_or_url'")
            response = kwargs.pop("resp_or_url")
            warn(
                "Passing 'resp_or_url' is deprecated, use 'response' instead.",
                DeprecationWarning,
                stacklevel=2,
            )
        if kwargs:
            # **kwargs only exists to accept the deprecated 'resp_or_url'
            # alias; anything else is a caller error and must not be
            # silently ignored.
            unexpected = ", ".join(repr(key) for key in kwargs)
            raise TypeError(
                f"policy() got unexpected keyword argument(s): {unexpected}"
            )
        if response is None:
            raise TypeError("Missing required argument: 'response'")
        if request is None:
            raise TypeError("Missing required argument: 'request'")
        if isinstance(response, str):
            warn(
                "Passing a response URL to RefererMiddleware.policy() instead "
                "of a Response object is deprecated.",
                DeprecationWarning,
                stacklevel=2,
            )
        # A value from request meta may be an import path; a value taken from
        # the response header may not (see below).
        allow_import_path = True
        policy_name = request.meta.get("referrer_policy")
        if policy_name is None and isinstance(response, Response):
            policy_header = response.headers.get("Referrer-Policy")
            if policy_header is not None:
                policy_name = to_unicode(policy_header, encoding="latin1")
                # The header value comes from the remote server, so it must
                # never be interpreted as an import path.
                allow_import_path = False
        if policy_name is None:
            return self.default_policy()
        cls = self._load_policy_class(
            policy_name, warning_only=True, allow_import_path=allow_import_path
        )
        return cls() if cls else self.default_policy()

    def _load_policy_class(
        self,
        policy: str,
        warning_only: bool = False,
        *,
        allow_import_path: bool = False,
    ) -> type[ReferrerPolicy] | None:
        """Load the :class:`ReferrerPolicy` class to use for *policy*.

        *policy* may be any of the following:

        -   A standard policy name, e.g. ``"no-referrer"``,
            ``"origin-when-cross-origin"``, etc.

        -   The special ``"scrapy-default"`` policy.

        -   The import path of a :class:`ReferrerPolicy` subclass, e.g.
            ``"scrapy.spidermiddlewares.referer.NoReferrerPolicy"`` or
            ``"myproject.policies.CustomReferrerPolicy"``.

        *policy* may also be a comma-separated list of policy names, as in a
        ``Referrer-Policy`` header; names are matched case-insensitively
        against the registered policies and the last recognized one wins.

        If *warning_only* is ``False`` (default) and *policy* cannot be turned
        into a :class:`ReferrerPolicy` subclass, a :exc:`RuntimeError` is
        raised. If *warning_only* is ``True``, a :exc:`RuntimeWarning` is
        issued and ``None`` is returned instead.

        If *allow_import_path* is ``False`` (default), import paths are not
        allowed, resulting in :exc:`RuntimeError` or ``None``. If ``True``,
        they are allowed. Use ``True`` only if you trust the source of the
        *policy* value.
        """
        if allow_import_path:
            # NOTE(review): only ValueError from load_object() is treated as
            # "not an import path"; any other exception it raises propagates
            # to the caller even when warning_only is True. Confirm this is
            # intended.
            try:
                return cast("type[ReferrerPolicy]", load_object(policy))
            except ValueError:
                pass
        policy_names = [
            policy_name.strip() for policy_name in policy.lower().split(",")
        ]
        # https://www.w3.org/TR/referrer-policy/#parse-referrer-policy-from-header
        # Scan from the end so that the last recognized name takes precedence.
        for policy_name in reversed(policy_names):
            if policy_name in self.policies:
                return self.policies[policy_name]
        msg = f"Could not load referrer policy {policy!r}"
        if not allow_import_path and _looks_like_import_path(policy):
            msg += " (import paths from the response Referrer-Policy header are not allowed)"
        if not warning_only:
            raise RuntimeError(msg)
        warnings.warn(msg, RuntimeWarning)
        return None

    def get_processed_request(
        self, request: Request, response: Response | None
    ) -> Request | None:
        """Set the ``Referer`` header of *request* and return it.

        Requests without a parent *response* (start requests) are returned
        untouched. A ``Referer`` header already present on *request* is kept
        (``setdefault``), and no header is added when the selected policy
        yields no referrer.
        """
        if response is None:
            # start requests
            return request
        referrer = self.policy(response, request).referrer(response.url, request.url)
        if referrer is not None:
            request.headers.setdefault("Referer", referrer)
        return request