File: referer.py

package info (click to toggle)
python-scrapy 1.5.1-1%2Bdeb10u1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 4,404 kB
  • sloc: python: 25,793; xml: 199; makefile: 95; sh: 33
file content (360 lines) | stat: -rw-r--r-- 13,626 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
"""
RefererMiddleware: populates Request referer field, based on the Response which
originated it.
"""
from six.moves.urllib.parse import urlparse
import warnings

from w3lib.url import safe_url_string

from scrapy.http import Request, Response
from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.python import to_native_str
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object
from scrapy.utils.url import strip_url


LOCAL_SCHEMES = ('about', 'blob', 'data', 'filesystem',)

POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"
POLICY_SCRAPY_DEFAULT = "scrapy-default"


class ReferrerPolicy(object):

    NOREFERRER_SCHEMES = LOCAL_SCHEMES

    def referrer(self, response_url, request_url):
        raise NotImplementedError()

    def stripped_referrer(self, url):
        if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
            return self.strip_url(url)

    def origin_referrer(self, url):
        if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
            return self.origin(url)

    def strip_url(self, url, origin_only=False):
        """
        https://www.w3.org/TR/referrer-policy/#strip-url

        If url is null, return no referrer.
        If url's scheme is a local scheme, then return no referrer.
        Set url's username to the empty string.
        Set url's password to null.
        Set url's fragment to null.
        If the origin-only flag is true, then:
            Set url's path to null.
            Set url's query to null.
        Return url.
        """
        if not url:
            return None
        return strip_url(url,
                         strip_credentials=True,
                         strip_fragment=True,
                         strip_default_port=True,
                         origin_only=origin_only)

    def origin(self, url):
        """Return serialized origin (scheme, host, path) for a request or response URL."""
        return self.strip_url(url, origin_only=True)

    def potentially_trustworthy(self, url):
        # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
        parsed_url = urlparse(url)
        if parsed_url.scheme in ('data',):
            return False
        return self.tls_protected(url)

    def tls_protected(self, url):
        return urlparse(url).scheme in ('https', 'ftps')


class NoReferrerPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer

    The simplest policy is "no-referrer", which specifies that no referrer information
    is to be sent along with requests made from a particular request client to any origin.
    The header will be omitted entirely.
    """
    name = POLICY_NO_REFERRER

    def referrer(self, response_url, request_url):
        return None


class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade

    The "no-referrer-when-downgrade" policy sends a full URL along with requests
    from a TLS-protected environment settings object to a potentially trustworthy URL,
    and requests from clients which are not TLS-protected to any origin.

    Requests from TLS-protected clients to non-potentially trustworthy URLs,
    on the other hand, will contain no referrer information.
    A Referer HTTP header will not be sent.

    This is a user agent's default behavior, if no policy is otherwise specified.
    """
    name = POLICY_NO_REFERRER_WHEN_DOWNGRADE

    def referrer(self, response_url, request_url):
        if not self.tls_protected(response_url) or self.tls_protected(request_url):
            return self.stripped_referrer(response_url)


class SameOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin

    The "same-origin" policy specifies that a full URL, stripped for use as a referrer,
    is sent as referrer information when making same-origin requests from a particular request client.

    Cross-origin requests, on the other hand, will contain no referrer information.
    A Referer HTTP header will not be sent.
    """
    name = POLICY_SAME_ORIGIN

    def referrer(self, response_url, request_url):
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)


class OriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin

    The "origin" policy specifies that only the ASCII serialization
    of the origin of the request client is sent as referrer information
    when making both same-origin requests and cross-origin requests
    from a particular request client.
    """
    name = POLICY_ORIGIN

    def referrer(self, response_url, request_url):
        return self.origin_referrer(response_url)


class StrictOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin

    The "strict-origin" policy sends the ASCII serialization
    of the origin of the request client when making requests:
    - from a TLS-protected environment settings object to a potentially trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected request clients to non- potentially trustworthy URLs,
    on the other hand, will contain no referrer information.
    A Referer HTTP header will not be sent.
    """
    name = POLICY_STRICT_ORIGIN

    def referrer(self, response_url, request_url):
        if ((self.tls_protected(response_url) and
             self.potentially_trustworthy(request_url))
            or not self.tls_protected(response_url)):
            return self.origin_referrer(response_url)


class OriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin

    The "origin-when-cross-origin" policy specifies that a full URL,
    stripped for use as a referrer, is sent as referrer information
    when making same-origin requests from a particular request client,
    and only the ASCII serialization of the origin of the request client
    is sent as referrer information when making cross-origin requests
    from a particular request client.
    """
    name = POLICY_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url, request_url):
        origin = self.origin(response_url)
        if origin == self.origin(request_url):
            return self.stripped_referrer(response_url)
        else:
            return origin


class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin

    The "strict-origin-when-cross-origin" policy specifies that a full URL,
    stripped for use as a referrer, is sent as referrer information
    when making same-origin requests from a particular request client,
    and only the ASCII serialization of the origin of the request client
    when making cross-origin requests:

    - from a TLS-protected environment settings object to a potentially trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected clients to non- potentially trustworthy URLs,
    on the other hand, will contain no referrer information.
    A Referer HTTP header will not be sent.
    """
    name = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url, request_url):
        origin = self.origin(response_url)
        if origin == self.origin(request_url):
            return self.stripped_referrer(response_url)
        elif ((self.tls_protected(response_url) and
               self.potentially_trustworthy(request_url))
              or not self.tls_protected(response_url)):
            return self.origin_referrer(response_url)


class UnsafeUrlPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url

    The "unsafe-url" policy specifies that a full URL, stripped for use as a referrer,
    is sent along with both cross-origin requests
    and same-origin requests made from a particular request client.

    Note: The policy's name doesn't lie; it is unsafe.
    This policy will leak origins and paths from TLS-protected resources
    to insecure origins.
    Carefully consider the impact of setting such a policy for potentially sensitive documents.
    """
    name = POLICY_UNSAFE_URL

    def referrer(self, response_url, request_url):
        return self.stripped_referrer(response_url)


class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
    """
    A variant of "no-referrer-when-downgrade",
    with the addition that "Referer" is not sent if the parent request was
    using ``file://`` or ``s3://`` scheme.
    """
    NOREFERRER_SCHEMES = LOCAL_SCHEMES + ('file', 's3')
    name = POLICY_SCRAPY_DEFAULT


_policy_classes = {p.name: p for p in (
    NoReferrerPolicy,
    NoReferrerWhenDowngradePolicy,
    SameOriginPolicy,
    OriginPolicy,
    StrictOriginPolicy,
    OriginWhenCrossOriginPolicy,
    StrictOriginWhenCrossOriginPolicy,
    UnsafeUrlPolicy,
    DefaultReferrerPolicy,
)}

# Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
_policy_classes[''] = NoReferrerWhenDowngradePolicy


def _load_policy_class(policy, warning_only=False):
    """
    Expect a string for the path to the policy class,
    otherwise try to interpret the string as a standard value
    from https://www.w3.org/TR/referrer-policy/#referrer-policies
    """
    try:
        return load_object(policy)
    except ValueError:
        try:
            return _policy_classes[policy.lower()]
        except KeyError:
            msg = "Could not load referrer policy %r" % policy
            if not warning_only:
                raise RuntimeError(msg)
            else:
                warnings.warn(msg, RuntimeWarning)
                return None


class RefererMiddleware(object):

    def __init__(self, settings=None):
        self.default_policy = DefaultReferrerPolicy
        if settings is not None:
            self.default_policy = _load_policy_class(
                settings.get('REFERRER_POLICY'))

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('REFERER_ENABLED'):
            raise NotConfigured
        mw = cls(crawler.settings)

        # Note: this hook is a bit of a hack to intercept redirections
        crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)

        return mw

    def policy(self, resp_or_url, request):
        """
        Determine Referrer-Policy to use from a parent Response (or URL),
        and a Request to be sent.

        - if a valid policy is set in Request meta, it is used.
        - if the policy is set in meta but is wrong (e.g. a typo error),
          the policy from settings is used
        - if the policy is not set in Request meta,
          but there is a Referrer-policy header in the parent response,
          it is used if valid
        - otherwise, the policy from settings is used.
        """
        policy_name = request.meta.get('referrer_policy')
        if policy_name is None:
            if isinstance(resp_or_url, Response):
                policy_header = resp_or_url.headers.get('Referrer-Policy')
                if policy_header is not None:
                    policy_name = to_native_str(policy_header.decode('latin1'))
        if policy_name is None:
            return self.default_policy()

        cls = _load_policy_class(policy_name, warning_only=True)
        return cls() if cls else self.default_policy()

    def process_spider_output(self, response, result, spider):
        def _set_referer(r):
            if isinstance(r, Request):
                referrer = self.policy(response, r).referrer(response.url, r.url)
                if referrer is not None:
                    r.headers.setdefault('Referer', referrer)
            return r
        return (_set_referer(r) for r in result or ())

    def request_scheduled(self, request, spider):
        # check redirected request to patch "Referer" header if necessary
        redirected_urls = request.meta.get('redirect_urls', [])
        if redirected_urls:
            request_referrer = request.headers.get('Referer')
            # we don't patch the referrer value if there is none
            if request_referrer is not None:
                # the request's referrer header value acts as a surrogate
                # for the parent response URL
                #
                # Note: if the 3xx response contained a Referrer-Policy header,
                #       the information is not available using this hook
                parent_url = safe_url_string(request_referrer)
                policy_referrer = self.policy(parent_url, request).referrer(
                    parent_url, request.url)
                if policy_referrer != request_referrer:
                    if policy_referrer is None:
                        request.headers.pop('Referer')
                    else:
                        request.headers['Referer'] = policy_referrer