File: CVE-2022-0577.patch

package info (click to toggle)
python-scrapy 1.5.1-1%2Bdeb10u1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 4,404 kB
  • sloc: python: 25,793; xml: 199; makefile: 95; sh: 33
file content (262 lines) | stat: -rw-r--r-- 9,505 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
From: =?utf-8?q?Adri=C3=A1n_Chaves?= <adrian@chaves.io>
Date: Tue, 1 Mar 2022 12:26:05 +0100
Subject: Merge pull request from GHSA-cjvr-mfj7-j4j8

* Do not carry over cookies to a different domain on redirect

* Cover the cookie-domain redirect fix in the release notes

* Cover 1.8.2 in the release notes

* Fix redirect Cookie handling when the cookie middleware is disabled

* Update the 1.8.2 release date

Fixes CVE-2022-0577
Origin: upstream, https://github.com/scrapy/scrapy/commit/8ce01b3b76d4634f55067d6cfdf632ec70ba304a
Bug-Debian: https://bugs.debian.org/1008234
---
 scrapy/downloadermiddlewares/redirect.py   |  30 +++++-
 tests/test_downloadermiddleware_cookies.py | 153 +++++++++++++++++++++++++++++
 2 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py
index 30cae3f..b65cd19 100644
--- a/scrapy/downloadermiddlewares/redirect.py
+++ b/scrapy/downloadermiddlewares/redirect.py
@@ -4,12 +4,27 @@ from six.moves.urllib.parse import urljoin
 from w3lib.url import safe_url_string
 
 from scrapy.http import HtmlResponse
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.response import get_meta_refresh
 from scrapy.exceptions import IgnoreRequest, NotConfigured
 
 logger = logging.getLogger(__name__)
 
 
+def _build_redirect_request(source_request, url, **kwargs):
+    redirect_request = source_request.replace(
+        url=url,
+        cookies=None,
+        **kwargs
+    )
+    if 'Cookie' in redirect_request.headers:
+        source_request_netloc = urlparse_cached(source_request).netloc
+        redirect_request_netloc = urlparse_cached(redirect_request).netloc
+        if source_request_netloc != redirect_request_netloc:
+            del redirect_request.headers['Cookie']
+    return redirect_request
+
+
 class BaseRedirectMiddleware(object):
 
     enabled_setting = 'REDIRECT_ENABLED'
@@ -46,10 +61,15 @@ class BaseRedirectMiddleware(object):
             raise IgnoreRequest("max redirections reached")
 
     def _redirect_request_using_get(self, request, redirect_url):
-        redirected = request.replace(url=redirect_url, method='GET', body='')
-        redirected.headers.pop('Content-Type', None)
-        redirected.headers.pop('Content-Length', None)
-        return redirected
+        redirect_request = _build_redirect_request(
+            request,
+            url=redirect_url,
+            method='GET',
+            body='',
+        )
+        redirect_request.headers.pop('Content-Type', None)
+        redirect_request.headers.pop('Content-Length', None)
+        return redirect_request
 
 
 class RedirectMiddleware(BaseRedirectMiddleware):
@@ -73,7 +93,7 @@ class RedirectMiddleware(BaseRedirectMiddleware):
         redirected_url = urljoin(request.url, location)
 
         if response.status in (301, 307, 308) or request.method == 'HEAD':
-            redirected = request.replace(url=redirected_url)
+            redirected = _build_redirect_request(request, url=redirected_url)
             return self._redirect(redirected, request, spider, response.status)
 
         redirected = self._redirect_request_using_get(request, redirected_url)
diff --git a/tests/test_downloadermiddleware_cookies.py b/tests/test_downloadermiddleware_cookies.py
index 17801e5..60ceb46 100644
--- a/tests/test_downloadermiddleware_cookies.py
+++ b/tests/test_downloadermiddleware_cookies.py
@@ -3,7 +3,9 @@ import logging
 from unittest import TestCase
 from testfixtures import LogCapture
 
+from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
 from scrapy.http import Response, Request
+from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.utils.test import get_crawler
 from scrapy.exceptions import NotConfigured
@@ -21,9 +23,11 @@ class CookiesMiddlewareTest(TestCase):
     def setUp(self):
         self.spider = Spider('foo')
         self.mw = CookiesMiddleware()
+        self.redirect_middleware = RedirectMiddleware(settings=Settings())
 
     def tearDown(self):
         del self.mw
+        del self.redirect_middleware
 
     def test_basic(self):
         req = Request('http://scrapytest.org/')
@@ -229,3 +233,152 @@ class CookiesMiddlewareTest(TestCase):
         assert self.mw.process_request(request, self.spider) is None
         self.assertIn('Cookie', request.headers)
         self.assertEqual(b'currencyCookie=USD', request.headers['Cookie'])
+
+    def _test_cookie_redirect(
+        self,
+        source,
+        target,
+        cookies1,
+        cookies2,
+    ):
+        input_cookies = {'a': 'b'}
+
+        if not isinstance(source, dict):
+            source = {'url': source}
+        if not isinstance(target, dict):
+            target = {'url': target}
+        target.setdefault('status', 301)
+
+        request1 = Request(cookies=input_cookies, **source)
+        self.mw.process_request(request1, self.spider)
+        cookies = request1.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies1 else None)
+
+        response = Response(
+            headers={
+                'Location': target['url'],
+            },
+            **target
+        )
+        self.assertEqual(
+            self.mw.process_response(request1, response, self.spider),
+            response,
+        )
+
+        request2 = self.redirect_middleware.process_response(
+            request1,
+            response,
+            self.spider,
+        )
+        self.assertIsInstance(request2, Request)
+
+        self.mw.process_request(request2, self.spider)
+        cookies = request2.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+    def test_cookie_redirect_same_domain(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            'https://toscrape.com',
+            cookies1=True,
+            cookies2=True,
+        )
+
+    def test_cookie_redirect_same_domain_forcing_get(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            {'url': 'https://toscrape.com', 'status': 302},
+            cookies1=True,
+            cookies2=True,
+        )
+
+    def test_cookie_redirect_different_domain(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            'https://example.com',
+            cookies1=True,
+            cookies2=False,
+        )
+
+    def test_cookie_redirect_different_domain_forcing_get(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            {'url': 'https://example.com', 'status': 302},
+            cookies1=True,
+            cookies2=False,
+        )
+
+    def _test_cookie_header_redirect(
+        self,
+        source,
+        target,
+        cookies2,
+    ):
+        """Test the handling of a user-defined Cookie header when building a
+        redirect follow-up request.
+
+        We follow RFC 6265 for cookie handling. The Cookie header can only
+        contain a list of key-value pairs (i.e. no additional cookie
+        parameters like Domain or Path). Because of that, we follow the same
+        rules that we would follow for the handling of the Set-Cookie response
+        header when the Domain is not set: the cookies must be limited to the
+        target URL domain (not even subdomains can receive those cookies).
+
+        .. note:: This method tests the scenario where the cookie middleware is
+                  disabled. Because of known issue #1992, when the cookies
+                  middleware is enabled we do not need to be concerned about
+                  the Cookie header getting leaked to unintended domains,
+                  because the middleware empties the header from every request.
+        """
+        if not isinstance(source, dict):
+            source = {'url': source}
+        if not isinstance(target, dict):
+            target = {'url': target}
+        target.setdefault('status', 301)
+
+        request1 = Request(headers={'Cookie': b'a=b'}, **source)
+
+        response = Response(
+            headers={
+                'Location': target['url'],
+            },
+            **target
+        )
+
+        request2 = self.redirect_middleware.process_response(
+            request1,
+            response,
+            self.spider,
+        )
+        self.assertIsInstance(request2, Request)
+
+        cookies = request2.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+    def test_cookie_header_redirect_same_domain(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            'https://toscrape.com',
+            cookies2=True,
+        )
+
+    def test_cookie_header_redirect_same_domain_forcing_get(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            {'url': 'https://toscrape.com', 'status': 302},
+            cookies2=True,
+        )
+
+    def test_cookie_header_redirect_different_domain(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            'https://example.com',
+            cookies2=False,
+        )
+
+    def test_cookie_header_redirect_different_domain_forcing_get(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            {'url': 'https://example.com', 'status': 302},
+            cookies2=False,
+        )