1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
|
From: =?utf-8?q?Adri=C3=A1n_Chaves?= <adrian@chaves.io>
Date: Tue, 1 Mar 2022 12:26:05 +0100
Subject: Merge pull request from GHSA-cjvr-mfj7-j4j8
* Do not carry over cookies to a different domain on redirect
* Cover the cookie-domain redirect fix in the release notes
* Cover 1.8.2 in the release notes
* Fix redirect Cookie handling when the cookie middleware is disabled
* Update the 1.8.2 release date
Fixes CVE-2022-0577
Origin: upstream, https://github.com/scrapy/scrapy/commit/8ce01b3b76d4634f55067d6cfdf632ec70ba304a
Bug-Debian: https://bugs.debian.org/1008234
---
scrapy/downloadermiddlewares/redirect.py | 30 +++++-
tests/test_downloadermiddleware_cookies.py | 153 +++++++++++++++++++++++++++++
2 files changed, 178 insertions(+), 5 deletions(-)
diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py
index 30cae3f..b65cd19 100644
--- a/scrapy/downloadermiddlewares/redirect.py
+++ b/scrapy/downloadermiddlewares/redirect.py
@@ -4,12 +4,27 @@ from six.moves.urllib.parse import urljoin
from w3lib.url import safe_url_string
from scrapy.http import HtmlResponse
+from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.response import get_meta_refresh
from scrapy.exceptions import IgnoreRequest, NotConfigured
logger = logging.getLogger(__name__)
+def _build_redirect_request(source_request, url, **kwargs):
+ redirect_request = source_request.replace(
+ url=url,
+ cookies=None,
+ **kwargs
+ )
+ if 'Cookie' in redirect_request.headers:
+ source_request_netloc = urlparse_cached(source_request).netloc
+ redirect_request_netloc = urlparse_cached(redirect_request).netloc
+ if source_request_netloc != redirect_request_netloc:
+ del redirect_request.headers['Cookie']
+ return redirect_request
+
+
class BaseRedirectMiddleware(object):
enabled_setting = 'REDIRECT_ENABLED'
@@ -46,10 +61,15 @@ class BaseRedirectMiddleware(object):
raise IgnoreRequest("max redirections reached")
def _redirect_request_using_get(self, request, redirect_url):
- redirected = request.replace(url=redirect_url, method='GET', body='')
- redirected.headers.pop('Content-Type', None)
- redirected.headers.pop('Content-Length', None)
- return redirected
+ redirect_request = _build_redirect_request(
+ request,
+ url=redirect_url,
+ method='GET',
+ body='',
+ )
+ redirect_request.headers.pop('Content-Type', None)
+ redirect_request.headers.pop('Content-Length', None)
+ return redirect_request
class RedirectMiddleware(BaseRedirectMiddleware):
@@ -73,7 +93,7 @@ class RedirectMiddleware(BaseRedirectMiddleware):
redirected_url = urljoin(request.url, location)
if response.status in (301, 307, 308) or request.method == 'HEAD':
- redirected = request.replace(url=redirected_url)
+ redirected = _build_redirect_request(request, url=redirected_url)
return self._redirect(redirected, request, spider, response.status)
redirected = self._redirect_request_using_get(request, redirected_url)
diff --git a/tests/test_downloadermiddleware_cookies.py b/tests/test_downloadermiddleware_cookies.py
index 17801e5..60ceb46 100644
--- a/tests/test_downloadermiddleware_cookies.py
+++ b/tests/test_downloadermiddleware_cookies.py
@@ -3,7 +3,9 @@ import logging
from unittest import TestCase
from testfixtures import LogCapture
+from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
from scrapy.http import Response, Request
+from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler
from scrapy.exceptions import NotConfigured
@@ -21,9 +23,11 @@ class CookiesMiddlewareTest(TestCase):
def setUp(self):
self.spider = Spider('foo')
self.mw = CookiesMiddleware()
+ self.redirect_middleware = RedirectMiddleware(settings=Settings())
def tearDown(self):
del self.mw
+ del self.redirect_middleware
def test_basic(self):
req = Request('http://scrapytest.org/')
@@ -229,3 +233,152 @@ class CookiesMiddlewareTest(TestCase):
assert self.mw.process_request(request, self.spider) is None
self.assertIn('Cookie', request.headers)
self.assertEqual(b'currencyCookie=USD', request.headers['Cookie'])
+
+ def _test_cookie_redirect(
+ self,
+ source,
+ target,
+ cookies1,
+ cookies2,
+ ):
+ input_cookies = {'a': 'b'}
+
+ if not isinstance(source, dict):
+ source = {'url': source}
+ if not isinstance(target, dict):
+ target = {'url': target}
+ target.setdefault('status', 301)
+
+ request1 = Request(cookies=input_cookies, **source)
+ self.mw.process_request(request1, self.spider)
+ cookies = request1.headers.get('Cookie')
+ self.assertEqual(cookies, b"a=b" if cookies1 else None)
+
+ response = Response(
+ headers={
+ 'Location': target['url'],
+ },
+ **target
+ )
+ self.assertEqual(
+ self.mw.process_response(request1, response, self.spider),
+ response,
+ )
+
+ request2 = self.redirect_middleware.process_response(
+ request1,
+ response,
+ self.spider,
+ )
+ self.assertIsInstance(request2, Request)
+
+ self.mw.process_request(request2, self.spider)
+ cookies = request2.headers.get('Cookie')
+ self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+ def test_cookie_redirect_same_domain(self):
+ self._test_cookie_redirect(
+ 'https://toscrape.com',
+ 'https://toscrape.com',
+ cookies1=True,
+ cookies2=True,
+ )
+
+ def test_cookie_redirect_same_domain_forcing_get(self):
+ self._test_cookie_redirect(
+ 'https://toscrape.com',
+ {'url': 'https://toscrape.com', 'status': 302},
+ cookies1=True,
+ cookies2=True,
+ )
+
+ def test_cookie_redirect_different_domain(self):
+ self._test_cookie_redirect(
+ 'https://toscrape.com',
+ 'https://example.com',
+ cookies1=True,
+ cookies2=False,
+ )
+
+ def test_cookie_redirect_different_domain_forcing_get(self):
+ self._test_cookie_redirect(
+ 'https://toscrape.com',
+ {'url': 'https://example.com', 'status': 302},
+ cookies1=True,
+ cookies2=False,
+ )
+
+ def _test_cookie_header_redirect(
+ self,
+ source,
+ target,
+ cookies2,
+ ):
+ """Test the handling of a user-defined Cookie header when building a
+ redirect follow-up request.
+
+ We follow RFC 6265 for cookie handling. The Cookie header can only
+ contain a list of key-value pairs (i.e. no additional cookie
+ parameters like Domain or Path). Because of that, we follow the same
+ rules that we would follow for the handling of the Set-Cookie response
+ header when the Domain is not set: the cookies must be limited to the
+ target URL domain (not even subdomains can receive those cookies).
+
+ .. note:: This method tests the scenario where the cookie middleware is
+ disabled. Because of known issue #1992, when the cookies
+ middleware is enabled we do not need to be concerned about
+ the Cookie header getting leaked to unintended domains,
+ because the middleware empties the header from every request.
+ """
+ if not isinstance(source, dict):
+ source = {'url': source}
+ if not isinstance(target, dict):
+ target = {'url': target}
+ target.setdefault('status', 301)
+
+ request1 = Request(headers={'Cookie': b'a=b'}, **source)
+
+ response = Response(
+ headers={
+ 'Location': target['url'],
+ },
+ **target
+ )
+
+ request2 = self.redirect_middleware.process_response(
+ request1,
+ response,
+ self.spider,
+ )
+ self.assertIsInstance(request2, Request)
+
+ cookies = request2.headers.get('Cookie')
+ self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+ def test_cookie_header_redirect_same_domain(self):
+ self._test_cookie_header_redirect(
+ 'https://toscrape.com',
+ 'https://toscrape.com',
+ cookies2=True,
+ )
+
+ def test_cookie_header_redirect_same_domain_forcing_get(self):
+ self._test_cookie_header_redirect(
+ 'https://toscrape.com',
+ {'url': 'https://toscrape.com', 'status': 302},
+ cookies2=True,
+ )
+
+ def test_cookie_header_redirect_different_domain(self):
+ self._test_cookie_header_redirect(
+ 'https://toscrape.com',
+ 'https://example.com',
+ cookies2=False,
+ )
+
+ def test_cookie_header_redirect_different_domain_forcing_get(self):
+ self._test_cookie_header_redirect(
+ 'https://toscrape.com',
+ {'url': 'https://example.com', 'status': 302},
+ cookies2=False,
+ )
|