From: =?utf-8?q?Adri=C3=A1n_Chaves?= <adrian@chaves.io>
Date: Tue, 1 Mar 2022 12:26:05 +0100
Subject: Merge pull request from GHSA-cjvr-mfj7-j4j8

* Do not carry over cookies to a different domain on redirect

* Cover the cookie-domain redirect fix in the release notes

* Cover 1.8.2 in the release notes

* Fix redirect Cookie handling when the cookie middleware is disabled

* Update the 1.8.2 release date

Fixes CVE-2022-0577
Origin: upstream, https://github.com/scrapy/scrapy/commit/8ce01b3b76d4634f55067d6cfdf632ec70ba304a
Bug-Debian: https://bugs.debian.org/1008234
---
 scrapy/downloadermiddlewares/redirect.py   |  30 +++++-
 tests/test_downloadermiddleware_cookies.py | 155 +++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+), 5 deletions(-)

diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py
index 4053fec..c8c84ff 100644
--- a/scrapy/downloadermiddlewares/redirect.py
+++ b/scrapy/downloadermiddlewares/redirect.py
@@ -4,6 +4,7 @@ from urllib.parse import urljoin, urlparse
 from w3lib.url import safe_url_string
 
 from scrapy.http import HtmlResponse
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.response import get_meta_refresh
 from scrapy.exceptions import IgnoreRequest, NotConfigured
 
@@ -11,6 +12,20 @@ from scrapy.exceptions import IgnoreRequest, NotConfigured
 logger = logging.getLogger(__name__)
 
 
+def _build_redirect_request(source_request, *, url, **kwargs):
+    redirect_request = source_request.replace(
+        url=url,
+        **kwargs,
+        cookies=None,
+    )
+    if 'Cookie' in redirect_request.headers:
+        source_request_netloc = urlparse_cached(source_request).netloc
+        redirect_request_netloc = urlparse_cached(redirect_request).netloc
+        if source_request_netloc != redirect_request_netloc:
+            del redirect_request.headers['Cookie']
+    return redirect_request
+
+
 class BaseRedirectMiddleware:
 
     enabled_setting = 'REDIRECT_ENABLED'
@@ -47,10 +62,15 @@ class BaseRedirectMiddleware:
             raise IgnoreRequest("max redirections reached")
 
     def _redirect_request_using_get(self, request, redirect_url):
-        redirected = request.replace(url=redirect_url, method='GET', body='')
-        redirected.headers.pop('Content-Type', None)
-        redirected.headers.pop('Content-Length', None)
-        return redirected
+        redirect_request = _build_redirect_request(
+            request,
+            url=redirect_url,
+            method='GET',
+            body='',
+        )
+        redirect_request.headers.pop('Content-Type', None)
+        redirect_request.headers.pop('Content-Length', None)
+        return redirect_request
 
 
 class RedirectMiddleware(BaseRedirectMiddleware):
@@ -80,7 +100,7 @@ class RedirectMiddleware(BaseRedirectMiddleware):
         redirected_url = urljoin(request.url, location)
 
         if response.status in (301, 307, 308) or request.method == 'HEAD':
-            redirected = request.replace(url=redirected_url)
+            redirected = _build_redirect_request(request, url=redirected_url)
             return self._redirect(redirected, request, spider, response.status)
 
         redirected = self._redirect_request_using_get(request, redirected_url)
diff --git a/tests/test_downloadermiddleware_cookies.py b/tests/test_downloadermiddleware_cookies.py
index aff8542..5263f63 100644
--- a/tests/test_downloadermiddleware_cookies.py
+++ b/tests/test_downloadermiddleware_cookies.py
@@ -6,8 +6,10 @@ import pytest
 
 from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
 from scrapy.downloadermiddlewares.defaultheaders import DefaultHeadersMiddleware
+from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
 from scrapy.exceptions import NotConfigured
 from scrapy.http import Response, Request
+from scrapy.settings import Settings
 from scrapy.spiders import Spider
 from scrapy.utils.python import to_bytes
 from scrapy.utils.test import get_crawler
@@ -23,9 +25,11 @@ class CookiesMiddlewareTest(TestCase):
     def setUp(self):
         self.spider = Spider('foo')
         self.mw = CookiesMiddleware()
+        self.redirect_middleware = RedirectMiddleware(settings=Settings())
 
     def tearDown(self):
         del self.mw
+        del self.redirect_middleware
 
     def test_basic(self):
         req = Request('http://scrapytest.org/')
@@ -347,3 +351,154 @@ class CookiesMiddlewareTest(TestCase):
         self.assertCookieValEqual(req1.headers['Cookie'], 'key=value1')
         self.assertCookieValEqual(req2.headers['Cookie'], 'key=value2')
         self.assertCookieValEqual(req3.headers['Cookie'], 'key=')
+
+    def _test_cookie_redirect(
+        self,
+        source,
+        target,
+        *,
+        cookies1,
+        cookies2,
+    ):
+        input_cookies = {'a': 'b'}
+
+        if not isinstance(source, dict):
+            source = {'url': source}
+        if not isinstance(target, dict):
+            target = {'url': target}
+        target.setdefault('status', 301)
+
+        request1 = Request(cookies=input_cookies, **source)
+        self.mw.process_request(request1, self.spider)
+        cookies = request1.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies1 else None)
+
+        response = Response(
+            headers={
+                'Location': target['url'],
+            },
+            **target,
+        )
+        self.assertEqual(
+            self.mw.process_response(request1, response, self.spider),
+            response,
+        )
+
+        request2 = self.redirect_middleware.process_response(
+            request1,
+            response,
+            self.spider,
+        )
+        self.assertIsInstance(request2, Request)
+
+        self.mw.process_request(request2, self.spider)
+        cookies = request2.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+    def test_cookie_redirect_same_domain(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            'https://toscrape.com',
+            cookies1=True,
+            cookies2=True,
+        )
+
+    def test_cookie_redirect_same_domain_forcing_get(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            {'url': 'https://toscrape.com', 'status': 302},
+            cookies1=True,
+            cookies2=True,
+        )
+
+    def test_cookie_redirect_different_domain(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            'https://example.com',
+            cookies1=True,
+            cookies2=False,
+        )
+
+    def test_cookie_redirect_different_domain_forcing_get(self):
+        self._test_cookie_redirect(
+            'https://toscrape.com',
+            {'url': 'https://example.com', 'status': 302},
+            cookies1=True,
+            cookies2=False,
+        )
+
+    def _test_cookie_header_redirect(
+        self,
+        source,
+        target,
+        *,
+        cookies2,
+    ):
+        """Test the handling of a user-defined Cookie header when building a
+        redirect follow-up request.
+
+        We follow RFC 6265 for cookie handling. The Cookie header can only
+        contain a list of key-value pairs (i.e. no additional cookie
+        parameters like Domain or Path). Because of that, we follow the same
+        rules that we would follow for the handling of the Set-Cookie response
+        header when the Domain is not set: the cookies must be limited to the
+        target URL domain (not even subdomains can receive those cookies).
+
+        .. note:: This method tests the scenario where the cookie middleware is
+                  disabled. Because of known issue #1992, when the cookies
+                  middleware is enabled we do not need to be concerned about
+                  the Cookie header getting leaked to unintended domains,
+                  because the middleware empties the header from every request.
+        """
+        if not isinstance(source, dict):
+            source = {'url': source}
+        if not isinstance(target, dict):
+            target = {'url': target}
+        target.setdefault('status', 301)
+
+        request1 = Request(headers={'Cookie': b'a=b'}, **source)
+
+        response = Response(
+            headers={
+                'Location': target['url'],
+            },
+            **target,
+        )
+
+        request2 = self.redirect_middleware.process_response(
+            request1,
+            response,
+            self.spider,
+        )
+        self.assertIsInstance(request2, Request)
+
+        cookies = request2.headers.get('Cookie')
+        self.assertEqual(cookies, b"a=b" if cookies2 else None)
+
+    def test_cookie_header_redirect_same_domain(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            'https://toscrape.com',
+            cookies2=True,
+        )
+
+    def test_cookie_header_redirect_same_domain_forcing_get(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            {'url': 'https://toscrape.com', 'status': 302},
+            cookies2=True,
+        )
+
+    def test_cookie_header_redirect_different_domain(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            'https://example.com',
+            cookies2=False,
+        )
+
+    def test_cookie_header_redirect_different_domain_forcing_get(self):
+        self._test_cookie_header_redirect(
+            'https://toscrape.com',
+            {'url': 'https://example.com', 'status': 302},
+            cookies2=False,
+        )
