From: Andrey Rakhmatullin <wrar@wrar.name>
Date: Fri, 16 Aug 2019 14:53:42 +0500
Subject: Add http_auth_domain to HttpAuthMiddleware.

Fixes CVE-2021-41125
Origin: upstream, https://github.com/scrapy/scrapy/commit/b01d69a1bf48060daec8f751368622352d8b85a6
---
 docs/topics/downloader-middleware.rst       | 18 +++++-
 scrapy/downloadermiddlewares/httpauth.py    | 21 ++++++-
 tests/test_downloadermiddleware_httpauth.py | 85 ++++++++++++++++++++++++++++-
 3 files changed, 118 insertions(+), 6 deletions(-)

diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
index 6801adc..e0a3205 100644
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -323,8 +323,21 @@ HttpAuthMiddleware
     This middleware authenticates all requests generated from certain spiders
     using `Basic access authentication`_ (aka. HTTP auth).
 
-    To enable HTTP authentication from certain spiders, set the ``http_user``
-    and ``http_pass`` attributes of those spiders.
+    To enable HTTP authentication for a spider, set the ``http_user`` and
+    ``http_pass`` spider attributes to the authentication data and the
+    ``http_auth_domain`` spider attribute to the domain which requires this
+    authentication (its subdomains will be also handled in the same way).
+    You can set ``http_auth_domain`` to ``None`` to enable the
+    authentication for all requests but usually this is not needed.
+
+    .. warning::
+        In the previous Scrapy versions HttpAuthMiddleware sent the
+        authentication data with all requests, which is a security problem if
+        the spider makes requests to several different domains. Currently if
+        the ``http_auth_domain`` attribute is not set, the middleware will use
+        the domain of the first request, which will work for some spider but
+        not for others. In the future the middleware will produce an error
+        instead.
 
     Example::
 
@@ -334,6 +347,7 @@ HttpAuthMiddleware
 
             http_user = 'someuser'
             http_pass = 'somepass'
+            http_auth_domain = 'intranet.example.com'
             name = 'intranet.example.com'
 
             # .. rest of the spider code omitted ...
diff --git a/scrapy/downloadermiddlewares/httpauth.py b/scrapy/downloadermiddlewares/httpauth.py
index 089bf0d..1bee3e2 100644
--- a/scrapy/downloadermiddlewares/httpauth.py
+++ b/scrapy/downloadermiddlewares/httpauth.py
@@ -3,10 +3,14 @@ HTTP basic auth downloader middleware
 
 See documentation in docs/topics/downloader-middleware.rst
 """
+import warnings
 
 from w3lib.http import basic_auth_header
 
 from scrapy import signals
+from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy.utils.url import url_is_from_any_domain
 
 
 class HttpAuthMiddleware:
@@ -24,8 +28,23 @@ class HttpAuthMiddleware:
         pwd = getattr(spider, 'http_pass', '')
         if usr or pwd:
             self.auth = basic_auth_header(usr, pwd)
+            if not hasattr(spider, 'http_auth_domain'):
+                warnings.warn('Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
+                              'problems if the spider makes requests to several different domains. http_auth_domain '
+                              'will be set to the domain of the first request, please set it to the correct value '
+                              'explicitly.',
+                              category=ScrapyDeprecationWarning)
+                self.domain_unset = True
+            else:
+                self.domain = spider.http_auth_domain
+                self.domain_unset = False
 
     def process_request(self, request, spider):
         auth = getattr(self, 'auth', None)
         if auth and b'Authorization' not in request.headers:
-            request.headers[b'Authorization'] = auth
+            domain = urlparse_cached(request).hostname
+            if self.domain_unset:
+                self.domain = domain
+                self.domain_unset = False
+            if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
+                request.headers[b'Authorization'] = auth
diff --git a/tests/test_downloadermiddleware_httpauth.py b/tests/test_downloadermiddleware_httpauth.py
index 3381632..0362e20 100644
--- a/tests/test_downloadermiddleware_httpauth.py
+++ b/tests/test_downloadermiddleware_httpauth.py
@@ -1,13 +1,60 @@
 import unittest
 
+from w3lib.http import basic_auth_header
+
 from scrapy.http import Request
 from scrapy.downloadermiddlewares.httpauth import HttpAuthMiddleware
 from scrapy.spiders import Spider
 
 
+class TestSpiderLegacy(Spider):
+    http_user = 'foo'
+    http_pass = 'bar'
+
+
 class TestSpider(Spider):
     http_user = 'foo'
     http_pass = 'bar'
+    http_auth_domain = 'example.com'
+
+
+class TestSpiderAny(Spider):
+    http_user = 'foo'
+    http_pass = 'bar'
+    http_auth_domain = None
+
+
+class HttpAuthMiddlewareLegacyTest(unittest.TestCase):
+
+    def setUp(self):
+        self.spider = TestSpiderLegacy('foo')
+
+    def test_auth(self):
+        mw = HttpAuthMiddleware()
+        mw.spider_opened(self.spider)
+
+        # initial request, sets the domain and sends the header
+        req = Request('http://example.com/')
+        assert mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+        # subsequent request to the same domain, should send the header
+        req = Request('http://example.com/')
+        assert mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+        # subsequent request to a different domain, shouldn't send the header
+        req = Request('http://example-noauth.com/')
+        assert mw.process_request(req, self.spider) is None
+        self.assertNotIn('Authorization', req.headers)
+
+    def test_auth_already_set(self):
+        mw = HttpAuthMiddleware()
+        mw.spider_opened(self.spider)
+        req = Request('http://example.com/',
+                      headers=dict(Authorization='Digest 123'))
+        assert mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], b'Digest 123')
 
 
 class HttpAuthMiddlewareTest(unittest.TestCase):
@@ -20,13 +67,45 @@ class HttpAuthMiddlewareTest(unittest.TestCase):
     def tearDown(self):
         del self.mw
 
+    def test_no_auth(self):
+        req = Request('http://example-noauth.com/')
+        assert self.mw.process_request(req, self.spider) is None
+        self.assertNotIn('Authorization', req.headers)
+
+    def test_auth_domain(self):
+        req = Request('http://example.com/')
+        assert self.mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+    def test_auth_subdomain(self):
+        req = Request('http://foo.example.com/')
+        assert self.mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+    def test_auth_already_set(self):
+        req = Request('http://example.com/',
+                      headers=dict(Authorization='Digest 123'))
+        assert self.mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], b'Digest 123')
+
+
+class HttpAuthAnyMiddlewareTest(unittest.TestCase):
+
+    def setUp(self):
+        self.mw = HttpAuthMiddleware()
+        self.spider = TestSpiderAny('foo')
+        self.mw.spider_opened(self.spider)
+
+    def tearDown(self):
+        del self.mw
+
     def test_auth(self):
-        req = Request('http://scrapytest.org/')
+        req = Request('http://example.com/')
         assert self.mw.process_request(req, self.spider) is None
-        self.assertEqual(req.headers['Authorization'], b'Basic Zm9vOmJhcg==')
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
 
     def test_auth_already_set(self):
-        req = Request('http://scrapytest.org/',
+        req = Request('http://example.com/',
                       headers=dict(Authorization='Digest 123'))
         assert self.mw.process_request(req, self.spider) is None
         self.assertEqual(req.headers['Authorization'], b'Digest 123')