1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
|
From: Andrey Rakhmatullin <wrar@wrar.name>
Date: Fri, 16 Aug 2019 14:53:42 +0500
Subject: Add http_auth_domain to HttpAuthMiddleware.
Fixes CVE-2021-41125
Origin: upstream, https://github.com/scrapy/scrapy/commit/b01d69a1bf48060daec8f751368622352d8b85a6
---
docs/topics/downloader-middleware.rst | 18 +++++-
scrapy/downloadermiddlewares/httpauth.py | 21 ++++++-
tests/test_downloadermiddleware_httpauth.py | 85 ++++++++++++++++++++++++++++-
3 files changed, 118 insertions(+), 6 deletions(-)
diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
index dfe4c13..73e7e0f 100644
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -309,8 +309,21 @@ HttpAuthMiddleware
This middleware authenticates all requests generated from certain spiders
using `Basic access authentication`_ (aka. HTTP auth).
- To enable HTTP authentication from certain spiders, set the ``http_user``
- and ``http_pass`` attributes of those spiders.
+ To enable HTTP authentication for a spider, set the ``http_user`` and
+ ``http_pass`` spider attributes to the authentication data and the
+ ``http_auth_domain`` spider attribute to the domain which requires this
+ authentication (its subdomains will be also handled in the same way).
+ You can set ``http_auth_domain`` to ``None`` to enable the
+ authentication for all requests but usually this is not needed.
+
+ .. warning::
+ In the previous Scrapy versions HttpAuthMiddleware sent the
+ authentication data with all requests, which is a security problem if
+ the spider makes requests to several different domains. Currently if
+ the ``http_auth_domain`` attribute is not set, the middleware will use
+ the domain of the first request, which will work for some spiders but
+ not for others. In the future the middleware will produce an error
+ instead.
Example::
@@ -320,6 +333,7 @@ HttpAuthMiddleware
http_user = 'someuser'
http_pass = 'somepass'
+ http_auth_domain = 'intranet.example.com'
name = 'intranet.example.com'
# .. rest of the spider code omitted ...
diff --git a/scrapy/downloadermiddlewares/httpauth.py b/scrapy/downloadermiddlewares/httpauth.py
index 7aa7a62..b9030f7 100644
--- a/scrapy/downloadermiddlewares/httpauth.py
+++ b/scrapy/downloadermiddlewares/httpauth.py
@@ -3,10 +3,14 @@ HTTP basic auth downloader middleware
See documentation in docs/topics/downloader-middleware.rst
"""
+import warnings
from w3lib.http import basic_auth_header
from scrapy import signals
+from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy.utils.url import url_is_from_any_domain
class HttpAuthMiddleware(object):
@@ -24,8 +28,23 @@ class HttpAuthMiddleware(object):
pwd = getattr(spider, 'http_pass', '')
if usr or pwd:
self.auth = basic_auth_header(usr, pwd)
+ if not hasattr(spider, 'http_auth_domain'):
+ warnings.warn('Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
+ 'problems if the spider makes requests to several different domains. http_auth_domain '
+ 'will be set to the domain of the first request, please set it to the correct value '
+ 'explicitly.',
+ category=ScrapyDeprecationWarning)
+ self.domain_unset = True
+ else:
+ self.domain = spider.http_auth_domain
+ self.domain_unset = False
def process_request(self, request, spider):
auth = getattr(self, 'auth', None)
if auth and b'Authorization' not in request.headers:
- request.headers[b'Authorization'] = auth
+ domain = urlparse_cached(request).hostname
+ if self.domain_unset:
+ self.domain = domain
+ self.domain_unset = False
+ if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
+ request.headers[b'Authorization'] = auth
diff --git a/tests/test_downloadermiddleware_httpauth.py b/tests/test_downloadermiddleware_httpauth.py
index 3381632..0362e20 100644
--- a/tests/test_downloadermiddleware_httpauth.py
+++ b/tests/test_downloadermiddleware_httpauth.py
@@ -1,13 +1,60 @@
import unittest
+from w3lib.http import basic_auth_header
+
from scrapy.http import Request
from scrapy.downloadermiddlewares.httpauth import HttpAuthMiddleware
from scrapy.spiders import Spider
+class TestSpiderLegacy(Spider):
+ http_user = 'foo'
+ http_pass = 'bar'
+
+
class TestSpider(Spider):
http_user = 'foo'
http_pass = 'bar'
+ http_auth_domain = 'example.com'
+
+
+class TestSpiderAny(Spider):
+ http_user = 'foo'
+ http_pass = 'bar'
+ http_auth_domain = None
+
+
+class HttpAuthMiddlewareLegacyTest(unittest.TestCase):
+
+ def setUp(self):
+ self.spider = TestSpiderLegacy('foo')
+
+ def test_auth(self):
+ mw = HttpAuthMiddleware()
+ mw.spider_opened(self.spider)
+
+ # initial request, sets the domain and sends the header
+ req = Request('http://example.com/')
+ assert mw.process_request(req, self.spider) is None
+ self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+ # subsequent request to the same domain, should send the header
+ req = Request('http://example.com/')
+ assert mw.process_request(req, self.spider) is None
+ self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+ # subsequent request to a different domain, shouldn't send the header
+ req = Request('http://example-noauth.com/')
+ assert mw.process_request(req, self.spider) is None
+ self.assertNotIn('Authorization', req.headers)
+
+ def test_auth_already_set(self):
+ mw = HttpAuthMiddleware()
+ mw.spider_opened(self.spider)
+ req = Request('http://example.com/',
+ headers=dict(Authorization='Digest 123'))
+ assert mw.process_request(req, self.spider) is None
+ self.assertEqual(req.headers['Authorization'], b'Digest 123')
class HttpAuthMiddlewareTest(unittest.TestCase):
@@ -20,13 +67,45 @@ class HttpAuthMiddlewareTest(unittest.TestCase):
def tearDown(self):
del self.mw
+ def test_no_auth(self):
+ req = Request('http://example-noauth.com/')
+ assert self.mw.process_request(req, self.spider) is None
+ self.assertNotIn('Authorization', req.headers)
+
+ def test_auth_domain(self):
+ req = Request('http://example.com/')
+ assert self.mw.process_request(req, self.spider) is None
+ self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+ def test_auth_subdomain(self):
+ req = Request('http://foo.example.com/')
+ assert self.mw.process_request(req, self.spider) is None
+ self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+ def test_auth_already_set(self):
+ req = Request('http://example.com/',
+ headers=dict(Authorization='Digest 123'))
+ assert self.mw.process_request(req, self.spider) is None
+ self.assertEqual(req.headers['Authorization'], b'Digest 123')
+
+
+class HttpAuthAnyMiddlewareTest(unittest.TestCase):
+
+ def setUp(self):
+ self.mw = HttpAuthMiddleware()
+ self.spider = TestSpiderAny('foo')
+ self.mw.spider_opened(self.spider)
+
+ def tearDown(self):
+ del self.mw
+
def test_auth(self):
- req = Request('http://scrapytest.org/')
+ req = Request('http://example.com/')
assert self.mw.process_request(req, self.spider) is None
- self.assertEqual(req.headers['Authorization'], b'Basic Zm9vOmJhcg==')
+ self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
def test_auth_already_set(self):
- req = Request('http://scrapytest.org/',
+ req = Request('http://example.com/',
headers=dict(Authorization='Digest 123'))
assert self.mw.process_request(req, self.spider) is None
self.assertEqual(req.headers['Authorization'], b'Digest 123')
|