File: CVE-2021-41125.patch

Package: python-scrapy 1.5.1-1+deb10u1
  • area: main
  • in suites: buster
  • size: 4,404 kB
  • sloc: python: 25,793; xml: 199; makefile: 95; sh: 33
File content: 206 lines, 8,495 bytes
From: Andrey Rakhmatullin <wrar@wrar.name>
Date: Fri, 16 Aug 2019 14:53:42 +0500
Subject: Add http_auth_domain to HttpAuthMiddleware.

Fixes CVE-2021-41125
Origin: upstream, https://github.com/scrapy/scrapy/commit/b01d69a1bf48060daec8f751368622352d8b85a6
---
 docs/topics/downloader-middleware.rst       | 18 +++++-
 scrapy/downloadermiddlewares/httpauth.py    | 21 ++++++-
 tests/test_downloadermiddleware_httpauth.py | 85 ++++++++++++++++++++++++++++-
 3 files changed, 118 insertions(+), 6 deletions(-)
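
For context: before this change, HttpAuthMiddleware attached the Basic auth header to
every request a spider made, so credentials configured for one site could leak to any
other domain the spider happened to visit. The patched middleware only sends the header
when the request URL falls under the configured ``http_auth_domain``. A minimal sketch
of that check follows, built on the same helpers the patch imports; the
``auth_header_for()`` function itself is purely illustrative, the real logic lives in
HttpAuthMiddleware below.

    # Illustrative sketch only, not part of the patch: how the patched check
    # decides whether to attach the Authorization header.
    from w3lib.http import basic_auth_header
    from scrapy.utils.url import url_is_from_any_domain

    def auth_header_for(url, user, password, auth_domain):
        # auth_domain=None (or '') matches everything, mirroring the
        # ``http_auth_domain = None`` case documented in the patch.
        if not auth_domain or url_is_from_any_domain(url, [auth_domain]):
            return basic_auth_header(user, password)
        return None

    # The header is sent to the configured domain and its subdomains ...
    assert auth_header_for('http://intranet.example.com/', 'foo', 'bar',
                           'example.com') == basic_auth_header('foo', 'bar')
    # ... but not to unrelated domains, which is the point of the CVE fix.
    assert auth_header_for('http://evil.example.org/', 'foo', 'bar',
                           'example.com') is None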

diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst
index dfe4c13..73e7e0f 100644
--- a/docs/topics/downloader-middleware.rst
+++ b/docs/topics/downloader-middleware.rst
@@ -309,8 +309,21 @@ HttpAuthMiddleware
     This middleware authenticates all requests generated from certain spiders
     using `Basic access authentication`_ (aka. HTTP auth).
 
-    To enable HTTP authentication from certain spiders, set the ``http_user``
-    and ``http_pass`` attributes of those spiders.
+    To enable HTTP authentication for a spider, set the ``http_user`` and
+    ``http_pass`` spider attributes to the authentication data and the
+    ``http_auth_domain`` spider attribute to the domain which requires this
+    authentication (its subdomains will also be handled in the same way).
+    You can set ``http_auth_domain`` to ``None`` to enable authentication
+    for all requests, but usually this is not needed.
+
+    .. warning::
+        In previous Scrapy versions, HttpAuthMiddleware sent the
+        authentication data with all requests, which is a security problem
+        if the spider makes requests to several different domains.
+        Currently, if the ``http_auth_domain`` attribute is not set, the
+        middleware will use the domain of the first request, which will
+        work for some spiders but not for others. In the future the
+        middleware will produce an error instead.
 
     Example::
 
@@ -320,6 +333,7 @@ HttpAuthMiddleware
 
             http_user = 'someuser'
             http_pass = 'somepass'
+            http_auth_domain = 'intranet.example.com'
             name = 'intranet.example.com'
 
             # .. rest of the spider code omitted ...
diff --git a/scrapy/downloadermiddlewares/httpauth.py b/scrapy/downloadermiddlewares/httpauth.py
index 7aa7a62..b9030f7 100644
--- a/scrapy/downloadermiddlewares/httpauth.py
+++ b/scrapy/downloadermiddlewares/httpauth.py
@@ -3,10 +3,14 @@ HTTP basic auth downloader middleware
 
 See documentation in docs/topics/downloader-middleware.rst
 """
+import warnings
 
 from w3lib.http import basic_auth_header
 
 from scrapy import signals
+from scrapy.exceptions import ScrapyDeprecationWarning
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy.utils.url import url_is_from_any_domain
 
 
 class HttpAuthMiddleware(object):
@@ -24,8 +28,23 @@ class HttpAuthMiddleware(object):
         pwd = getattr(spider, 'http_pass', '')
         if usr or pwd:
             self.auth = basic_auth_header(usr, pwd)
+            if not hasattr(spider, 'http_auth_domain'):
+                warnings.warn('Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security '
+                              'problems if the spider makes requests to several different domains. http_auth_domain '
+                              'will be set to the domain of the first request, please set it to the correct value '
+                              'explicitly.',
+                              category=ScrapyDeprecationWarning)
+                self.domain_unset = True
+            else:
+                self.domain = spider.http_auth_domain
+                self.domain_unset = False
 
     def process_request(self, request, spider):
         auth = getattr(self, 'auth', None)
         if auth and b'Authorization' not in request.headers:
-            request.headers[b'Authorization'] = auth
+            domain = urlparse_cached(request).hostname
+            if self.domain_unset:
+                self.domain = domain
+                self.domain_unset = False
+            if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
+                request.headers[b'Authorization'] = auth
diff --git a/tests/test_downloadermiddleware_httpauth.py b/tests/test_downloadermiddleware_httpauth.py
index 3381632..0362e20 100644
--- a/tests/test_downloadermiddleware_httpauth.py
+++ b/tests/test_downloadermiddleware_httpauth.py
@@ -1,13 +1,60 @@
 import unittest
 
+from w3lib.http import basic_auth_header
+
 from scrapy.http import Request
 from scrapy.downloadermiddlewares.httpauth import HttpAuthMiddleware
 from scrapy.spiders import Spider
 
 
+class TestSpiderLegacy(Spider):
+    http_user = 'foo'
+    http_pass = 'bar'
+
+
 class TestSpider(Spider):
     http_user = 'foo'
     http_pass = 'bar'
+    http_auth_domain = 'example.com'
+
+
+class TestSpiderAny(Spider):
+    http_user = 'foo'
+    http_pass = 'bar'
+    http_auth_domain = None
+
+
+class HttpAuthMiddlewareLegacyTest(unittest.TestCase):
+
+    def setUp(self):
+        self.spider = TestSpiderLegacy('foo')
+
+    def test_auth(self):
+        mw = HttpAuthMiddleware()
+        mw.spider_opened(self.spider)
+
+        # initial request, sets the domain and sends the header
+        req = Request('http://example.com/')
+        assert mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+        # subsequent request to the same domain, should send the header
+        req = Request('http://example.com/')
+        assert mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+        # subsequent request to a different domain, shouldn't send the header
+        req = Request('http://example-noauth.com/')
+        assert mw.process_request(req, self.spider) is None
+        self.assertNotIn('Authorization', req.headers)
+
+    def test_auth_already_set(self):
+        mw = HttpAuthMiddleware()
+        mw.spider_opened(self.spider)
+        req = Request('http://example.com/',
+                      headers=dict(Authorization='Digest 123'))
+        assert mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], b'Digest 123')
 
 
 class HttpAuthMiddlewareTest(unittest.TestCase):
@@ -20,13 +67,45 @@ class HttpAuthMiddlewareTest(unittest.TestCase):
     def tearDown(self):
         del self.mw
 
+    def test_no_auth(self):
+        req = Request('http://example-noauth.com/')
+        assert self.mw.process_request(req, self.spider) is None
+        self.assertNotIn('Authorization', req.headers)
+
+    def test_auth_domain(self):
+        req = Request('http://example.com/')
+        assert self.mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+    def test_auth_subdomain(self):
+        req = Request('http://foo.example.com/')
+        assert self.mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
+
+    def test_auth_already_set(self):
+        req = Request('http://example.com/',
+                      headers=dict(Authorization='Digest 123'))
+        assert self.mw.process_request(req, self.spider) is None
+        self.assertEqual(req.headers['Authorization'], b'Digest 123')
+
+
+class HttpAuthAnyMiddlewareTest(unittest.TestCase):
+
+    def setUp(self):
+        self.mw = HttpAuthMiddleware()
+        self.spider = TestSpiderAny('foo')
+        self.mw.spider_opened(self.spider)
+
+    def tearDown(self):
+        del self.mw
+
     def test_auth(self):
-        req = Request('http://scrapytest.org/')
+        req = Request('http://example.com/')
         assert self.mw.process_request(req, self.spider) is None
-        self.assertEqual(req.headers['Authorization'], b'Basic Zm9vOmJhcg==')
+        self.assertEqual(req.headers['Authorization'], basic_auth_header('foo', 'bar'))
 
     def test_auth_already_set(self):
-        req = Request('http://scrapytest.org/',
+        req = Request('http://example.com/',
                       headers=dict(Authorization='Digest 123'))
         assert self.mw.process_request(req, self.spider) is None
         self.assertEqual(req.headers['Authorization'], b'Digest 123')
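
For reference, a minimal spider sketch showing the configuration this patch expects
(illustrative only; the spider name, URLs and credentials are the same placeholders
used in the documentation example above, and start_urls/parse are added here just to
make the sketch self-contained):

    from scrapy.spiders import Spider

    class IntranetSpider(Spider):
        name = 'intranet.example.com'
        http_user = 'someuser'
        http_pass = 'somepass'
        # Credentials are sent only to this domain and its subdomains.
        # Leaving http_auth_domain unset now emits a ScrapyDeprecationWarning
        # and pins authentication to the domain of the first request instead.
        http_auth_domain = 'intranet.example.com'
        start_urls = ['http://intranet.example.com/']

        def parse(self, response):
            # Requests made from here to other domains no longer carry the
            # Basic auth header.
            self.logger.info('fetched %s', response.url)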