1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
From 439b9cfaf43080e91c4ad69f312f21fa098befc7 Mon Sep 17 00:00:00 2001
From: Ben Kallus <49924171+kenballus@users.noreply.github.com>
Date: Sun, 13 Nov 2022 18:25:55 +0000
Subject: [PATCH] gh-99418: Make urllib.parse.urlparse enforce that a scheme
must begin with an alphabetical ASCII character. (#99421)
Prevent urllib.parse.urlparse from accepting schemes that don't begin with an alphabetical ASCII character.
RFC 3986 defines a scheme like this: `scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )`
RFC 2234 defines an ALPHA like this: `ALPHA = %x41-5A / %x61-7A`
The WHATWG URL spec defines a scheme like this:
`"A URL-scheme string must be one ASCII alpha, followed by zero or more of ASCII alphanumeric, U+002B (+), U+002D (-), and U+002E (.)."`
---
Lib/test/test_urlparse.py | 18 ++++++++++++++++++
Lib/urllib/parse.py | 2 +-
...22-11-12-15-45-51.gh-issue-99418.FxfAXS.rst | 2 ++
3 files changed, 21 insertions(+), 1 deletion(-)
create mode 100644 Misc/NEWS.d/next/Library/2022-11-12-15-45-51.gh-issue-99418.FxfAXS.rst
From 2f630e1ce18ad2e07428296532a68b11dc66ad10 Mon Sep 17 00:00:00 2001
From: Illia Volochii <illia.volochii@gmail.com>
Date: Wed, 17 May 2023 11:49:20 +0300
Subject: [PATCH] gh-102153: Start stripping C0 control and space chars in
`urlsplit` (#102508)
`urllib.parse.urlsplit` has already been respecting the WHATWG spec a bit #25595.
This adds more sanitizing to respect the "Remove any leading C0 control or space from input" [rule](https://url.spec.whatwg.org/#url-parsing:~:text=Remove%20any%20leading%20and%20trailing%20C0%20control%20or%20space%20from%20input.) in response to [CVE-2023-24329](https://nvd.nist.gov/vuln/detail/CVE-2023-24329).
---------
Co-authored-by: Gregory P. Smith [Google] <greg@krypto.org>
---
Doc/library/urllib.parse.rst | 46 +++++++++++++-
Lib/test/test_urlparse.py | 61 ++++++++++++++++++-
Lib/urllib/parse.py | 12 ++++
...-03-07-20-59-17.gh-issue-102153.14CLSZ.rst | 3 +
4 files changed, 119 insertions(+), 3 deletions(-)
create mode 100644 Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst
Backport:
* Drop Misc/NEWS.d
* urllib.parse -> urlparse
* Update hunk context
* Implement str.isascii
* test_urlparse.py:
* Various str vs bytes issues
* Drop hunk in test_attributes_bad_port
* Avoid using TestCase.subTest
* Don't use non-ascii in source
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -654,6 +654,65 @@ def test_urlsplit_remove_unsafe_bytes(self):
self.assertEqual(p.scheme, "http")
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
+ def test_urlsplit_strip_url(self):
+ noise = "".join(map(chr, range(0, 0x20 + 1)))
+ base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"
+
+ url = (noise + base_url).decode("utf8")
+ p = urlparse.urlsplit(url)
+ self.assertEqual(p.scheme, u"http")
+ self.assertEqual(p.netloc, u"User:Pass@www.python.org:080")
+ self.assertEqual(p.path, u"/doc/")
+ self.assertEqual(p.query, u"query=yes")
+ self.assertEqual(p.fragment, u"frag")
+ self.assertEqual(p.username, u"User")
+ self.assertEqual(p.password, u"Pass")
+ self.assertEqual(p.hostname, u"www.python.org")
+ self.assertEqual(p.port, 80)
+ self.assertEqual(p.geturl(), base_url.decode("utf8"))
+
+ url = noise + base_url
+ p = urlparse.urlsplit(url)
+ self.assertEqual(p.scheme, b"http")
+ self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")
+ self.assertEqual(p.path, b"/doc/")
+ self.assertEqual(p.query, b"query=yes")
+ self.assertEqual(p.fragment, b"frag")
+ self.assertEqual(p.username, b"User")
+ self.assertEqual(p.password, b"Pass")
+ self.assertEqual(p.hostname, b"www.python.org")
+ self.assertEqual(p.port, 80)
+ self.assertEqual(p.geturl(), base_url)
+
+ # Test that trailing space is preserved as some applications rely on
+ # this within query strings.
+ query_spaces_url = "https://www.python.org:88/doc/?query= "
+ p = urlparse.urlsplit(noise + query_spaces_url)
+ self.assertEqual(p.scheme, "https")
+ self.assertEqual(p.netloc, "www.python.org:88")
+ self.assertEqual(p.path, "/doc/")
+ self.assertEqual(p.query, "query= ")
+ self.assertEqual(p.port, 88)
+ self.assertEqual(p.geturl(), query_spaces_url)
+
+ p = urlparse.urlsplit("www.pypi.org ")
+ # That "hostname" gets considered a "path" due to the
+ # trailing space and our existing logic... YUCK...
+ # and re-assembles via geturl aka unurlsplit into the original.
+ # django.core.validators.URLValidator (at least through v3.2) relies on
+ # this, for better or worse, to catch it in a ValidationError via its
+ # regular expressions.
+ # Here we test the basic round trip concept of such a trailing space.
+ self.assertEqual(urlparse.urlunsplit(p), "www.pypi.org ")
+
+ # with scheme as cache-key
+ url = "//www.python.org/"
+ scheme = noise + "https" + noise
+ for _ in range(2):
+ p = urlparse.urlsplit(url, scheme=scheme)
+ self.assertEqual(p.scheme, "https")
+ self.assertEqual(p.geturl(), "https://www.python.org/")
+
def test_attributes_bad_port(self):
"""Check handling of non-integer ports."""
p = urlparse.urlsplit("http://www.example.net:foo")
@@ -668,6 +668,23 @@ def test_attributes_bad_port(self):
self.assertEqual(p.netloc, "www.example.net:foo")
self.assertRaises(ValueError, lambda: p.port)
+ def test_attributes_bad_scheme(self):
+ """Check handling of invalid schemes."""
+ for bytes in (False, True):
+ for parse in (urlparse.urlsplit, urlparse.urlparse):
+ for scheme in (u".", u"+", u"-", u"0", u"http&", u"\xe0http"):
+ url = scheme + u"://www.example.net"
+ if bytes:
+ if all(ord(c) < 128 for c in url):
+ url = url.encode("ascii")
+ else:
+ continue
+ p = parse(url)
+ if bytes:
+ self.assertEqual(p.scheme, b"")
+ else:
+ self.assertEqual(p.scheme, u"")
+
def test_attributes_without_netloc(self):
# This example is straight from RFC 3261. It looks like it
# should allow the username, hostname, and port to be filled
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 9a3102afd6..4f6867accb 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -25,6 +25,10 @@
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The testcases in
test_urlparse.py provides a good indicator of parsing behavior.
+
+The WHATWG URL Parser spec should also be considered. We are not compliant with
+it either due to existing user code API behavior expectations (Hyrum's Law).
+It serves as a useful guide when making changes.
"""
@@ -80,6 +84,10 @@
'0123456789'
'+-.')
+# Leading and trailing C0 control and space to be stripped per WHATWG spec.
+# == "".join([chr(i) for i in range(0, 0x20 + 1)])
+_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
+
# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
@@ -464,9 +472,13 @@ def urlsplit(url, scheme='', allow_fragments=True):
return cached
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
clear_cache()
+ # Only lstrip url as some applications rely on preserving trailing space.
+ # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
+ url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
+ scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
netloc = query = fragment = ''
i = url.find(':')
- if i > 0:
+ if i > 0 and ord(url[0]) < 128 and url[0].isalpha():
if url[:i] == 'http': # optimize the common case
scheme = url[:i].lower()
url = url[i+1:]
diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
index 96b3965107..5a9a53f83d 100644
--- a/Doc/library/urlparse.rst
+++ b/Doc/library/urlparse.rst
@@ -159,6 +159,11 @@ or on combining URL components into a URL string.
decomposed before parsing, or is not a Unicode string, no error will be
raised.
+ .. warning::
+
+ :func:`urlparse` does not perform validation. See :ref:`URL parsing
+ security <url-parsing-security>` for details.
+
.. versionchanged:: 2.5
Added attributes to return value.
@@ -324,6 +328,15 @@ or on combining URL components into a URL string.
Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline
``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL.
+ Following some of the `WHATWG spec`_ that updates RFC 3986, leading C0
+ control and space characters are stripped from the URL. ``\n``,
+ ``\r`` and tab ``\t`` characters are removed from the URL at any position.
+
+ .. warning::
+
+ :func:`urlsplit` does not perform validation. See :ref:`URL parsing
+ security <url-parsing-security>` for details.
+
.. versionadded:: 2.2
.. versionchanged:: 2.5
@@ -338,6 +348,9 @@ or on combining URL components into a URL string.
.. versionchanged:: 2.7 security update
ASCII newline and tab characters are stripped from the URL.
+ .. versionchanged:: 2.7 security update
+ Leading WHATWG C0 control and space characters are stripped from the URL.
+
.. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser
.. function:: urlunsplit(parts)
@@ -414,6 +427,35 @@ or on combining URL components into a URL string.
.. _WHATWG: https://url.spec.whatwg.org/
+.. _url-parsing-security:
+
+URL parsing security
+--------------------
+
+The :func:`urlsplit` and :func:`urlparse` APIs do not perform **validation** of
+inputs. They may not raise errors on inputs that other applications consider
+invalid. They may also succeed on some inputs that might not be considered
+URLs elsewhere. Their purpose is for practical functionality rather than
+purity.
+
+Instead of raising an exception on unusual input, they may instead return some
+component parts as empty strings. Or components may contain more than perhaps
+they should.
+
+We recommend that users of these APIs where the values may be used anywhere
+with security implications code defensively. Do some verification within your
+code before trusting a returned component part. Does that ``scheme`` make
+sense? Is that a sensible ``path``? Is there anything strange about that
+``hostname``? etc.
+
+What constitutes a URL is not universally well defined. Different applications
+have different needs and desired constraints. For instance the living `WHATWG
+spec`_ describes what user facing web clients such as a web browser require.
+While :rfc:`3986` is more general. These functions incorporate some aspects of
+both, but cannot be claimed compliant with either. The APIs and existing user
+code with expectations on specific behaviors predate both standards leading us
+to be very cautious about making API behavior changes.
+
.. _urlparse-result-object:
Results of :func:`urlparse` and :func:`urlsplit`
|