File: issue36216.diff

package info (click to toggle)
python2.7 2.7.16-2%2Bdeb10u1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 78,524 kB
  • sloc: python: 469,695; ansic: 444,315; sh: 17,604; asm: 14,304; makefile: 4,899; objc: 761; exp: 499; cpp: 128; xml: 76
file content (131 lines) | stat: -rw-r--r-- 5,096 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
diff --git a/Doc/library/urlparse.rst b/Doc/library/urlparse.rst
index 22249da54f..0989c88c30 100644
--- a/Doc/library/urlparse.rst
+++ b/Doc/library/urlparse.rst
@@ -119,12 +119,22 @@ The :mod:`urlparse` module defines the following functions:
    See section :ref:`urlparse-result-object` for more information on the result
    object.
 
+   Characters in the :attr:`netloc` attribute that decompose under NFKC
+   normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
+   ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
+   decomposed before parsing, or is not a Unicode string, no error will be
+   raised.
+
    .. versionchanged:: 2.5
       Added attributes to return value.
 
    .. versionchanged:: 2.7
       Added IPv6 URL parsing capabilities.
 
+   .. versionchanged:: 2.7.17
+      Characters that affect netloc parsing under NFKC normalization will
+      now raise :exc:`ValueError`.
+
 
 .. function:: parse_qs(qs[, keep_blank_values[, strict_parsing[, max_num_fields]]])
 
@@ -232,11 +242,21 @@ The :mod:`urlparse` module defines the following functions:
    See section :ref:`urlparse-result-object` for more information on the result
    object.
 
+   Characters in the :attr:`netloc` attribute that decompose under NFKC
+   normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
+   ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
+   decomposed before parsing, or is not a Unicode string, no error will be
+   raised.
+
    .. versionadded:: 2.2
 
    .. versionchanged:: 2.5
       Added attributes to return value.
 
+   .. versionchanged:: 2.7.17
+      Characters that affect netloc parsing under NFKC normalization will
+      now raise :exc:`ValueError`.
+
 
 .. function:: urlunsplit(parts)
 
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
index 4e1ded73c2..73b0228ea8 100644
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -1,4 +1,6 @@
 from test import test_support
+import sys
+import unicodedata
 import unittest
 import urlparse
 
@@ -624,6 +626,28 @@ class UrlParseTestCase(unittest.TestCase):
         self.assertEqual(urlparse.urlparse("http://www.python.org:80"),
                 ('http','www.python.org:80','','','',''))
 
+    def test_urlsplit_normalization(self):
+        # Certain characters should never occur in the netloc,
+        # including under normalization.
+        # Ensure that ALL of them are detected and cause an error
+        illegal_chars = u'/:#?@'
+        hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
+        denorm_chars = [
+            c for c in map(unichr, range(128, sys.maxunicode))
+            if (hex_chars & set(unicodedata.decomposition(c).split()))
+            and c not in illegal_chars
+        ]
+        # Sanity check that we found at least one such character
+        self.assertIn(u'\u2100', denorm_chars)
+        self.assertIn(u'\uFF03', denorm_chars)
+
+        for scheme in [u"http", u"https", u"ftp"]:
+            for c in denorm_chars:
+                url = u"{}://netloc{}false.netloc/path".format(scheme, c)
+                print "Checking %r" % url
+                with self.assertRaises(ValueError):
+                    urlparse.urlsplit(url)
+
 def test_main():
     test_support.run_unittest(UrlParseTestCase)
 
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
index f7c2b032b0..54eda08651 100644
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -165,6 +165,21 @@ def _splitnetloc(url, start=0):
             delim = min(delim, wdelim)     # use earliest delim position
     return url[start:delim], url[delim:]   # return (domain, rest)
 
+def _checknetloc(netloc):
+    if not netloc or not isinstance(netloc, unicode):
+        return
+    # looking for characters like \u2100 that expand to 'a/c'
+    # IDNA uses NFKC equivalence, so normalize for this check
+    import unicodedata
+    netloc2 = unicodedata.normalize('NFKC', netloc)
+    if netloc == netloc2:
+        return
+    _, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
+    for c in '/?#@:':
+        if c in netloc2:
+            raise ValueError("netloc '" + netloc2 + "' contains invalid " +
+                             "characters under NFKC normalization")
+
 def urlsplit(url, scheme='', allow_fragments=True):
     """Parse a URL into 5 components:
     <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -193,6 +208,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
                 url, fragment = url.split('#', 1)
             if '?' in url:
                 url, query = url.split('?', 1)
+            _checknetloc(netloc)
             v = SplitResult(scheme, netloc, url, query, fragment)
             _parse_cache[key] = v
             return v
@@ -216,6 +232,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
         url, fragment = url.split('#', 1)
     if '?' in url:
         url, query = url.split('?', 1)
+    _checknetloc(netloc)
     v = SplitResult(scheme, netloc, url, query, fragment)
     _parse_cache[key] = v
     return v