From 5e6781527eb05c27a7217a6f92aad0d630061ef5 Mon Sep 17 00:00:00 2001 From: Illia Volochii Date: Tue, 7 Mar 2023 21:02:32 +0200 Subject: [PATCH 01/10] gh-102153: Start stripping C0 control and space chars in `urlsplit` --- Doc/library/urllib.parse.rst | 9 ++++- Lib/test/test_urlparse.py | 40 ++++++++++++++++++- Lib/urllib/parse.py | 5 +++ ...-03-07-20-59-17.gh-issue-102153.14CLSZ.rst | 3 ++ 4 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 96b396510794b4..35b329a58329de 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -324,8 +324,9 @@ or on combining URL components into a URL string. ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is decomposed before parsing, no error will be raised. - Following the `WHATWG spec`_ that updates RFC 3986, ASCII newline - ``\n``, ``\r`` and tab ``\t`` characters are stripped from the URL. + Following the `WHATWG spec`_ that updates RFC 3986, leading and trailing C0 + control and space characters are stripped from the URL. ``\n``, ``\r`` and + tab ``\t`` characters are removed from the URL at any position. .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of @@ -338,6 +339,10 @@ or on combining URL components into a URL string. .. versionchanged:: 3.10 ASCII newline and tab characters are stripped from the URL. + .. versionchanged:: 3.12 + Leading and trailing C0 control and space characters are stripped from + the URL + .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser .. 
function:: urlunsplit(parts) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 80fb9e5cd2a445..c522f75fb581b9 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -649,6 +649,44 @@ def test_urlsplit_remove_unsafe_bytes(self): self.assertEqual(p.scheme, "http") self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment") + def test_urlsplit_strip_url(self): + noise = bytes([*range(0, 0x1f), 0x20]) + base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" + + url = noise.decode() + base_url + noise.decode() + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, "http") + self.assertEqual(p.netloc, "User:Pass@www.python.org:080") + self.assertEqual(p.path, "/doc/") + self.assertEqual(p.query, "query=yes") + self.assertEqual(p.fragment, "frag") + self.assertEqual(p.username, "User") + self.assertEqual(p.password, "Pass") + self.assertEqual(p.hostname, "www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), base_url) + + url = noise + base_url.encode() + noise + p = urllib.parse.urlsplit(url) + self.assertEqual(p.scheme, b"http") + self.assertEqual(p.netloc, b"User:Pass@www.python.org:080") + self.assertEqual(p.path, b"/doc/") + self.assertEqual(p.query, b"query=yes") + self.assertEqual(p.fragment, b"frag") + self.assertEqual(p.username, b"User") + self.assertEqual(p.password, b"Pass") + self.assertEqual(p.hostname, b"www.python.org") + self.assertEqual(p.port, 80) + self.assertEqual(p.geturl(), base_url.encode()) + + # with scheme as cache-key + url = "//www.python.org/" + scheme = noise.decode() + "https" + noise.decode() + for _ in range(2): + p = urllib.parse.urlsplit(url, scheme=scheme) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.geturl(), "https://www.python.org/") + def test_attributes_bad_port(self): """Check handling of invalid ports.""" for bytes in (False, True): @@ -656,7 +694,7 @@ def 
test_attributes_bad_port(self): for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"): with self.subTest(bytes=bytes, parse=parse, port=port): netloc = "www.example.net:" + port - url = "http://" + netloc + url = "http://" + netloc + "/" if bytes: if netloc.isascii() and port.isascii(): netloc = netloc.encode("ascii") diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 5f95c5ff7f9c1c..fb4b57bdba4a99 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -79,6 +79,9 @@ '0123456789' '+-.') +# Leading and trailing C0 control and space to be stripped per WHATWG spec +_URL_CHARS_TO_STRIP = "".join([*(chr(i) for i in range(0, 0x1f + 1)), " "]) + # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] @@ -452,6 +455,8 @@ def urlsplit(url, scheme='', allow_fragments=True): """ url, scheme, _coerce_result = _coerce_args(url, scheme) + url = url.strip(_URL_CHARS_TO_STRIP) + scheme = scheme.strip(_URL_CHARS_TO_STRIP) for b in _UNSAFE_URL_BYTES_TO_REMOVE: url = url.replace(b, "") diff --git a/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst new file mode 100644 index 00000000000000..97652398a0fd70 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst @@ -0,0 +1,3 @@ +:func:`urllib.parse.urlsplit` now strips leading and trailing C0 control and +space characters following the controlling specification for URLs defined by +WHATWG in response to CVE-2023-24329. Patch by Illia Volochii. 
From 84231baf9edef385b5f9121f61dcc916d0c0d3ef Mon Sep 17 00:00:00 2001 From: Illia Volochii Date: Tue, 7 Mar 2023 21:25:33 +0200 Subject: [PATCH 02/10] Add a period --- Doc/library/urllib.parse.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 35b329a58329de..4d009e73461d60 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -341,7 +341,7 @@ or on combining URL components into a URL string. .. versionchanged:: 3.12 Leading and trailing C0 control and space characters are stripped from - the URL + the URL. .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser From 716e1c278bd3f2bd9931d7607ee6dc5da13fded6 Mon Sep 17 00:00:00 2001 From: Illia Volochii Date: Wed, 8 Mar 2023 14:15:39 +0200 Subject: [PATCH 03/10] Simplify code --- Lib/test/test_urlparse.py | 2 +- Lib/urllib/parse.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index c522f75fb581b9..60462059dad5ad 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -650,7 +650,7 @@ def test_urlsplit_remove_unsafe_bytes(self): self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment") def test_urlsplit_strip_url(self): - noise = bytes([*range(0, 0x1f), 0x20]) + noise = bytes(range(0, 0x20 + 1)) base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" url = noise.decode() + base_url + noise.decode() diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index fb4b57bdba4a99..8ed485760d0ba3 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -80,7 +80,7 @@ '+-.') # Leading and trailing C0 control and space to be stripped per WHATWG spec -_URL_CHARS_TO_STRIP = "".join([*(chr(i) for i in range(0, 0x1f + 1)), " "]) +_URL_CHARS_TO_STRIP = "".join([chr(i) for i in range(0, 0x20 + 1)]) # Unsafe bytes to be removed per WHATWG spec 
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] From a13bf41219b239511267a307703221f7c05514a4 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Thu, 27 Apr 2023 15:27:39 -0700 Subject: [PATCH 04/10] Expand the constant instead of computing it at import time. --- Lib/urllib/parse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 8ed485760d0ba3..b846dc64a11874 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -80,7 +80,8 @@ '+-.') # Leading and trailing C0 control and space to be stripped per WHATWG spec -_URL_CHARS_TO_STRIP = "".join([chr(i) for i in range(0, 0x20 + 1)]) +# == "".join([chr(i) for i in range(0, 0x20 + 1)]) +_URL_CHARS_TO_STRIP = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ' # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] From c863a8107ba2d03c4039fe04b451607f3bc1103f Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google LLC]" Date: Mon, 1 May 2023 17:04:54 -0700 Subject: [PATCH 05/10] Only lstrip the URL to avoid breaking applications. Many existing applications rely (for better or worse) on the trailing spaces being preserved by this API. So this change is more conservative and keeps them. The issue this change is addressing is triggered by leading spaces. One example library relying on this behavior: Django's URL validator library (at least in Django 3.2 and earlier; I have not checked later versions). If trailing spaces are stripped, its logic that involves urllib.parse for one logic path within its checks can fail to reject some URLs as invalid. 
--- Doc/library/urllib.parse.rst | 9 ++++----- Lib/test/test_urlparse.py | 21 +++++++++++++++++++++ Lib/urllib/parse.py | 14 ++++++++++---- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 4d009e73461d60..1f482655579a88 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -324,9 +324,9 @@ or on combining URL components into a URL string. ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is decomposed before parsing, no error will be raised. - Following the `WHATWG spec`_ that updates RFC 3986, leading and trailing C0 - control and space characters are stripped from the URL. ``\n``, ``\r`` and - tab ``\t`` characters are removed from the URL at any position. + Following some of the `WHATWG spec`_ that updates RFC 3986, leading C0 + control control and space characters are stripped from the URL. ``\n``, + ``\r`` and tab ``\t`` characters are removed from the URL at any position. .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of @@ -340,8 +340,7 @@ or on combining URL components into a URL string. ASCII newline and tab characters are stripped from the URL. .. versionchanged:: 3.12 - Leading and trailing C0 control and space characters are stripped from - the URL. + Leading WHATWG C0 control and space characters are stripped from the URL. .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 60462059dad5ad..38d8e624b88eca 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -679,6 +679,27 @@ def test_urlsplit_strip_url(self): self.assertEqual(p.port, 80) self.assertEqual(p.geturl(), base_url.encode()) + # Test that trailing space is preserved as some applications rely on + # this within query strings. 
+ query_spaces_url = "https://www.python.org:88/doc/?query= " + p = urllib.parse.urlsplit(noise.decode("utf-8") + query_spaces_url) + self.assertEqual(p.scheme, "https") + self.assertEqual(p.netloc, "www.python.org:88") + self.assertEqual(p.path, "/doc/") + self.assertEqual(p.query, "query= ") + self.assertEqual(p.port, 88) + self.assertEqual(p.geturl(), query_spaces_url) + + p = urllib.parse.urlsplit("www.pypi.org ") + # That "hostname" gets considered a "path" due to the + # trailing space and our existing logic... YUCK... + # and re-assembles via geturl aka urlunsplit into the original. + # django.core.validators.URLValidator (at least through v3.2) relies on + # this, for better or worse, to catch it in a ValidationError via its + # regular expressions. + # Here we test the basic round trip concept of such a trailing space. + self.assertEqual(urllib.parse.urlunsplit(p), "www.pypi.org ") + # with scheme as cache-key url = "//www.python.org/" scheme = noise.decode() + "https" + noise.decode() diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index b846dc64a11874..01953614a314be 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -25,6 +25,10 @@ scenarios for parsing, and for backward compatibility purposes, some parsing quirks from older RFCs are retained. The testcases in test_urlparse.py provides a good indicator of parsing behavior. + +The WHATWG URL Parser spec should also be considered. We are not compliant with +it either due to existing user code API behavior expectations (Hyrum's Law). +It serves as a useful guide when making changes. """ from collections import namedtuple @@ -79,9 +83,9 @@ '0123456789' '+-.') -# Leading and trailing C0 control and space to be stripped per WHATWG spec +# Leading and trailing C0 control and space to be stripped per WHATWG spec. 
# == "".join([chr(i) for i in range(0, 0x20 + 1)]) -_URL_CHARS_TO_STRIP = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ' +_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f ' # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] @@ -456,8 +460,10 @@ def urlsplit(url, scheme='', allow_fragments=True): """ url, scheme, _coerce_result = _coerce_args(url, scheme) - url = url.strip(_URL_CHARS_TO_STRIP) - scheme = scheme.strip(_URL_CHARS_TO_STRIP) + # Only lstrip url as some applications rely on preserving trailing space. + # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both) + url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE) + scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) for b in _UNSAFE_URL_BYTES_TO_REMOVE: url = url.replace(b, "") From fd3e429a1c69b9a3141d5668ff9f110cc6f97035 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google LLC]" Date: Mon, 1 May 2023 17:21:35 -0700 Subject: [PATCH 06/10] Fix the unittests to allow trailing C0 Also be explicit about specifying utf-8 on encode and decode. 
--- Lib/test/test_urlparse.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 38d8e624b88eca..61e67b17294432 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -653,7 +653,7 @@ def test_urlsplit_strip_url(self): noise = bytes(range(0, 0x20 + 1)) base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" - url = noise.decode() + base_url + noise.decode() + url = noise.decode("utf-8") + base_url p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, "http") self.assertEqual(p.netloc, "User:Pass@www.python.org:080") @@ -666,7 +666,7 @@ def test_urlsplit_strip_url(self): self.assertEqual(p.port, 80) self.assertEqual(p.geturl(), base_url) - url = noise + base_url.encode() + noise + url = noise + base_url.encode("utf-8") p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, b"http") self.assertEqual(p.netloc, b"User:Pass@www.python.org:080") @@ -677,7 +677,7 @@ def test_urlsplit_strip_url(self): self.assertEqual(p.password, b"Pass") self.assertEqual(p.hostname, b"www.python.org") self.assertEqual(p.port, 80) - self.assertEqual(p.geturl(), base_url.encode()) + self.assertEqual(p.geturl(), base_url.encode("utf-8")) # Test that trailing space is preserved as some applications rely on # this within query strings. @@ -702,7 +702,7 @@ def test_urlsplit_strip_url(self): # with scheme as cache-key url = "//www.python.org/" - scheme = noise.decode() + "https" + noise.decode() + scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8") for _ in range(2): p = urllib.parse.urlsplit(url, scheme=scheme) self.assertEqual(p.scheme, "https") From 456d238c5a6c8565e4ba1d3edc6d93bfc049b785 Mon Sep 17 00:00:00 2001 From: "Gregory P. 
Smith" Date: Wed, 3 May 2023 14:20:46 -0700 Subject: [PATCH 07/10] remove a duplicate word MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Lumír 'Frenzy' Balhar --- Doc/library/urllib.parse.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 1f482655579a88..db5fa4b3bf19be 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -325,7 +325,7 @@ or on combining URL components into a URL string. decomposed before parsing, no error will be raised. Following some of the `WHATWG spec`_ that updates RFC 3986, leading C0 - control control and space characters are stripped from the URL. ``\n``, + control and space characters are stripped from the URL. ``\n``, ``\r`` and tab ``\t`` characters are removed from the URL at any position. .. versionchanged:: 3.6 From 0f7f9ea830064d3d2e96680ce63eeb503acfd1ad Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 3 May 2023 14:21:53 -0700 Subject: [PATCH 08/10] Update 2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst remove "trailing" --- .../Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst index 97652398a0fd70..e57ac4ed3ac5d7 100644 --- a/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst +++ b/Misc/NEWS.d/next/Security/2023-03-07-20-59-17.gh-issue-102153.14CLSZ.rst @@ -1,3 +1,3 @@ -:func:`urllib.parse.urlsplit` now strips leading and trailing C0 control and -space characters following the controlling specification for URLs defined by -WHATWG in response to CVE-2023-24329. Patch by Illia Volochii. 
+:func:`urllib.parse.urlsplit` now strips leading C0 control and space +characters following the specification for URLs defined by WHATWG in +response to CVE-2023-24329. Patch by Illia Volochii. From a510652af8eb02fd2377accbd66101c81bb326e8 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 17 May 2023 00:09:50 -0700 Subject: [PATCH 09/10] Add urlparse and urlsplit security warnings. The added section describing the situation is longer than I might want, but being more brief just leaves open questions. This is a lighter worded version of my original text proposed in https://discuss.python.org/t/how-to-word-a-warning-about-security-uses-in-urllib-parse-docs/26399 --- Doc/library/urllib.parse.rst | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index db5fa4b3bf19be..edcf815544735b 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -159,6 +159,10 @@ or on combining URL components into a URL string. ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='') + .. warning:: + + The :func:`urlparse` API does not perform validation. See :ref:`URL + parsing security ` for details. .. versionchanged:: 3.2 Added IPv6 URL parsing capabilities. @@ -328,6 +332,11 @@ or on combining URL components into a URL string. control and space characters are stripped from the URL. ``\n``, ``\r`` and tab ``\t`` characters are removed from the URL at any position. + .. warning:: + + The :func:`urlsplit` API does not perform validation. See :ref:`URL + parsing security ` for details. + .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of returning :const:`None`. @@ -418,6 +427,35 @@ or on combining URL components into a URL string. or ``scheme://host/path``). If *url* is not a wrapped URL, it is returned without changes. +.. 
_url-parsing-security: + +URL parsing security +-------------------- + + The :func:`urlsplit` and :func:`urlparse` APIs do not perform **validation** + of inputs. They may not raise errors on inputs that other applications + consider invalid. They may accept and pass through some inputs that might + not be considered URLs elsewhere as unusually split component parts. Their + purpose is for practical functionality rather than purity. + + Instead of raising an exception on unusual input, they may instead return + some components as empty ``""`` strings. Or components may contain more than + perhaps they should. + + We recommend that users of these APIs where the values may be used anywhere + with security implications code defensively. Do some verification within + your code before trusting a returned component part. Does that ``scheme`` + make sense? Is that a sensible ``path``? Is there anything strange about + that ``hostname``? etc. + + What constitutes a URL is not universally well defined. Different + applications have different needs and desired constraints. For instance the + living `WHATWG spec`_ describes what user facing web clients such as a web + browser require. While :rfc:`3986` is more general. These functions + incorporate some aspects of both, but cannot be claimed compliant with + either. Our APIs and code with expectations on their behaviors predate both + standards. We attempt to maintain backwards compatibility. + .. _parsing-ascii-encoded-bytes: Parsing ASCII Encoded Bytes From 766a7d3385418a85e39ccb992a459b8d553d087f Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 17 May 2023 00:26:35 -0700 Subject: [PATCH 10/10] doc formatting and wording tweaks. 
--- Doc/library/urllib.parse.rst | 54 ++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index edcf815544735b..5a9a53f83dace0 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -161,8 +161,8 @@ or on combining URL components into a URL string. .. warning:: - The :func:`urlparse` API does not perform validation. See :ref:`URL - parsing security ` for details. + :func:`urlparse` does not perform validation. See :ref:`URL parsing + security ` for details. .. versionchanged:: 3.2 Added IPv6 URL parsing capabilities. @@ -334,8 +334,8 @@ or on combining URL components into a URL string. .. warning:: - The :func:`urlsplit` API does not perform validation. See :ref:`URL - parsing security ` for details. + :func:`urlsplit` does not perform validation. See :ref:`URL parsing + security ` for details. .. versionchanged:: 3.6 Out-of-range port numbers now raise :exc:`ValueError`, instead of @@ -432,29 +432,29 @@ or on combining URL components into a URL string. URL parsing security -------------------- - The :func:`urlsplit` and :func:`urlparse` APIs do not perform **validation** - of inputs. They may not raise errors on inputs that other applications - consider invalid. They may accept and pass through some inputs that might - not be considered URLs elsewhere as unusually split component parts. Their - purpose is for practical functionality rather than purity. - - Instead of raising an exception on unusual input, they may instead return - some components as empty ``""`` strings. Or components may contain more than - perhaps they should. - - We recommend that users of these APIs where the values may be used anywhere - with security implications code defensively. Do some verification within - your code before trusting a returned component part. Does that ``scheme`` - make sense? Is that a sensible ``path``? 
Is there anything strange about - that ``hostname``? etc. - - What constitutes a URL is not universally well defined. Different - applications have different needs and desired constraints. For instance the - living `WHATWG spec`_ describes what user facing web clients such as a web - browser require. While :rfc:`3986` is more general. These functions - incorporate some aspects of both, but cannot be claimed compliant with - either. Our APIs and code with expectations on their behaviors predate both - standards. We attempt to maintain backwards compatibility. +The :func:`urlsplit` and :func:`urlparse` APIs do not perform **validation** of +inputs. They may not raise errors on inputs that other applications consider +invalid. They may also succeed on some inputs that might not be considered +URLs elsewhere. Their purpose is for practical functionality rather than +purity. + +Instead of raising an exception on unusual input, they may instead return some +component parts as empty strings. Or components may contain more than perhaps +they should. + +We recommend that users of these APIs where the values may be used anywhere +with security implications code defensively. Do some verification within your +code before trusting a returned component part. Does that ``scheme`` make +sense? Is that a sensible ``path``? Is there anything strange about that +``hostname``? etc. + +What constitutes a URL is not universally well defined. Different applications +have different needs and desired constraints. For instance the living `WHATWG +spec`_ describes what user facing web clients such as a web browser require. +While :rfc:`3986` is more general. These functions incorporate some aspects of +both, but cannot be claimed compliant with either. The APIs and existing user +code with expectations on specific behaviors predate both standards leading us +to be very cautious about making API behavior changes. .. _parsing-ascii-encoded-bytes: