From 2e279e85fece187b6058718ac7e82d1692461e26 Mon Sep 17 00:00:00 2001
From: "Gregory P. Smith"
Date: Sat, 10 Dec 2022 16:17:39 -0800
Subject: [PATCH] gh-88500: Reduce memory use of `urllib.unquote` (#96763)

`urllib.unquote_to_bytes` and `urllib.unquote` could both potentially
generate `O(len(string))` intermediate `bytes` or `str` objects while
computing the unquoted final result, depending on the input provided.
As Python objects are relatively large, this could consume a lot of
RAM.

This switches the implementation to use an expanding `bytearray` and a
generator internally instead of precomputed `split()`-style operations.

Microbenchmarks with some antagonistic inputs like
`mess = "\u0141%%%20a%fe"*1000` show this is 10-20% slower for
`unquote` and `unquote_to_bytes`, and no different for typical inputs
that are short or lack much Unicode or % escaping. But the functions
are already quite fast anyway, so this is not a big deal. The slowdown
scales linearly with input size, as expected.

Memory usage was observed manually using `/usr/bin/time -v` on
`python -m timeit` runs of larger inputs. Unit testing memory
consumption is difficult and does not seem worthwhile.

Observed memory usage is ~1/2 for `unquote()` and <1/3 for
`unquote_to_bytes()` using
`python -m timeit -s 'from urllib.parse import unquote, unquote_to_bytes; v="\u0141%01\u0161%20"*500_000' 'unquote_to_bytes(v)'`
as a test.
---
 Lib/test/test_urllib.py                       |  2 ++
 Lib/urllib/parse.py                           | 30 ++++++++++++-------
 ...2-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst |  2 ++
 3 files changed, 23 insertions(+), 11 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst

diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index f067560ca6caa1..2df74f5e6f99b2 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -1104,6 +1104,8 @@ def test_unquoting(self):
         self.assertEqual(result.count('%'), 1,
                          "using unquote(): not all characters escaped: "
                          "%s" % result)
+
+    def test_unquote_rejects_none_and_tuple(self):
         self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None)
         self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ())

diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 4f6867accbc0eb..5f95c5ff7f9c1c 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -600,6 +600,9 @@ def urldefrag(url):
 
 def unquote_to_bytes(string):
     """unquote_to_bytes('abc%20def') -> b'abc def'."""
+    return bytes(_unquote_impl(string))
+
+def _unquote_impl(string: bytes | bytearray | str) -> bytes | bytearray:
     # Note: strings are encoded as UTF-8. This is only an issue if it contains
     # unescaped non-ASCII characters, which URIs should not.
     if not string:
@@ -611,8 +614,8 @@ def unquote_to_bytes(string):
     bits = string.split(b'%')
     if len(bits) == 1:
         return string
-    res = [bits[0]]
-    append = res.append
+    res = bytearray(bits[0])
+    append = res.extend
     # Delay the initialization of the table to not waste memory
     # if the function is never called
     global _hextobyte
@@ -626,10 +629,20 @@ def unquote_to_bytes(string):
         except KeyError:
             append(b'%')
             append(item)
-    return b''.join(res)
+    return res
 
 _asciire = re.compile('([\x00-\x7f]+)')
 
+def _generate_unquoted_parts(string, encoding, errors):
+    previous_match_end = 0
+    for ascii_match in _asciire.finditer(string):
+        start, end = ascii_match.span()
+        yield string[previous_match_end:start]  # Non-ASCII
+        # The ascii_match[1] group == string[start:end].
+        yield _unquote_impl(ascii_match[1]).decode(encoding, errors)
+        previous_match_end = end
+    yield string[previous_match_end:]  # Non-ASCII tail
+
 def unquote(string, encoding='utf-8', errors='replace'):
     """Replace %xx escapes by their single-character equivalent. The optional
     encoding and errors parameters specify how to decode percent-encoded
@@ -641,21 +654,16 @@ def unquote(string, encoding='utf-8', errors='replace'):
     unquote('abc%20def') -> 'abc def'.
     """
     if isinstance(string, bytes):
-        return unquote_to_bytes(string).decode(encoding, errors)
+        return _unquote_impl(string).decode(encoding, errors)
     if '%' not in string:
+        # Is it a string-like object?
+        string.split
         return string
     if encoding is None:
         encoding = 'utf-8'
     if errors is None:
         errors = 'replace'
-    bits = _asciire.split(string)
-    res = [bits[0]]
-    append = res.append
-    for i in range(1, len(bits), 2):
-        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
-        append(bits[i + 1])
-    return ''.join(res)
+    return ''.join(_generate_unquoted_parts(string, encoding, errors))
 
 
 def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
diff --git a/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst b/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst
new file mode 100644
index 00000000000000..ad01f5e16b16a9
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst
@@ -0,0 +1,2 @@
+Reduced the memory usage of :func:`urllib.parse.unquote` and
+:func:`urllib.parse.unquote_to_bytes` on large values.
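
Note, for readers outside the diff context: below is a minimal standalone
sketch of the technique this patch applies. It is illustrative only, not
the stdlib code; the `toy_unquote_split` and `toy_unquote_bytearray`
names are hypothetical, and neither enforces the exact two-hex-digit
escape rule that the real functions do.

    # Old approach: split() materializes a list of bytes fragments;
    # every '%' in the input adds intermediate objects that stay alive
    # until the final join.
    def toy_unquote_split(data: bytes) -> bytes:
        bits = data.split(b'%')
        res = [bits[0]]
        for item in bits[1:]:
            try:
                res.append(bytes([int(item[:2], 16)]))
                res.append(item[2:])
            except ValueError:
                res.append(b'%')
                res.append(item)
        return b''.join(res)

    # New approach: one growing bytearray; decoded bytes are appended
    # in place, so no list of intermediate objects accumulates.
    def toy_unquote_bytearray(data: bytes) -> bytes:
        res = bytearray()
        start = 0
        while (idx := data.find(b'%', start)) != -1:
            res += data[start:idx]
            try:
                res.append(int(data[idx + 1:idx + 3], 16))
                start = idx + 3
            except ValueError:  # malformed escape: keep the '%' literal
                res += b'%'
                start = idx + 1
        res += data[start:]
        return bytes(res)

    if __name__ == '__main__':
        sample = b'abc%20def%fe%zz'
        assert toy_unquote_split(sample) == toy_unquote_bytearray(sample)
        print(toy_unquote_bytearray(sample))  # b'abc def\xfe%zz'

The `str` path in the patch applies the same idea: `_generate_unquoted_parts`
replaces the precomputed `_asciire.split()` list with lazy `finditer()`
matches, so `''.join` consumes decoded chunks from a generator rather than
from a second fully materialized list.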