From 101924d8d85b6babc83cbec891d6b8b9288079ca Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Thu, 19 Oct 2023 10:43:21 +0100 Subject: [PATCH 1/8] Limit which text codecs are supported --- httpx/_utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/httpx/_utils.py b/httpx/_utils.py index 1775b1a1ef..305118c0f0 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -25,6 +25,12 @@ r"|".join([re.escape(c) for c in _HTML5_FORM_ENCODING_REPLACEMENTS.keys()]) ) +# Text codecs as supported by Chromium, Oct. 2023. +# https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36 +SUPPORTED_CODECS = [ + 'utf-8', 'utf-16le', 'iso-8859-1', 'windows-1252', 'gbk', 'gb18030', 'big5', 'big5-hkscs', 'euc-kr', 'shift-jis', 'euc-jp', 'iso-2022-jp', 'windows-874', 'iso-8859-15', 'macintosh', 'iso-8859-2', 'windows-1250', 'iso-8859-5', 'windows-1251', 'koi8-r', 'koi8-u', 'iso-8859-7', 'windows-1253', 'windows-1254', 'windows-1256', 'iso-8859-6', 'windows-1255', 'iso-8859-8-i', 'iso-8859-8', 'windows-1258', 'iso-8859-4', 'iso-8859-13', 'windows-1257', 'iso-8859-3', 'iso-8859-10', 'iso-8859-14', 'iso-8859-16' +] + def normalize_header_key( value: typing.Union[str, bytes], @@ -72,6 +78,15 @@ def is_known_encoding(encoding: str) -> bool: """ Return `True` if `encoding` is a known codec. """ + # Only allow text codecs within our supported range. + if encoding.lower().replace('_', '-') not in SUPPORTED_CODECS: + return False + + # Also ensure that the codec is actually available. + # At the point of writing this was true for all the SUPPORTED_CODECS + # except "windows-874", "iso-8859-8-i", when using cpython. + # But there *could* feasibly be a different set of codecs available + # under some installations. 
try: codecs.lookup(encoding) except LookupError: From 6366bb8993af75faf4f1fe9c2daa90d457d09644 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Thu, 19 Oct 2023 11:06:04 +0100 Subject: [PATCH 2/8] Linting --- httpx/_utils.py | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/httpx/_utils.py b/httpx/_utils.py index 305118c0f0..cc1cbecd9d 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -28,7 +28,43 @@ # Text codecs as supported by Chromium, Oct. 2023. # https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36 SUPPORTED_CODECS = [ - 'utf-8', 'utf-16le', 'iso-8859-1', 'windows-1252', 'gbk', 'gb18030', 'big5', 'big5-hkscs', 'euc-kr', 'shift-jis', 'euc-jp', 'iso-2022-jp', 'windows-874', 'iso-8859-15', 'macintosh', 'iso-8859-2', 'windows-1250', 'iso-8859-5', 'windows-1251', 'koi8-r', 'koi8-u', 'iso-8859-7', 'windows-1253', 'windows-1254', 'windows-1256', 'iso-8859-6', 'windows-1255', 'iso-8859-8-i', 'iso-8859-8', 'windows-1258', 'iso-8859-4', 'iso-8859-13', 'windows-1257', 'iso-8859-3', 'iso-8859-10', 'iso-8859-14', 'iso-8859-16' + "utf-8", + "utf-16le", + "iso-8859-1", + "windows-1252", + "gbk", + "gb18030", + "big5", + "big5-hkscs", + "euc-kr", + "shift-jis", + "euc-jp", + "iso-2022-jp", + "windows-874", + "iso-8859-15", + "macintosh", + "iso-8859-2", + "windows-1250", + "iso-8859-5", + "windows-1251", + "koi8-r", + "koi8-u", + "iso-8859-7", + "windows-1253", + "windows-1254", + "windows-1256", + "iso-8859-6", + "windows-1255", + "iso-8859-8-i", + "iso-8859-8", + "windows-1258", + "iso-8859-4", + "iso-8859-13", + "windows-1257", + "iso-8859-3", + "iso-8859-10", + "iso-8859-14", + "iso-8859-16", ] @@ -79,7 +115,7 @@ def is_known_encoding(encoding: str) -> bool: Return `True` if `encoding` is a known codec. """ # Only allow text codecs within our supported range. 
- if encoding.lower().replace('_', '-') not in SUPPORTED_CODECS: + if encoding.lower().replace("_", "-") not in SUPPORTED_CODECS: return False # Also ensure that the codec is actually available. From 8e8ef6e9c62ae9c88299cfc459fef343969999e7 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Thu, 19 Oct 2023 11:19:52 +0100 Subject: [PATCH 3/8] Use set instead of list --- httpx/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/httpx/_utils.py b/httpx/_utils.py index cc1cbecd9d..875dd29d16 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -27,7 +27,7 @@ # Text codecs as supported by Chromium, Oct. 2023. # https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36 -SUPPORTED_CODECS = [ +SUPPORTED_CODECS = { "utf-8", "utf-16le", "iso-8859-1", @@ -65,7 +65,7 @@ "iso-8859-10", "iso-8859-14", "iso-8859-16", -] +} def normalize_header_key( From cff58c91dbee17701e37a175c6be6cdde8a573ec Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Thu, 19 Oct 2023 12:23:39 +0100 Subject: [PATCH 4/8] Supported text codecs should handle available aliases --- httpx/_utils.py | 95 +++++++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 50 deletions(-) diff --git a/httpx/_utils.py b/httpx/_utils.py index 875dd29d16..eb1708a0ed 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -25,46 +25,49 @@ r"|".join([re.escape(c) for c in _HTML5_FORM_ENCODING_REPLACEMENTS.keys()]) ) -# Text codecs as supported by Chromium, Oct. 2023. +# For our supported text codecs, we start with the text codecs as supported by Chromium, Oct. 2023. # https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36 +# +# Then limit those to any which documented as included by cpython, +# which drops "windows-874", "iso-8859-8-i". +# +# Then make sure we're referencing them with the canonical name as used by the Python codecs. 
SUPPORTED_CODECS = { - "utf-8", - "utf-16le", - "iso-8859-1", - "windows-1252", - "gbk", - "gb18030", - "big5", - "big5-hkscs", - "euc-kr", - "shift-jis", - "euc-jp", - "iso-2022-jp", - "windows-874", - "iso-8859-15", - "macintosh", - "iso-8859-2", - "windows-1250", - "iso-8859-5", - "windows-1251", - "koi8-r", - "koi8-u", - "iso-8859-7", - "windows-1253", - "windows-1254", - "windows-1256", - "iso-8859-6", - "windows-1255", - "iso-8859-8-i", - "iso-8859-8", - "windows-1258", - "iso-8859-4", - "iso-8859-13", - "windows-1257", - "iso-8859-3", - "iso-8859-10", - "iso-8859-14", - "iso-8859-16", + "big5", # big5 + "big5hkscs", # big5-hkscs + "cp1250", # windows-1250 + "cp1251", # windows-1251 + "cp1252", # windows-1252 + "cp1253", # windows-1253 + "cp1254", # windows-1254 + "cp1255", # windows-1255 + "cp1256", # windows-1256 + "cp1257", # windows-1257 + "cp1258", # windows-1258 + "euc_jp", # euc-jp + "euc_kr", # euc-kr + "gb18030", # gb18030 + "gbk", # gbk + "iso2022_jp", # iso-2022-jp + "iso8859-1", # iso-8859-1 + "iso8859-2", # iso-8859-2 + "iso8859-3", # iso-8859-3 + "iso8859-4", # iso-8859-4 + "iso8859-5", # iso-8859-5 + "iso8859-6", # iso-8859-6 + "iso8859-7", # iso-8859-7 + "iso8859-8", # iso-8859-8 + "iso8859-10", # iso-8859-10 + "iso8859-13", # iso-8859-13 + "iso8859-14", # iso-8859-14 + "iso8859-15", # iso-8859-15 + "iso8859-16", # iso-8859-16 + "koi8-r", # koi8-r + "koi8-u", # koi8-u + "mac-roman", # macintosh + "shift_jis", # shift-jis + "utf-8", # utf-8 + "utf-16-le", # utf-16le } @@ -112,22 +115,14 @@ def primitive_value_to_str(value: "PrimitiveData") -> str: def is_known_encoding(encoding: str) -> bool: """ - Return `True` if `encoding` is a known codec. + Return `True` if `encoding` is a supported text codec. """ - # Only allow text codecs within our supported range. - if encoding.lower().replace("_", "-") not in SUPPORTED_CODECS: - return False - - # Also ensure that the codec is actually available. 
- # At the point of writing this was true for all the SUPPORTED_CODECS - # except "windows-874", "iso-8859-8-i", when using cpython. - # But there *could* feasibly be a different set of codecs available - # under some installations. try: - codecs.lookup(encoding) + codec = codecs.lookup(encoding) except LookupError: return False - return True + + return codec.name in SUPPORTED_CODECS def format_form_param(name: str, value: str) -> bytes: From 91a11cf8c28ff541bfd00ce9dca08fffa3e56688 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Thu, 19 Oct 2023 12:28:27 +0100 Subject: [PATCH 5/8] Update comment --- httpx/_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/httpx/_utils.py b/httpx/_utils.py index eb1708a0ed..f2c54b3ede 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -28,10 +28,11 @@ # For our supported text codecs, we start with the text codecs as supported by Chromium, Oct. 2023. # https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36 # -# Then limit those to any which documented as included by cpython, -# which drops "windows-874", "iso-8859-8-i". +# Then limit them to only include codecs which are documented as included by cpython. +# https://docs.python.org/3/library/codecs.html#standard-encodings # -# Then make sure we're referencing them with the canonical name as used by the Python codecs. +# We're referencing them with the canonical name as used by the Python codecs. +# The alias given in the chromium source is included as a comment for comparison. 
SUPPORTED_CODECS = { "big5", # big5 "big5hkscs", # big5-hkscs From fab863679016c6c3134e53ec0674fe106571811c Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Thu, 19 Oct 2023 12:30:03 +0100 Subject: [PATCH 6/8] Linting --- httpx/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/httpx/_utils.py b/httpx/_utils.py index f2c54b3ede..dc809e4582 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -32,7 +32,7 @@ # https://docs.python.org/3/library/codecs.html#standard-encodings # # We're referencing them with the canonical name as used by the Python codecs. -# The alias given in the chromium source is included as a comment for comparison. +# The alias given in the chromium source is included as a comment for comparison. SUPPORTED_CODECS = { "big5", # big5 "big5hkscs", # big5-hkscs From 6d4ca0b48b1796bf704fe777e0e6a10e6b068058 Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Thu, 19 Oct 2023 12:38:07 +0100 Subject: [PATCH 7/8] Include the full set of supported UTF encodings --- httpx/_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/httpx/_utils.py b/httpx/_utils.py index dc809e4582..9bfa779f4b 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -69,6 +69,13 @@ "shift_jis", # shift-jis "utf-8", # utf-8 "utf-16-le", # utf-16le + # We also support the following UTF flavors... + "utf-8-sig", + "utf-16", + "utf-16-be", + "utf-32", + "utf-32-be", + "utf-32-le", } From 85669525675c20ca5f5875cce5d7995710c860ed Mon Sep 17 00:00:00 2001 From: Tom Christie Date: Tue, 31 Oct 2023 10:16:14 +0000 Subject: [PATCH 8/8] Drop unneccessary JSON encodings --- httpx/_utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/httpx/_utils.py b/httpx/_utils.py index 4ae6dba7d1..f95658b6ce 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -69,13 +69,6 @@ "shift_jis", # shift-jis "utf-8", # utf-8 "utf-16-le", # utf-16le - # We also support the following UTF flavors... 
- "utf-8-sig", - "utf-16", - "utf-16-be", - "utf-32", - "utf-32-be", - "utf-32-le", }