diff --git a/CHANGES.rst b/CHANGES.rst index 600ec0b20..2150ad299 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -8,6 +8,8 @@ Unreleased - Restore behavior where parsing `multipart/x-www-form-urlencoded` data with invalid UTF-8 bytes in the body results in no form data parsed rather than a 413 error. :issue:`2930` +- Improve ``parse_options_header`` performance when parsing unterminated + quoted string values. :issue:`2907` Version 3.0.3 diff --git a/src/werkzeug/http.py b/src/werkzeug/http.py index 27fa9af90..c0ec92389 100644 --- a/src/werkzeug/http.py +++ b/src/werkzeug/http.py @@ -395,22 +395,8 @@ def parse_dict_header(value: str) -> dict[str, str | None]: # https://httpwg.org/specs/rfc9110.html#parameter -_parameter_re = re.compile( - r""" - # don't match multiple empty parts, that causes backtracking - \s*;\s* # find the part delimiter - (?: - ([\w!#$%&'*+\-.^`|~]+) # key, one or more token chars - = # equals, with no space on either side - ( # value, token or quoted string - [\w!#$%&'*+\-.^`|~]+ # one or more token chars - | - "(?:\\\\|\\"|.)*?" # quoted string, consuming slash escapes - ) - )? # optionally match key=value, to account for empty parts - """, - re.ASCII | re.VERBOSE, -) +_parameter_key_re = re.compile(r"([\w!#$%&'*+\-.^`|~]+)=", flags=re.ASCII) +_parameter_token_value_re = re.compile(r"[\w!#$%&'*+\-.^`|~]+", flags=re.ASCII) # https://www.rfc-editor.org/rfc/rfc2231#section-4 _charset_value_re = re.compile( r""" @@ -492,18 +478,49 @@ def parse_options_header(value: str | None) -> tuple[str, dict[str, str]]: # empty (invalid) value, or value without options return value, {} - rest = f";{rest}" + # Collect all valid key=value parts without processing the value. + parts: list[tuple[str, str]] = [] + + while True: + if (m := _parameter_key_re.match(rest)) is not None: + pk = m.group(1).lower() + rest = rest[m.end() :] + + # Value may be a token. + if (m := _parameter_token_value_re.match(rest)) is not None: + parts.append((pk, m.group())) + + # Value may be a quoted string, find the closing quote. + elif rest[:1] == '"': + pos = 1 + length = len(rest) + + while pos < length: + if rest[pos : pos + 2] in {"\\\\", '\\"'}: + # Consume escaped slashes and quotes. + pos += 2 + elif rest[pos] == '"': + # Stop at an unescaped quote. + parts.append((pk, rest[: pos + 1])) + rest = rest[pos + 1 :] + break + else: + # Consume any other character. + pos += 1 + + # Find the next section delimited by `;`, if any. + if (end := rest.find(";")) == -1: + break + + rest = rest[end + 1 :].lstrip() + options: dict[str, str] = {} encoding: str | None = None continued_encoding: str | None = None - for pk, pv in _parameter_re.findall(rest): - if not pk: - # empty or invalid part - continue - - pk = pk.lower() - + # For each collected part, process optional charset and continuation, + # unquote quoted values. + for pk, pv in parts: if pk[-1] == "*": # key*=charset''value becomes key=value, where value is percent encoded pk = pk[:-1] diff --git a/tests/test_http.py b/tests/test_http.py index bbd51ba33..02e5eb512 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -361,8 +361,8 @@ def test_parse_options_header_empty(self, value, expect): ('v;a="b\\"c";d=e', {"a": 'b"c', "d": "e"}), # HTTP headers use \\ for internal \ ('v;a="c:\\\\"', {"a": "c:\\"}), - # Invalid trailing slash in quoted part is left as-is. - ('v;a="c:\\"', {"a": "c:\\"}), + # Part with invalid trailing slash is discarded. + ('v;a="c:\\"', {}), ('v;a="b\\\\\\"c"', {"a": 'b\\"c'}), # multipart form data uses %22 for internal " ('v;a="b%22c"', {"a": 'b"c'}), @@ -377,6 +377,8 @@ def test_parse_options_header_empty(self, value, expect): ("v;a*0=b;a*1=c;d=e", {"a": "bc", "d": "e"}), ("v;a*0*=b", {"a": "b"}), ("v;a*0*=UTF-8''b;a*1=c;a*2*=%C2%B5", {"a": "bcµ"}), + # Long invalid quoted string with trailing slashes does not freeze. + ('v;a="' + "\\" * 400, {}), ], ) def test_parse_options_header(self, value, expect) -> None: