Skip to content

Commit

Permalink
Cleanup URL percent-encoding behavior. (#2990)
Browse files Browse the repository at this point in the history
* Replace path_query_fragment encoding tests

* Remove replaced test cases

* Fix test case to use correct hex sequence for 'abc'

* Fix 'quote' behaviour so we don't double-escape.

* Add '/' to safe chars in query strings

* Update docstring

* Linting

* Update outdated comment.

* Revert unrelated change

---------

Co-authored-by: Kar Petrosyan <92274156+karpetrosyan@users.noreply.github.com>
  • Loading branch information
tomchristie and karpetrosyan authored Dec 15, 2023
1 parent 3b9060e commit a11fc38
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 42 deletions.
51 changes: 43 additions & 8 deletions httpx/_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,10 +260,8 @@ def urlparse(url: str = "", **kwargs: typing.Optional[str]) -> ParseResult:
# For 'path' we need to drop ? and # from the GEN_DELIMS set.
parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
# For 'query' we need to drop '#' from the GEN_DELIMS set.
# We also exclude '/' because it is more robust to replace it with a percent
# encoding despite it not being a requirement of the spec.
parsed_query: typing.Optional[str] = (
None if query is None else quote(query, safe=SUB_DELIMS + ":?[]@")
None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
)
# For 'fragment' we can include all of the GEN_DELIMS set.
parsed_fragment: typing.Optional[str] = (
Expand Down Expand Up @@ -432,13 +430,12 @@ def is_safe(string: str, safe: str = "/") -> bool:
if char not in NON_ESCAPED_CHARS:
return False

# Any '%' characters must be valid '%xx' escape sequences.
return string.count("%") == len(PERCENT_ENCODED_REGEX.findall(string))
return True


def quote(string: str, safe: str = "/") -> str:
def percent_encoded(string: str, safe: str = "/") -> str:
"""
Use percent-encoding to quote a string if required.
Use percent-encoding to quote a string.
"""
if is_safe(string, safe=safe):
return string
Expand All @@ -449,6 +446,39 @@ def quote(string: str, safe: str = "/") -> str:
)


def quote(string: str, safe: str = "/") -> str:
"""
Use percent-encoding to quote a string, omitting existing '%xx' escape sequences.
See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1
* `string`: The string to be percent-escaped.
* `safe`: A string containing characters that may be treated as safe, and do not
need to be escaped. Unreserved characters are always treated as safe.
See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
"""
parts = []
current_position = 0
for match in re.finditer(PERCENT_ENCODED_REGEX, string):
start_position, end_position = match.start(), match.end()
matched_text = match.group(0)
# Add any text up to the '%xx' escape sequence.
if start_position != current_position:
leading_text = string[current_position:start_position]
parts.append(percent_encoded(leading_text, safe=safe))

# Add the '%xx' escape sequence.
parts.append(matched_text)
current_position = end_position

# Add any text after the final '%xx' escape sequence.
if current_position != len(string):
trailing_text = string[current_position:]
parts.append(percent_encoded(trailing_text, safe=safe))

return "".join(parts)


def urlencode(items: typing.List[typing.Tuple[str, str]]) -> str:
"""
We can use a much simpler version of the stdlib urlencode here because
Expand All @@ -464,4 +494,9 @@ def urlencode(items: typing.List[typing.Tuple[str, str]]) -> str:
- https://github.com/encode/httpx/issues/2721
- https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
"""
return "&".join([quote(k, safe="") + "=" + quote(v, safe="") for k, v in items])
return "&".join(
[
percent_encoded(k, safe="") + "=" + percent_encoded(v, safe="")
for k, v in items
]
)
104 changes: 70 additions & 34 deletions tests/models/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,56 +70,92 @@ def test_url_no_authority():
# Tests for percent encoding across path, query, and fragment...


def test_path_percent_encoding():
# Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS
url = httpx.URL("https://example.com/!$&'()*+,;= abc ABC 123 :/[]@")
assert url.raw_path == b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@"
assert url.path == "/!$&'()*+,;= abc ABC 123 :/[]@"
assert url.query == b""
assert url.fragment == ""


def test_query_percent_encoding():
# Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS
url = httpx.URL("https://example.com/?!$&'()*+,;= abc ABC 123 :/[]@" + "?")
assert url.raw_path == b"/?!$&'()*+,;=%20abc%20ABC%20123%20:%2F[]@?"
assert url.path == "/"
assert url.query == b"!$&'()*+,;=%20abc%20ABC%20123%20:%2F[]@?"
assert url.fragment == ""


def test_fragment_percent_encoding():
# Test percent encoding for SUB_DELIMS ALPHA NUM and allowable GEN_DELIMS
url = httpx.URL("https://example.com/#!$&'()*+,;= abc ABC 123 :/[]@" + "?#")
assert url.raw_path == b"/"
assert url.path == "/"
assert url.query == b""
assert url.fragment == "!$&'()*+,;= abc ABC 123 :/[]@?#"
@pytest.mark.parametrize(
"url,raw_path,path,query,fragment",
[
# URL with unescaped chars in path.
(
"https://example.com/!$&'()*+,;= abc ABC 123 :/[]@",
b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@",
"/!$&'()*+,;= abc ABC 123 :/[]@",
b"",
"",
),
# URL with escaped chars in path.
(
"https://example.com/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@",
b"/!$&'()*+,;=%20abc%20ABC%20123%20:/[]@",
"/!$&'()*+,;= abc ABC 123 :/[]@",
b"",
"",
),
# URL with mix of unescaped and escaped chars in path.
# WARNING: This has the incorrect behaviour, adding the test as an interim step.
(
"https://example.com/ %61%62%63",
b"/%20%61%62%63",
"/ abc",
b"",
"",
),
# URL with unescaped chars in query.
(
"https://example.com/?!$&'()*+,;= abc ABC 123 :/[]@?",
b"/?!$&'()*+,;=%20abc%20ABC%20123%20:/[]@?",
"/",
b"!$&'()*+,;=%20abc%20ABC%20123%20:/[]@?",
"",
),
# URL with escaped chars in query.
(
"https://example.com/?!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?",
b"/?!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?",
"/",
b"!$&%27()*+,;=%20abc%20ABC%20123%20:%2F[]@?",
"",
),
# URL with mix of unescaped and escaped chars in query.
(
"https://example.com/?%20%97%98%99",
b"/?%20%97%98%99",
"/",
b"%20%97%98%99",
"",
),
# URL encoding characters in fragment.
(
"https://example.com/#!$&'()*+,;= abc ABC 123 :/[]@?#",
b"/",
"/",
b"",
"!$&'()*+,;= abc ABC 123 :/[]@?#",
),
],
)
def test_path_query_fragment(url, raw_path, path, query, fragment):
url = httpx.URL(url)
assert url.raw_path == raw_path
assert url.path == path
assert url.query == query
assert url.fragment == fragment


def test_url_query_encoding():
"""
URL query parameters should use '%20' to encoding spaces,
URL query parameters should use '%20' for encoding spaces,
and should treat '/' as a safe character. This behaviour differs
across clients, but we're matching browser behaviour here.
See https://github.com/encode/httpx/issues/2536
and https://github.com/encode/httpx/discussions/2460
"""
url = httpx.URL("https://www.example.com/?a=b c&d=e/f")
assert url.raw_path == b"/?a=b%20c&d=e%2Ff"
assert url.raw_path == b"/?a=b%20c&d=e/f"

url = httpx.URL("https://www.example.com/", params={"a": "b c", "d": "e/f"})
assert url.raw_path == b"/?a=b%20c&d=e%2Ff"


def test_url_with_url_encoded_path():
url = httpx.URL("https://www.example.com/path%20to%20somewhere")
assert url.path == "/path to somewhere"
assert url.query == b""
assert url.raw_path == b"/path%20to%20somewhere"


def test_url_params():
url = httpx.URL("https://example.org:123/path/to/somewhere", params={"a": "123"})
assert str(url) == "https://example.org:123/path/to/somewhere?a=123"
Expand Down

0 comments on commit a11fc38

Please sign in to comment.