Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Differentiate between 'url.host' and 'url.raw_host' #1590

Merged
merged 7 commits into from
Apr 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 89 additions & 28 deletions httpx/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from http.cookiejar import Cookie, CookieJar
from urllib.parse import parse_qsl, quote, unquote, urlencode

import idna
import rfc3986
import rfc3986.exceptions

Expand Down Expand Up @@ -60,33 +61,45 @@

class URL:
"""
url = httpx.URL("HTTPS://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink")
url = httpx.URL("HTTPS://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink")

assert url.scheme == "https"
assert url.username == "jo@email.com"
assert url.password == "a secret"
assert url.userinfo == b"jo%40email.com:a%20secret"
assert url.host == "example.com"
assert url.host == "müller.de"
assert url.raw_host == b"xn--mller-kva.de"
assert url.port == 1234
assert url.netloc == "example.com:1234"
assert url.netloc == b"xn--mller-kva.de:1234"
assert url.path == "/pa th"
assert url.query == b"?search=ab"
assert url.raw_path == b"/pa%20th?search=ab"
assert url.fragment == "anchorlink"

The components of a URL are broken down like this:

https://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink
[scheme][ username ] [password] [ host ][port][ path ] [ query ] [fragment]
[ userinfo ] [ netloc ][ raw_path ]
https://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink
[scheme] [ username ] [password] [ host ][port][ path ] [ query ] [fragment]
[ userinfo ] [ netloc ][ raw_path ]

Note that:

* `url.scheme` is normalized to always be lowercased.

* `url.host` is normalized to always be lowercased, and is IDNA encoded. For instance:
url = httpx.URL("http://中国.icom.museum")
assert url.host == "xn--fiqs8s.icom.museum"
* `url.host` is normalized to always be lowercased. Internationalized domain
names are represented in unicode, without IDNA encoding applied. For instance:

url = httpx.URL("http://中国.icom.museum")
assert url.host == "中国.icom.museum"
url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.host == "中国.icom.museum"

* `url.raw_host` is normalized to always be lowercased, and is IDNA encoded. For instance:

url = httpx.URL("http://中国.icom.museum")
assert url.raw_host == b"xn--fiqs8s.icom.museum"
url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.raw_host == b"xn--fiqs8s.icom.museum"

* `url.userinfo` is raw bytes, without URL escaping. Usually you'll want to work with
`url.username` and `url.password` instead, which handle the URL escaping.
Expand Down Expand Up @@ -150,6 +163,14 @@ def scheme(self) -> str:
"""
return self._uri_reference.scheme or ""

@property
def raw_scheme(self) -> bytes:
"""
The raw bytes representation of the URL scheme, such as b"http", b"https".
Always normalized to lowercase.
"""
return self.scheme.encode("ascii")

@property
def userinfo(self) -> bytes:
"""
Expand Down Expand Up @@ -181,26 +202,60 @@ def password(self) -> str:
def host(self) -> str:
"""
The URL host as a string.
Always normlized to lowercase, and IDNA encoded.
Always normalized to lowercase, with IDNA hosts decoded into unicode.

Examples:

url = httpx.URL("http://www.EXAMPLE.org")
assert url.host == "www.example.org"

url = httpx.URL("http://中国.icom.museum")
assert url.host == "xn--fiqs8s.icom.museum"
assert url.host == "中国.icom.museum"

url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.host == "中国.icom.museum"

url = httpx.URL("https://[::ffff:192.168.0.1]")
assert url.host == "::ffff:192.168.0.1"
"""
host: str = self._uri_reference.host
host: str = self._uri_reference.host or ""

if host and ":" in host and host[0] == "[":
# it's an IPv6 address
host = host.lstrip("[").rstrip("]")

if host.startswith("xn--"):
host = idna.decode(host)

return host

@property
def raw_host(self) -> bytes:
"""
The raw bytes representation of the URL host.
Always normalized to lowercase, and IDNA encoded.

Examples:

url = httpx.URL("http://www.EXAMPLE.org")
assert url.raw_host == b"www.example.org"

url = httpx.URL("http://中国.icom.museum")
assert url.raw_host == b"xn--fiqs8s.icom.museum"

url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.raw_host == b"xn--fiqs8s.icom.museum"

url = httpx.URL("https://[::ffff:192.168.0.1]")
assert url.raw_host == b"::ffff:192.168.0.1"
"""
host: str = self._uri_reference.host or ""

if host and ":" in host and host[0] == "[":
# it's an IPv6 address
host = host.lstrip("[").rstrip("]")

return host or ""
return host.encode("ascii")

@property
def port(self) -> typing.Optional[int]:
Expand All @@ -211,14 +266,17 @@ def port(self) -> typing.Optional[int]:
return int(port) if port else None

@property
def netloc(self) -> str:
def netloc(self) -> bytes:
"""
Either `<host>` or `<host>:<port>` as a string.
Always normlized to lowercase, and IDNA encoded.
Either `<host>` or `<host>:<port>` as bytes.
Always normalized to lowercase, and IDNA encoded.
"""
host = self._uri_reference.host or ""
port = self._uri_reference.port
return host if port is None else f"{host}:{port}"
netloc = host.encode("ascii")
if port:
netloc = netloc + b":" + str(port).encode("ascii")
return netloc

@property
def path(self) -> str:
Expand Down Expand Up @@ -277,8 +335,8 @@ def raw(self) -> RawURL:
Provides the (scheme, host, port, target) for the outgoing request.
"""
return (
self.scheme.encode("ascii"),
self.host.encode("ascii"),
self.raw_scheme,
self.raw_host,
self.port,
self.raw_path,
)
Expand All @@ -293,7 +351,7 @@ def is_absolute_url(self) -> bool:
# URLs with a fragment portion as not absolute.
# What we actually care about is if the URL provides
# a scheme and hostname to which connections should be made.
return bool(self.scheme and self.host)
return bool(self._uri_reference.scheme and self._uri_reference.host)

@property
def is_relative_url(self) -> bool:
Expand Down Expand Up @@ -321,7 +379,7 @@ def copy_with(self, **kwargs: typing.Any) -> "URL":
"userinfo": bytes,
"host": str,
"port": int,
"netloc": str,
"netloc": bytes,
"path": str,
"query": bytes,
"raw_path": bytes,
Expand Down Expand Up @@ -354,12 +412,16 @@ def copy_with(self, **kwargs: typing.Any) -> "URL":
# it's an IPv6 address, so it should be wrapped in brackets
host = f"[{host}]"

kwargs["netloc"] = f"{host}:{port}" if port is not None else host
kwargs["netloc"] = (
f"{host}:{port}".encode("ascii")
if port is not None
else host.encode("ascii")
)

if "userinfo" in kwargs or "netloc" in kwargs:
# Consolidate userinfo and netloc into authority.
userinfo = (kwargs.pop("userinfo", self.userinfo) or b"").decode("ascii")
netloc = kwargs.pop("netloc", self.netloc) or ""
netloc = (kwargs.pop("netloc", self.netloc) or b"").decode("ascii")
authority = f"{userinfo}@{netloc}" if userinfo else netloc
kwargs["authority"] = authority

Expand Down Expand Up @@ -848,11 +910,10 @@ def _prepare(self, default_headers: typing.Dict[str, str]) -> None:
)

if not has_host and self.url.host:
default_port = {"http": 80, "https": 443}.get(self.url.scheme)
if self.url.port is None or self.url.port == default_port:
host_header = self.url.host.encode("ascii")
else:
host_header = self.url.netloc.encode("ascii")
default_port = {"http": b":80", "https": b":443"}.get(self.url.scheme, b"")
host_header = self.url.netloc
if host_header.endswith(default_port):
host_header = host_header[: -len(default_port)]
auto_headers.append((b"Host", host_header))
if not has_content_length and self.method in ("POST", "PUT", "PATCH"):
auto_headers.append((b"Content-Length", b"0"))
Expand Down
37 changes: 25 additions & 12 deletions tests/models/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,53 @@


@pytest.mark.parametrize(
"given,idna,host,scheme,port",
"given,idna,host,raw_host,scheme,port",
[
(
"http://中国.icom.museum:80/",
"http://xn--fiqs8s.icom.museum:80/",
"xn--fiqs8s.icom.museum",
"中国.icom.museum",
b"xn--fiqs8s.icom.museum",
"http",
80,
),
(
"http://Königsgäßchen.de",
"http://xn--knigsgchen-b4a3dun.de",
"xn--knigsgchen-b4a3dun.de",
"königsgäßchen.de",
b"xn--knigsgchen-b4a3dun.de",
"http",
None,
),
("https://faß.de", "https://xn--fa-hia.de", "xn--fa-hia.de", "https", None),
(
"https://faß.de",
"https://xn--fa-hia.de",
"faß.de",
b"xn--fa-hia.de",
"https",
None,
),
(
"https://βόλος.com:443",
"https://xn--nxasmm1c.com:443",
"xn--nxasmm1c.com",
"βόλος.com",
b"xn--nxasmm1c.com",
"https",
443,
),
(
"http://ශ්‍රී.com:444",
"http://xn--10cl1a0b660p.com:444",
"xn--10cl1a0b660p.com",
"ශ්‍රී.com",
b"xn--10cl1a0b660p.com",
"http",
444,
),
(
"https://نامه‌ای.com:4433",
"https://xn--mgba3gch31f060k.com:4433",
"xn--mgba3gch31f060k.com",
"نامه‌ای.com",
b"xn--mgba3gch31f060k.com",
"https",
4433,
),
Expand All @@ -52,10 +64,11 @@
"https_with_custom_port",
],
)
def test_idna_url(given, idna, host, scheme, port):
def test_idna_url(given, idna, host, raw_host, scheme, port):
url = httpx.URL(given)
assert url == httpx.URL(idna)
assert url.host == host
assert url.raw_host == raw_host
assert url.scheme == scheme
assert url.port == port

Expand Down Expand Up @@ -197,7 +210,7 @@ def test_url_copywith_authority_subcomponents():

def test_url_copywith_netloc():
copy_with_kwargs = {
"netloc": "example.net:444",
"netloc": b"example.net:444",
}
url = httpx.URL("https://example.org")
new = url.copy_with(**copy_with_kwargs)
Expand Down Expand Up @@ -301,7 +314,7 @@ def test_ipv6_url():
url = httpx.URL("http://[::ffff:192.168.0.1]:5678/")

assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:5678"
assert url.netloc == b"[::ffff:192.168.0.1]:5678"


@pytest.mark.parametrize(
Expand All @@ -317,7 +330,7 @@ def test_ipv6_url_copy_with_host(url_str, new_host):
url = httpx.URL(url_str).copy_with(host=new_host)

assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:1234"
assert url.netloc == b"[::ffff:192.168.0.1]:1234"
assert str(url) == "http://[::ffff:192.168.0.1]:1234"


Expand All @@ -327,5 +340,5 @@ def test_ipv6_url_from_raw_url(host):
url = httpx.URL(raw_url)

assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:443"
assert url.netloc == b"[::ffff:192.168.0.1]:443"
assert str(url) == "https://[::ffff:192.168.0.1]:443/"