Skip to content

Commit

Permalink
Differentiate between 'url.host' and 'url.raw_host' (#1590)
Browse files Browse the repository at this point in the history
* Differentiate between 'url.host' and 'url.raw_host'
  • Loading branch information
tomchristie authored Apr 23, 2021
1 parent d98e9e7 commit 39d8ee6
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 40 deletions.
117 changes: 89 additions & 28 deletions httpx/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from http.cookiejar import Cookie, CookieJar
from urllib.parse import parse_qsl, quote, unquote, urlencode

import idna
import rfc3986
import rfc3986.exceptions

Expand Down Expand Up @@ -60,33 +61,45 @@

class URL:
"""
url = httpx.URL("HTTPS://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink")
url = httpx.URL("HTTPS://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink")
assert url.scheme == "https"
assert url.username == "jo@email.com"
assert url.password == "a secret"
assert url.userinfo == b"jo%40email.com:a%20secret"
assert url.host == "example.com"
assert url.host == "müller.de"
assert url.raw_host == b"xn--mller-kva.de"
assert url.port == 1234
assert url.netloc == "example.com:1234"
assert url.netloc == b"xn--mller-kva.de:1234"
assert url.path == "/pa th"
assert url.query == b"?search=ab"
assert url.raw_path == b"/pa%20th?search=ab"
assert url.fragment == "anchorlink"
The components of a URL are broken down like this:
https://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink
[scheme][ username ] [password] [ host ][port][ path ] [ query ] [fragment]
[ userinfo ] [ netloc ][ raw_path ]
https://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink
[scheme] [ username ] [password] [ host ][port][ path ] [ query ] [fragment]
[ userinfo ] [ netloc ][ raw_path ]
Note that:
* `url.scheme` is normalized to always be lowercased.
* `url.host` is normalized to always be lowercased, and is IDNA encoded. For instance:
url = httpx.URL("http://中国.icom.museum")
assert url.host == "xn--fiqs8s.icom.museum"
* `url.host` is normalized to always be lowercased. Internationalized domain
names are represented in unicode, without IDNA encoding applied. For instance:
url = httpx.URL("http://中国.icom.museum")
assert url.host == "中国.icom.museum"
url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.host == "中国.icom.museum"
* `url.raw_host` is normalized to always be lowercased, and is IDNA encoded.
url = httpx.URL("http://中国.icom.museum")
assert url.raw_host == b"xn--fiqs8s.icom.museum"
url = httpx.URL("http://xn--fiqs8s.icom.museum")
assert url.raw_host == b"xn--fiqs8s.icom.museum"
* `url.userinfo` is raw bytes, without URL escaping. Usually you'll want to work with
`url.username` and `url.password` instead, which handle the URL escaping.
Expand Down Expand Up @@ -150,6 +163,14 @@ def scheme(self) -> str:
"""
return self._uri_reference.scheme or ""

@property
def raw_scheme(self) -> bytes:
    """
    The URL scheme as raw bytes, for example b"http" or b"https".

    This is simply the ASCII encoding of `url.scheme`, so it shares the
    same lowercase normalization.
    """
    scheme_text = self.scheme
    return scheme_text.encode("ascii")

@property
def userinfo(self) -> bytes:
"""
Expand Down Expand Up @@ -181,26 +202,60 @@ def password(self) -> str:
def host(self) -> str:
    """
    The URL host as a string.
    Always normalized to lowercase, with IDNA hosts decoded into unicode.

    Returns an empty string when the URL has no host component.

    Examples:

    url = httpx.URL("http://www.EXAMPLE.org")
    assert url.host == "www.example.org"

    url = httpx.URL("http://中国.icom.museum")
    assert url.host == "中国.icom.museum"

    url = httpx.URL("http://xn--fiqs8s.icom.museum")
    assert url.host == "中国.icom.museum"

    url = httpx.URL("http://www.xn--fiqs8s.icom.museum")
    assert url.host == "www.中国.icom.museum"

    url = httpx.URL("https://[::ffff:192.168.0.1]")
    assert url.host == "::ffff:192.168.0.1"
    """
    host: str = self._uri_reference.host or ""

    if host and ":" in host and host[0] == "[":
        # it's an IPv6 address
        host = host.lstrip("[").rstrip("]")

    labels = host.split(".")
    if any(label.startswith("xn--") for label in labels):
        # Decode label by label: the punycode ("xn--") label is not
        # necessarily the first one (e.g. "www.xn--fiqs8s.icom.museum"),
        # and plain ASCII labels are passed through untouched.
        host = ".".join(
            idna.decode(label) if label.startswith("xn--") else label
            for label in labels
        )

    return host

@property
def raw_host(self) -> bytes:
    """
    The raw bytes representation of the URL host.
    Always normalized to lowercase, and IDNA encoded.

    Returns empty bytes when the URL has no host component.

    Examples:

    url = httpx.URL("http://www.EXAMPLE.org")
    assert url.raw_host == b"www.example.org"

    url = httpx.URL("http://中国.icom.museum")
    assert url.raw_host == b"xn--fiqs8s.icom.museum"

    url = httpx.URL("http://xn--fiqs8s.icom.museum")
    assert url.raw_host == b"xn--fiqs8s.icom.museum"

    url = httpx.URL("https://[::ffff:192.168.0.1]")
    assert url.raw_host == b"::ffff:192.168.0.1"
    """
    host: str = self._uri_reference.host or ""

    if host and ":" in host and host[0] == "[":
        # it's an IPv6 address
        host = host.lstrip("[").rstrip("]")

    # The host is kept in its IDNA (ASCII) form here, unlike `url.host`,
    # so a plain ASCII encode yields the raw bytes.
    return host.encode("ascii")

@property
def port(self) -> typing.Optional[int]:
Expand All @@ -211,14 +266,17 @@ def port(self) -> typing.Optional[int]:
return int(port) if port else None

@property
def netloc(self) -> bytes:
    """
    Either `<host>` or `<host>:<port>` as bytes.
    Always normalized to lowercase, and IDNA encoded.

    The host is taken as-is from the parsed URI reference, so an IPv6
    host keeps its surrounding brackets. Returns empty bytes when the
    URL has no host and no port.
    """
    host = self._uri_reference.host or ""
    port = self._uri_reference.port
    netloc = host.encode("ascii")
    if port:
        # Only append the ":<port>" suffix when a port is actually present.
        netloc = netloc + b":" + str(port).encode("ascii")
    return netloc

@property
def path(self) -> str:
Expand Down Expand Up @@ -277,8 +335,8 @@ def raw(self) -> RawURL:
Provides the (scheme, host, port, target) for the outgoing request.
"""
return (
self.scheme.encode("ascii"),
self.host.encode("ascii"),
self.raw_scheme,
self.raw_host,
self.port,
self.raw_path,
)
Expand All @@ -293,7 +351,7 @@ def is_absolute_url(self) -> bool:
# URLs with a fragment portion as not absolute.
# What we actually care about is if the URL provides
# a scheme and hostname to which connections should be made.
return bool(self.scheme and self.host)
return bool(self._uri_reference.scheme and self._uri_reference.host)

@property
def is_relative_url(self) -> bool:
Expand Down Expand Up @@ -321,7 +379,7 @@ def copy_with(self, **kwargs: typing.Any) -> "URL":
"userinfo": bytes,
"host": str,
"port": int,
"netloc": str,
"netloc": bytes,
"path": str,
"query": bytes,
"raw_path": bytes,
Expand Down Expand Up @@ -354,12 +412,16 @@ def copy_with(self, **kwargs: typing.Any) -> "URL":
# it's an IPv6 address, so it should be hidden under bracket
host = f"[{host}]"

kwargs["netloc"] = f"{host}:{port}" if port is not None else host
kwargs["netloc"] = (
f"{host}:{port}".encode("ascii")
if port is not None
else host.encode("ascii")
)

if "userinfo" in kwargs or "netloc" in kwargs:
# Consolidate userinfo and netloc into authority.
userinfo = (kwargs.pop("userinfo", self.userinfo) or b"").decode("ascii")
netloc = kwargs.pop("netloc", self.netloc) or ""
netloc = (kwargs.pop("netloc", self.netloc) or b"").decode("ascii")
authority = f"{userinfo}@{netloc}" if userinfo else netloc
kwargs["authority"] = authority

Expand Down Expand Up @@ -848,11 +910,10 @@ def _prepare(self, default_headers: typing.Dict[str, str]) -> None:
)

if not has_host and self.url.host:
default_port = {"http": 80, "https": 443}.get(self.url.scheme)
if self.url.port is None or self.url.port == default_port:
host_header = self.url.host.encode("ascii")
else:
host_header = self.url.netloc.encode("ascii")
default_port = {"http": b":80", "https": b":443"}.get(self.url.scheme, b"")
host_header = self.url.netloc
if host_header.endswith(default_port):
host_header = host_header[: -len(default_port)]
auto_headers.append((b"Host", host_header))
if not has_content_length and self.method in ("POST", "PUT", "PATCH"):
auto_headers.append((b"Content-Length", b"0"))
Expand Down
37 changes: 25 additions & 12 deletions tests/models/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,53 @@


@pytest.mark.parametrize(
"given,idna,host,scheme,port",
"given,idna,host,raw_host,scheme,port",
[
(
"http://中国.icom.museum:80/",
"http://xn--fiqs8s.icom.museum:80/",
"xn--fiqs8s.icom.museum",
"中国.icom.museum",
b"xn--fiqs8s.icom.museum",
"http",
80,
),
(
"http://Königsgäßchen.de",
"http://xn--knigsgchen-b4a3dun.de",
"xn--knigsgchen-b4a3dun.de",
"königsgäßchen.de",
b"xn--knigsgchen-b4a3dun.de",
"http",
None,
),
("https://faß.de", "https://xn--fa-hia.de", "xn--fa-hia.de", "https", None),
(
"https://faß.de",
"https://xn--fa-hia.de",
"faß.de",
b"xn--fa-hia.de",
"https",
None,
),
(
"https://βόλος.com:443",
"https://xn--nxasmm1c.com:443",
"xn--nxasmm1c.com",
"βόλος.com",
b"xn--nxasmm1c.com",
"https",
443,
),
(
"http://ශ්‍රී.com:444",
"http://xn--10cl1a0b660p.com:444",
"xn--10cl1a0b660p.com",
"ශ්‍රී.com",
b"xn--10cl1a0b660p.com",
"http",
444,
),
(
"https://نامه‌ای.com:4433",
"https://xn--mgba3gch31f060k.com:4433",
"xn--mgba3gch31f060k.com",
"نامه‌ای.com",
b"xn--mgba3gch31f060k.com",
"https",
4433,
),
Expand All @@ -52,10 +64,11 @@
"https_with_custom_port",
],
)
def test_idna_url(given, idna, host, scheme, port):
def test_idna_url(given, idna, host, raw_host, scheme, port):
url = httpx.URL(given)
assert url == httpx.URL(idna)
assert url.host == host
assert url.raw_host == raw_host
assert url.scheme == scheme
assert url.port == port

Expand Down Expand Up @@ -197,7 +210,7 @@ def test_url_copywith_authority_subcomponents():

def test_url_copywith_netloc():
copy_with_kwargs = {
"netloc": "example.net:444",
"netloc": b"example.net:444",
}
url = httpx.URL("https://example.org")
new = url.copy_with(**copy_with_kwargs)
Expand Down Expand Up @@ -301,7 +314,7 @@ def test_ipv6_url():
url = httpx.URL("http://[::ffff:192.168.0.1]:5678/")

assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:5678"
assert url.netloc == b"[::ffff:192.168.0.1]:5678"


@pytest.mark.parametrize(
Expand All @@ -317,7 +330,7 @@ def test_ipv6_url_copy_with_host(url_str, new_host):
url = httpx.URL(url_str).copy_with(host=new_host)

assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:1234"
assert url.netloc == b"[::ffff:192.168.0.1]:1234"
assert str(url) == "http://[::ffff:192.168.0.1]:1234"


Expand All @@ -327,5 +340,5 @@ def test_ipv6_url_from_raw_url(host):
url = httpx.URL(raw_url)

assert url.host == "::ffff:192.168.0.1"
assert url.netloc == "[::ffff:192.168.0.1]:443"
assert url.netloc == b"[::ffff:192.168.0.1]:443"
assert str(url) == "https://[::ffff:192.168.0.1]:443/"

0 comments on commit 39d8ee6

Please sign in to comment.