Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fqdn() to always include suffix if private suffix enabled and private suffix exists #300

Merged
merged 8 commits into from
Sep 13, 2023
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ A public suffix is also sometimes called an effective TLD (eTLD).
>>> import tldextract

>>> tldextract.extract('http://forums.news.cnn.com/')
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)

>>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk')
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)

>>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg')
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)
```

`ExtractResult` is a namedtuple, so it's simple to access the parts you want.
Expand All @@ -50,13 +50,13 @@ subdomain or a valid suffix.

```python
>>> tldextract.extract('google.com')
ExtractResult(subdomain='', domain='google', suffix='com')
ExtractResult(subdomain='', domain='google', suffix='com', is_private=False)

>>> tldextract.extract('google.notavalidsuffix')
ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='')
ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_private=False)

>>> tldextract.extract('http://127.0.0.1:8080/deployed/')
ExtractResult(subdomain='', domain='127.0.0.1', suffix='')
ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)
```

If you want to rejoin the whole namedtuple, regardless of whether a subdomain
Expand Down Expand Up @@ -161,21 +161,21 @@ By default, `tldextract` treats public and private domains the same.
```python
>>> extract = tldextract.TLDExtract()
>>> extract('waiterrant.blogspot.com')
ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com')
ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com', is_private=False)
```

The following overrides this.
```python
>>> extract = tldextract.TLDExtract()
>>> extract('waiterrant.blogspot.com', include_psl_private_domains=True)
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True)
```

Or, to change the default for all extract calls:
```python
>>> extract = tldextract.TLDExtract( include_psl_private_domains=True)
>>> extract('waiterrant.blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com')
ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True)
```

The thinking behind the default is, it's the more common case when people
Expand Down
3 changes: 2 additions & 1 deletion tests/custom_suffix_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
def test_private_extraction():
tld = tldextract.TLDExtract(cache_dir=tempfile.mkdtemp(), suffix_list_urls=[])

assert tld("foo.blogspot.com") == ("foo", "blogspot", "com")
assert tld("foo.blogspot.com") == ("foo", "blogspot", "com", False)
assert tld("foo.blogspot.com", include_psl_private_domains=True) == (
"",
"foo",
"blogspot.com",
True,
)


Expand Down
97 changes: 78 additions & 19 deletions tests/main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,12 @@ def test_result_as_dict():
result = extract(
"http://admin:password1@www.google.com:666/secret/admin/interface?param1=42"
)
expected_dict = {"subdomain": "www", "domain": "google", "suffix": "com"}
expected_dict = {
"subdomain": "www",
"domain": "google",
"suffix": "com",
"is_private": False,
}
assert result._asdict() == expected_dict


Expand Down Expand Up @@ -460,10 +465,10 @@ def test_include_psl_private_domain_attr():
extract_private = tldextract.TLDExtract(include_psl_private_domains=True)
extract_public = tldextract.TLDExtract(include_psl_private_domains=False)
assert extract_private("foo.uk.com") == ExtractResult(
subdomain="", domain="foo", suffix="uk.com"
subdomain="", domain="foo", suffix="uk.com", is_private=True
)
assert extract_public("foo.uk.com") == ExtractResult(
subdomain="foo", domain="uk", suffix="com"
subdomain="foo", domain="uk", suffix="com", is_private=False
)


Expand All @@ -478,38 +483,92 @@ def test_tlds_property():


def test_global_extract():
assert tldextract.extract("foo.blogspot.com") == ExtractResult(
subdomain="foo", domain="blogspot", suffix="com"
)
assert tldextract.extract(
"foo.blogspot.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="foo", suffix="blogspot.com")
"blogspot.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="", suffix="blogspot.com", is_private=True)
assert tldextract.extract(
"s3.ap-south-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com")
"foo.blogspot.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="", domain="foo", suffix="blogspot.com", is_private=True
)
assert tldextract.extract(
"the-quick-brown-fox.ap-south-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="the-quick-brown-fox.ap-south-1", domain="amazonaws", suffix="com"
subdomain="the-quick-brown-fox.ap-south-1",
domain="amazonaws",
suffix="com",
is_private=False,
)
assert tldextract.extract(
"ap-south-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(subdomain="ap-south-1", domain="amazonaws", suffix="com")
) == ExtractResult(
subdomain="ap-south-1", domain="amazonaws", suffix="com", is_private=False
)
assert tldextract.extract(
"amazonaws.com", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="amazonaws", suffix="com")
assert tldextract.extract(
"s3.cn-north-1.amazonaws.com.cn", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="", suffix="s3.cn-north-1.amazonaws.com.cn")
) == ExtractResult(subdomain="", domain="amazonaws", suffix="com", is_private=False)
assert tldextract.extract(
"the-quick-brown-fox.cn-north-1.amazonaws.com.cn",
include_psl_private_domains=True,
) == ExtractResult(
subdomain="the-quick-brown-fox.cn-north-1", domain="amazonaws", suffix="com.cn"
subdomain="the-quick-brown-fox.cn-north-1",
domain="amazonaws",
suffix="com.cn",
is_private=False,
)
assert tldextract.extract(
"cn-north-1.amazonaws.com.cn", include_psl_private_domains=True
) == ExtractResult(subdomain="cn-north-1", domain="amazonaws", suffix="com.cn")
) == ExtractResult(
subdomain="cn-north-1", domain="amazonaws", suffix="com.cn", is_private=False
)
assert tldextract.extract(
"amazonaws.com.cn", include_psl_private_domains=True
) == ExtractResult(subdomain="", domain="amazonaws", suffix="com.cn")
) == ExtractResult(
subdomain="", domain="amazonaws", suffix="com.cn", is_private=False
)
assert tldextract.extract(
"another.icann.compute.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="",
domain="another",
suffix="icann.compute.amazonaws.com",
is_private=True,
)
assert tldextract.extract(
"another.s3.dualstack.us-east-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="",
domain="another",
suffix="s3.dualstack.us-east-1.amazonaws.com",
is_private=True,
)

assert tldextract.extract(
"s3.ap-south-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="", domain="", suffix="s3.ap-south-1.amazonaws.com", is_private=True
)
assert tldextract.extract(
"s3.cn-north-1.amazonaws.com.cn", include_psl_private_domains=True
) == ExtractResult(
subdomain="",
domain="",
suffix="s3.cn-north-1.amazonaws.com.cn",
is_private=True,
)
assert tldextract.extract(
"icann.compute.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="", domain="", suffix="icann.compute.amazonaws.com", is_private=True
)

# The entire URL is a private suffix that itself ends with another private suffix,
# i.e. "s3.dualstack.us-east-1.amazonaws.com" ends with "us-east-1.amazonaws.com".
assert tldextract.extract(
"s3.dualstack.us-east-1.amazonaws.com", include_psl_private_domains=True
) == ExtractResult(
subdomain="",
domain="",
suffix="s3.dualstack.us-east-1.amazonaws.com",
is_private=True,
)
16 changes: 4 additions & 12 deletions tests/test_trie.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,11 @@


def test_nested_dict() -> None:
original_keys_sequence = [
["a"],
["a", "d"],
["a", "b"],
["a", "b", "c"],
["c"],
["c", "b"],
["d", "f"],
]
for keys_sequence in permutations(original_keys_sequence):
suffixes = ["a", "d.a", "b.a", "c.b.a", "c", "b.c", "f.d"]
for suffixes_sequence in permutations(suffixes):
trie = Trie()
for keys in keys_sequence:
trie.add_suffix(keys)
for suffix in suffixes_sequence:
trie.add_suffix(suffix)
# check each nested value
# Top level c
assert "c" in trie.matches
Expand Down
3 changes: 2 additions & 1 deletion tldextract/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,5 @@ def main() -> None:
sys.exit(1)

for i in args.input:
print(" ".join(tld_extract(i)))
subdomain, domain, suffix, _ = tld_extract(i)
print(f"{subdomain} {domain} {suffix}")
Loading