Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce time complexity of URL subtraction #1388

Closed
wants to merge 37 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
7449454
Try to make URL subtraction a bit lazier
bdraco Oct 24, 2024
c1855d0
lazier
bdraco Oct 24, 2024
4c0628f
reduce
bdraco Oct 24, 2024
e60bc57
reduce
bdraco Oct 24, 2024
2d5655b
working
bdraco Oct 25, 2024
295f405
abstract it
bdraco Oct 25, 2024
31630e0
abstract it
bdraco Oct 25, 2024
d24ffe3
wip
bdraco Oct 25, 2024
62e8a49
reduce some more
bdraco Oct 25, 2024
9cae1ef
reduce
bdraco Oct 25, 2024
dcac142
naming
bdraco Oct 25, 2024
6a952a0
naming
bdraco Oct 25, 2024
1fa70d5
naming
bdraco Oct 25, 2024
89dacef
fix
bdraco Oct 25, 2024
bccef2d
fix
bdraco Oct 25, 2024
28dc640
fix
bdraco Oct 25, 2024
bf405f5
fix
bdraco Oct 25, 2024
1a00a76
reduce
bdraco Oct 25, 2024
98f41e5
Update tests/test_url.py
bdraco Oct 25, 2024
75e3650
naming
bdraco Oct 25, 2024
8a4ea89
naming
bdraco Oct 25, 2024
dc5a187
Merge remote-tracking branch 'origin/url_lazy_no_pathlib' into url_la…
bdraco Oct 25, 2024
1cc8805
naming
bdraco Oct 25, 2024
3964744
Add xfail tests for URL subtraction with empty segments
bdraco Oct 25, 2024
03b23a0
Merge branch 'empty_segments_sub' into url_lazy_no_pathlib
bdraco Oct 25, 2024
060b10c
Merge branch 'master' into url_lazy_no_pathlib
bdraco Oct 25, 2024
6d61d54
Merge remote-tracking branch 'origin/url_lazy_no_pathlib' into url_la…
bdraco Oct 25, 2024
e4ff365
tweak
bdraco Oct 25, 2024
2595811
fix
bdraco Oct 25, 2024
9ea6b9c
fixes
bdraco Oct 25, 2024
2d3d173
reduce
bdraco Oct 25, 2024
53b19a6
cleanup
bdraco Oct 25, 2024
4ed7ff2
naming
bdraco Oct 25, 2024
5674c92
Update yarl/_path.py
bdraco Oct 25, 2024
317e384
remove
bdraco Oct 25, 2024
bf918db
adding / should only happen if netloc
bdraco Oct 25, 2024
3885557
adding / should only happen if netloc
bdraco Oct 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ def test_str():
"http://example.com/this/",
"is/a/test",
),
("http://example.com/this/is/../a//test", "http://example.com/this/", "a/test"),
(
"http://example.com/this/is/../a//test",
"http://example.com/this/",
"a//test",
),
("http://example.com/path/to", "http://example.com/spam/", "../path/to"),
("http://example.com/path", "http://example.com/path/to/", ".."),
("http://example.com/path", "http://example.com/other/../path/to/", ".."),
Expand All @@ -98,7 +102,6 @@ def test_sub(target: str, base: str, expected: str):
assert result_url == expected_url


@pytest.mark.xfail(reason="Empty segments are not preserved")
@pytest.mark.parametrize(
("target", "base", "expected"),
[
Expand All @@ -110,7 +113,7 @@ def test_sub(target: str, base: str, expected: str):
(
"http://example.com////path/////to",
"http://example.com/////spam",
"..//path/////to",
"../path/////to",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using ("/path/to", "/spam/", "../path/to") as reference, this can't be right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TBH at this point I'm really not sure how its supposed to work and I'm tempted to revert out the whole feature as its seems like its turning out to be more trouble than its worth

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it is right .. I'm not really sure

http://example.com////path/////to
http://example.com/////spam

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or all the tests are wrong....

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd expect it to follow along with the first test case:

…
("/path/to", "/spam/", "../path/to"),
("//path/to", "/spam/", "..//path/to"),
(
    "http://example.com//path/to",
    "http://example.com/spam",
    "..//path/to",
),
(
    "http://example.com////path/////to",
    "http://example.com/////spam",
    "..//path/////to",
)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The trailing / in "spam/" has to be removed.

("/path/to", "/spam", "../path/to"),
("/path/to", "/spam/", "../../path/to"),

And … others to consider

("/path/to", "//spam", "../../path/to"),
(
    "http://example.com/path/to",
    "http://example.com//spam",
    "../../path/to",
),
(
    "http://example.com/////path/////to",
    "http://example.com////spam",
    "../../path/////to",
)

),
],
)
Expand Down Expand Up @@ -139,9 +142,9 @@ def test_sub_with_different_anchors():


def test_sub_with_two_dots_in_base():
expected_error_msg = "'..' segment in '/path/..' cannot be walked"
expected_error_msg = "'..' segment in 'path/..' cannot be walked"
with pytest.raises(ValueError, match=expected_error_msg):
URL("path/to") - URL("/path/../from")
URL("path/to") - URL("path/../from")


def test_repr():
Expand Down
87 changes: 68 additions & 19 deletions yarl/_path.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""Utilities for working with paths."""

from collections.abc import Sequence
from collections.abc import Generator, Sequence
from contextlib import suppress
from itertools import chain
from pathlib import PurePosixPath
from typing import Union


def normalize_path_segments(segments: Sequence[str]) -> list[str]:
Expand Down Expand Up @@ -43,29 +43,78 @@ def normalize_path(path: str) -> str:
return prefix + "/".join(normalize_path_segments(segments))


class URLPath:
"""A class for working with URL paths."""

__slots__ = ("parts", "path")

def __init__(self, path: str, strip_tail: bool = False) -> None:
"""Initialize a URLPath object."""
had_trailing_slash = path[-1] == "/"
# Strip trailing slash
if path and had_trailing_slash:
path = path[:-1]
if "." in path:
# Strip '.' segments
parts = [x for x in path.split("/") if x != "."]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stripping "." segments is wrong for "/./" which is equal to "//"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe right here … but I remember there was a special case on this.

else:
parts = path.split("/")
if strip_tail and not had_trailing_slash and parts:
parts.pop()
self.path = "/".join(parts) or "."
self.parts = parts

def parents(self) -> Generator["URLPath", None, None]:
"""Return a list of parent paths for a given path."""
parts = self.parts
for i in range(len(parts) - 1, -1, -1):
parent_parts = parts[:i]
url_path = object.__new__(URLPath)
url_path.path = "/".join(parent_parts) or "."
url_path.parts = parent_parts
yield url_path


def calculate_relative_path(target: str, base: str) -> str:
"""Return the relative path between two other paths.

If the operation is not possible, raise ValueError.
"""
target_path = URLPath(target)
base_path = URLPath(base, strip_tail=True)

target = target or "/"
base = base or "/"

target_path = PurePosixPath(target)
base_path = PurePosixPath(base)

if base[-1] != "/":
base_path = base_path.parent
target_path_parts: Union[set[str], None] = None
target_path_path = target_path.path

for step, path in enumerate(chain((base_path,), base_path.parents)):
if path == target_path or path in target_path.parents:
break
elif path.name == "..":
raise ValueError(f"'..' segment in {str(base_path)!r} cannot be walked")
else:
if (target and target[0] == "/") != (base and base[0] == "/"):
raise ValueError(
f"{str(target_path)!r} and {str(base_path)!r} have different anchors"
f"{target_path_path!r} and {base_path.path!r} have different anchors"
)
offset = len(path.parts)
return str(PurePosixPath(*("..",) * step, *target_path.parts[offset:]))

for step, base_walk in enumerate(chain((base_path,), base_path.parents())):
if base_walk.path == target_path_path:
break
# If the target_path_parts is already built we can use a fast path
if target_path_parts is not None:
if base_walk.path in target_path_parts:
break
elif base_walk.parts[-1] == "..":
raise ValueError(f"'..' segment in {base_path.path!r} cannot be walked")
continue
target_path_parts = set()
# We check one at a time because enumerating parents
# builds the value on demand, and we want to stop
# as soon as we find the common parent
for target_parent in target_path.parents():
if target_parent.path == base_path.path:
break
target_path_parts.add(target_parent.path)
else:
# If we didn't break, it means we didn't find a common parent
if base_walk.parts[-1] == "..":
raise ValueError(f"'..' segment in {base_path.path!r} cannot be walked")
continue
break

offset = len(base_walk.parts)
return "/".join((*("..",) * step, *target_path.parts[offset:])) or "."
5 changes: 5 additions & 0 deletions yarl/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,11 @@ def __sub__(self, other: object) -> "URL":
if target_netloc != base_netloc:
raise ValueError("Both URLs should have the same netloc")

if target_netloc and not target_path:
target_path = "/"
if base_netloc and not base_path:
base_path = "/"

path = calculate_relative_path(target_path, base_path)
return self._from_tup(("", "", path, "", ""))

Expand Down