diff --git a/tests/test_url.py b/tests/test_url.py index 444a9f565..0d10d2538 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -72,7 +72,11 @@ def test_str(): "http://example.com/this/", "is/a/test", ), - ("http://example.com/this/is/../a//test", "http://example.com/this/", "a/test"), + ( + "http://example.com/this/is/../a//test", + "http://example.com/this/", + "a//test", + ), ("http://example.com/path/to", "http://example.com/spam/", "../path/to"), ("http://example.com/path", "http://example.com/path/to/", ".."), ("http://example.com/path", "http://example.com/other/../path/to/", ".."), @@ -98,7 +102,6 @@ def test_sub(target: str, base: str, expected: str): assert result_url == expected_url -@pytest.mark.xfail(reason="Empty segments are not preserved") @pytest.mark.parametrize( ("target", "base", "expected"), [ @@ -110,7 +113,7 @@ def test_sub(target: str, base: str, expected: str): ( "http://example.com////path/////to", "http://example.com/////spam", - "..//path/////to", + "../path/////to", ), ], ) @@ -139,9 +142,9 @@ def test_sub_with_different_anchors(): def test_sub_with_two_dots_in_base(): - expected_error_msg = "'..' segment in '/path/..' cannot be walked" + expected_error_msg = "'..' segment in 'path/..' 
class URLPath:
    """A URL path split into its ``/``-separated segments.

    Empty segments (``a//b``) are preserved; only one trailing slash and
    any ``.`` segments are dropped.
    """

    # parts: the remaining segments; an absolute path keeps a leading ""
    #        segment (``"/a"`` -> ``["", "a"]``).
    # path:  the segments joined with "/", or "." when that join is empty
    #        (true both for no segments and for the lone "" of the root).
    __slots__ = ("parts", "path")

    def __init__(self, path: str, strip_tail: bool = False) -> None:
        """Split *path* into segments.

        *path* may be empty.  When *strip_tail* is true and *path* does not
        end with ``/``, the final segment is dropped so that only the
        "directory" portion of the path remains.
        """
        # str.endswith() is safe on "" where path[-1] would raise IndexError.
        had_trailing_slash = path.endswith("/")
        if had_trailing_slash:
            path = path[:-1]
        parts = path.split("/")
        if "." in path:
            # Cheap pre-check: only filter when a '.' segment is possible.
            parts = [segment for segment in parts if segment != "."]
        if strip_tail and not had_trailing_slash and parts:
            parts.pop()
        self.path = "/".join(parts) or "."
        self.parts = parts

    def parents(self) -> Generator["URLPath", None, None]:
        """Yield the ancestors of this path, nearest first.

        Ancestors are produced lazily, each one segment shorter than the
        previous, ending with the path that has no segments at all.
        """
        parts = self.parts
        for end in range(len(parts) - 1, -1, -1):
            parent_parts = parts[:end]
            # Bypass __init__: the segments are already normalised.
            parent = object.__new__(URLPath)
            parent.path = "/".join(parent_parts) or "."
            parent.parts = parent_parts
            yield parent


def calculate_relative_path(target: str, base: str) -> str:
    """Return the relative path between two other paths.

    *base* is treated as a directory when it ends with ``/``; otherwise its
    last segment is dropped first.  Either argument may be empty, in which
    case it is handled as an empty relative path.

    If the operation is not possible, raise ValueError: this happens when
    exactly one of the paths is absolute, or when a ``..`` segment of
    *base* would have to be walked up through.
    """
    target_path = URLPath(target)
    base_path = URLPath(base, strip_tail=True)
    target_path_path = target_path.path

    # Compare real booleans: the ``x and x[0] == "/"`` form evaluates to ""
    # for an empty path, and ``"" != False`` is true.
    if target.startswith("/") != base.startswith("/"):
        raise ValueError(
            f"{target_path_path!r} and {base_path.path!r} have different anchors"
        )

    # Paths of the target's ancestors, filled in on the first step only.
    target_parent_paths: Union[set[str], None] = None

    for step, base_walk in enumerate(chain((base_path,), base_path.parents())):
        walk_path = base_walk.path
        if walk_path == target_path_path:
            break
        if target_parent_paths is None:
            # First step: scan the target's ancestors one at a time so we
            # can stop as soon as the common parent is found, remembering
            # the rest for constant-time lookups on later steps.
            target_parent_paths = set()
            found = False
            for target_parent in target_path.parents():
                if target_parent.path == walk_path:
                    found = True
                    break
                target_parent_paths.add(target_parent.path)
            if found:
                break
        elif walk_path in target_parent_paths:
            break
        # No common parent at this level; going one level up means stepping
        # over the last segment, which is impossible if that segment is '..'.
        if base_walk.parts and base_walk.parts[-1] == "..":
            raise ValueError(f"'..' segment in {base_path.path!r} cannot be walked")

    offset = len(base_walk.parts)
    return "/".join((*("..",) * step, *target_path.parts[offset:])) or "."
diff --git a/yarl/_url.py b/yarl/_url.py index 1766f305c..2b6399a49 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -502,6 +502,11 @@ def __sub__(self, other: object) -> "URL": if target_netloc != base_netloc: raise ValueError("Both URLs should have the same netloc") + if target_netloc and not target_path: + target_path = "/" + if base_netloc and not base_path: + base_path = "/" + path = calculate_relative_path(target_path, base_path) return self._from_tup(("", "", path, "", ""))