From f81083c1c25f67f6e24e481a001c14344ac005b3 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Sun, 9 Apr 2023 18:40:03 +0100 Subject: [PATCH] GH-101362: Omit path anchor from `pathlib.PurePath()._parts` (GH-102476) Improve performance of path construction by skipping the addition of the path anchor (`drive + root`) to the internal `_parts` list. Rename this attribute to `_tail` for clarity. --- Lib/pathlib.py | 171 +++++++++++------- Lib/test/test_pathlib.py | 2 - ...-03-06-18-49-57.gh-issue-101362.eSSy6L.rst | 2 + 3 files changed, 108 insertions(+), 67 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 490f89f39d26d1b..4ae1fae6f4b3580 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -210,20 +210,17 @@ def _select_from(self, parent_path, is_dir, exists, scandir, normcase): class _PathParents(Sequence): """This object provides sequence-like access to the logical ancestors of a path. Don't try to construct it yourself.""" - __slots__ = ('_pathcls', '_drv', '_root', '_parts') + __slots__ = ('_pathcls', '_drv', '_root', '_tail') def __init__(self, path): # We don't store the instance to avoid reference cycles self._pathcls = type(path) self._drv = path.drive self._root = path.root - self._parts = path._parts + self._tail = path._tail def __len__(self): - if self._drv or self._root: - return len(self._parts) - 1 - else: - return len(self._parts) + return len(self._tail) def __getitem__(self, idx): if isinstance(idx, slice): @@ -234,7 +231,7 @@ def __getitem__(self, idx): if idx < 0: idx += len(self) return self._pathcls._from_parsed_parts(self._drv, self._root, - self._parts[:-idx - 1]) + self._tail[:-idx - 1]) def __repr__(self): return "<{}.parents>".format(self._pathcls.__name__) @@ -249,9 +246,41 @@ class PurePath(object): PureWindowsPath object. You can also instantiate either of these classes directly, regardless of your system. """ + __slots__ = ( - '_raw_path', '_drv', '_root', '_parts_cached', - '_str', '_hash', '_parts_tuple', '_parts_normcase_cached', + # The `_raw_path` slot stores an unnormalized string path. This is set + # in the `__init__()` method. + '_raw_path', + + # The `_drv`, `_root` and `_tail_cached` slots store parsed and + # normalized parts of the path. They are set when any of the `drive`, + # `root` or `_tail` properties are accessed for the first time. The + # three-part division corresponds to the result of + # `os.path.splitroot()`, except that the tail is further split on path + # separators (i.e. it is a list of strings), and that the root and + # tail are normalized. + '_drv', '_root', '_tail_cached', + + # The `_str` slot stores the string representation of the path, + # computed from the drive, root and tail when `__str__()` is called + # for the first time. It's used to implement `_str_normcase` + '_str', + + # The `_str_normcase_cached` slot stores the string path with + # normalized case. It is set when the `_str_normcase` property is + # accessed for the first time. It's used to implement `__eq__()` + # `__hash__()`, and `_parts_normcase` + '_str_normcase_cached', + + # The `_parts_normcase_cached` slot stores the case-normalized + # string path after splitting on path separators. It's set when the + # `_parts_normcase` property is accessed for the first time. It's used + # to implement comparison methods like `__lt__()`. + '_parts_normcase_cached', + + # The `_hash` slot stores the hash of the case-normalized string + # path. It's set when `__hash__()` is called for the first time. + '_hash', ) _flavour = os.path @@ -277,10 +306,7 @@ def __init__(self, *args): path = os.fspath(args[0]) else: path = self._flavour.join(*args) - if isinstance(path, str): - # Force-cast str subclasses to str (issue #21127) - path = str(path) - else: + if not isinstance(path, str): raise TypeError( "argument should be a str or an os.PathLike " "object where __fspath__ returns a str, " @@ -299,33 +325,32 @@ def _parse_path(cls, path): if drv.startswith(sep): # pathlib assumes that UNC paths always have a root. root = sep - unfiltered_parsed = [drv + root] + rel.split(sep) - parsed = [sys.intern(x) for x in unfiltered_parsed if x and x != '.'] + parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.'] return drv, root, parsed def _load_parts(self): - drv, root, parts = self._parse_path(self._raw_path) + drv, root, tail = self._parse_path(self._raw_path) self._drv = drv self._root = root - self._parts_cached = parts + self._tail_cached = tail @classmethod - def _from_parsed_parts(cls, drv, root, parts): - path = cls._format_parsed_parts(drv, root, parts) + def _from_parsed_parts(cls, drv, root, tail): + path = cls._format_parsed_parts(drv, root, tail) self = cls(path) self._str = path or '.' self._drv = drv self._root = root - self._parts_cached = parts + self._tail_cached = tail return self @classmethod - def _format_parsed_parts(cls, drv, root, parts): + def _format_parsed_parts(cls, drv, root, tail): if drv or root: - return drv + root + cls._flavour.sep.join(parts[1:]) - elif parts and cls._flavour.splitdrive(parts[0])[0]: - parts = ['.'] + parts - return cls._flavour.sep.join(parts) + return drv + root + cls._flavour.sep.join(tail) + elif tail and cls._flavour.splitdrive(tail[0])[0]: + tail = ['.'] + tail + return cls._flavour.sep.join(tail) def __str__(self): """Return the string representation of the path, suitable for @@ -334,7 +359,7 @@ def __str__(self): return self._str except AttributeError: self._str = self._format_parsed_parts(self.drive, self.root, - self._parts) or '.' + self._tail) or '.' return self._str def __fspath__(self): @@ -374,25 +399,34 @@ def as_uri(self): path = str(self) return prefix + urlquote_from_bytes(os.fsencode(path)) + @property + def _str_normcase(self): + # String with normalized case, for hashing and equality checks + try: + return self._str_normcase_cached + except AttributeError: + self._str_normcase_cached = self._flavour.normcase(str(self)) + return self._str_normcase_cached + @property def _parts_normcase(self): - # Cached parts with normalized case, for hashing and comparison. + # Cached parts with normalized case, for comparisons. try: return self._parts_normcase_cached except AttributeError: - self._parts_normcase_cached = [self._flavour.normcase(p) for p in self._parts] + self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) return self._parts_normcase_cached def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented - return self._parts_normcase == other._parts_normcase and self._flavour is other._flavour + return self._str_normcase == other._str_normcase and self._flavour is other._flavour def __hash__(self): try: return self._hash except AttributeError: - self._hash = hash(tuple(self._parts_normcase)) + self._hash = hash(self._str_normcase) return self._hash def __lt__(self, other): @@ -434,12 +468,12 @@ def root(self): return self._root @property - def _parts(self): + def _tail(self): try: - return self._parts_cached + return self._tail_cached except AttributeError: self._load_parts() - return self._parts_cached + return self._tail_cached @property def anchor(self): @@ -450,10 +484,10 @@ def anchor(self): @property def name(self): """The final path component, if any.""" - parts = self._parts - if len(parts) == (1 if (self.drive or self.root) else 0): + tail = self._tail + if not tail: return '' - return parts[-1] + return tail[-1] @property def suffix(self): @@ -501,7 +535,7 @@ def with_name(self, name): if drv or root or not tail or f.sep in tail or (f.altsep and f.altsep in tail): raise ValueError("Invalid name %r" % (name)) return self._from_parsed_parts(self.drive, self.root, - self._parts[:-1] + [name]) + self._tail[:-1] + [name]) def with_stem(self, stem): """Return a new path with the stem changed.""" @@ -526,7 +560,7 @@ def with_suffix(self, suffix): else: name = name[:-len(old_suffix)] + suffix return self._from_parsed_parts(self.drive, self.root, - self._parts[:-1] + [name]) + self._tail[:-1] + [name]) def relative_to(self, other, /, *_deprecated, walk_up=False): """Return the relative path to another path identified by the passed @@ -551,7 +585,7 @@ def relative_to(self, other, /, *_deprecated, walk_up=False): raise ValueError(f"{str(self)!r} and {str(other)!r} have different anchors") if step and not walk_up: raise ValueError(f"{str(self)!r} is not in the subpath of {str(other)!r}") - parts = ('..',) * step + self.parts[len(path.parts):] + parts = ['..'] * step + self._tail[len(path._tail):] return path_cls(*parts) def is_relative_to(self, other, /, *_deprecated): @@ -570,13 +604,10 @@ def is_relative_to(self, other, /, *_deprecated): def parts(self): """An object providing sequence-like access to the components in the filesystem path.""" - # We cache the tuple to avoid building a new one each time .parts - # is accessed. XXX is this necessary? - try: - return self._parts_tuple - except AttributeError: - self._parts_tuple = tuple(self._parts) - return self._parts_tuple + if self.drive or self.root: + return (self.drive + self.root,) + tuple(self._tail) + else: + return tuple(self._tail) def joinpath(self, *args): """Combine this path with one or several arguments, and return a @@ -603,10 +634,10 @@ def parent(self): """The logical parent of the path.""" drv = self.drive root = self.root - parts = self._parts - if len(parts) == 1 and (drv or root): + tail = self._tail + if not tail: return self - return self._from_parsed_parts(drv, root, parts[:-1]) + return self._from_parsed_parts(drv, root, tail[:-1]) @property def parents(self): @@ -624,29 +655,29 @@ def is_absolute(self): def is_reserved(self): """Return True if the path contains one of the special names reserved by the system, if any.""" - if self._flavour is posixpath or not self._parts: + if self._flavour is posixpath or not self._tail: return False # NOTE: the rules for reserved names seem somewhat complicated # (e.g. r"..\NUL" is reserved but not r"foo\NUL" if "foo" does not # exist). We err on the side of caution and return True for paths # which are not considered reserved by Windows. - if self._parts[0].startswith('\\\\'): + if self.drive.startswith('\\\\'): # UNC paths are never reserved. return False - name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') + name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - path_pattern = self._flavour.normcase(path_pattern) - drv, root, pat_parts = self._parse_path(path_pattern) - if not pat_parts: + pat = type(self)(path_pattern) + if not pat.parts: raise ValueError("empty pattern") + pat_parts = pat._parts_normcase parts = self._parts_normcase - if drv or root: + if pat.drive or pat.root: if len(pat_parts) != len(parts): return False elif len(pat_parts) > len(parts): @@ -707,11 +738,21 @@ def __new__(cls, *args, **kwargs): cls = WindowsPath if os.name == 'nt' else PosixPath return object.__new__(cls) - def _make_child_relpath(self, part): - # This is an optimization used for dir walking. `part` must be - # a single part relative to this path. - parts = self._parts + [part] - return self._from_parsed_parts(self.drive, self.root, parts) + def _make_child_relpath(self, name): + path_str = str(self) + tail = self._tail + if tail: + path_str = f'{path_str}{self._flavour.sep}{name}' + elif path_str != '.': + path_str = f'{path_str}{name}' + else: + path_str = name + path = type(self)(path_str) + path._str = path_str + path._drv = self.drive + path._root = self.root + path._tail_cached = tail + [name] + return path def __enter__(self): # In previous versions of pathlib, __exit__() marked this path as @@ -1196,12 +1237,12 @@ def expanduser(self): (as returned by os.path.expanduser) """ if (not (self.drive or self.root) and - self._parts and self._parts[0][:1] == '~'): - homedir = self._flavour.expanduser(self._parts[0]) + self._tail and self._tail[0][:1] == '~'): + homedir = self._flavour.expanduser(self._tail[0]) if homedir[:1] == "~": raise RuntimeError("Could not determine home directory.") - drv, root, parts = self._parse_path(homedir) - return self._from_parsed_parts(drv, root, parts + self._parts[1:]) + drv, root, tail = self._parse_path(homedir) + return self._from_parsed_parts(drv, root, tail + self._tail[1:]) return self diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index fe75f1c2580bc49..3c6da94d094610f 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -346,8 +346,6 @@ def test_parts_common(self): p = P('a/b') parts = p.parts self.assertEqual(parts, ('a', 'b')) - # The object gets reused. - self.assertIs(parts, p.parts) # When the path is absolute, the anchor is a separate part. p = P('/a/b') parts = p.parts diff --git a/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst b/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst new file mode 100644 index 000000000000000..87617a503c0dba6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst @@ -0,0 +1,2 @@ +Speed up :class:`pathlib.Path` construction by omitting the path anchor from +the internal list of path parts.