From 2d6db63092090149d247e9504596f6bd9fa245ff Mon Sep 17 00:00:00 2001
From: "Joshua A. Anderson"
Date: Tue, 13 Feb 2024 11:28:42 -0500
Subject: [PATCH] Add Job.cached_statepoint for fast read access to state point (#975)

* Make additional use of the statepoint cache.

_StatePointDict takes significant time to initialize, even when the
statepoint dict is known. Adjust `Job` initialization to make more use of the
statepoint cache and initialize `_StatePointDict` only when `Job.statepoint`
is accessed. Provide a faster path for cached *read* access to the statepoint
dict via the new property `Job.statepoint_dict`.

One side effect of this change is that some warnings are now deferred to
`statepoint` access that were previously issued during `Job.__init__` (see
changes in tests/).

There are additional opportunities to use the cached statepoint dict in
`Project.groupby` that this commit does not address.

* Make JobsCursor.__len__ and .__contains__ O(1).

Cache the ids matching the job filter. This enables O(1) cost for __len__ and
__contains__, as users would expect. In some use cases, signac-flow
repeatedly calls __contains__ on a JobsCursor.

The side effect of this change is that modifications to the workspace will
not be reflected in existing JobsCursor instances. This behavior was not
previously documented in the user API.

* Add validate_statepoint argument to Job.init()

`with job`, `Job.document`, and `Job.stores` call `init()` because they
require that the job directory exists. Prior to this change, `init()` also
forced a load of the `_StatePointDict`. These methods now call
`init(validate_statepoint=False)`, which exits early when the job directory
exists.

This change provides a reasonable performance boost (5x on NVMe, more on
network storage). There may be more room for improvement, as there are
currently 2N stat calls in this use case:

```python
for job in project:
    with job:
        pass
```

* Rename statepoint_dict to statepoint_mapping.

deepcopy is unexpectedly expensive. Refactor the earlier commit to deepcopy
only user-provided statepoint dicts. Statepoints from the cache are passed to
the user read-only via MappingProxyType, as sketched below.
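For illustration only (this sketch is not part of the patch; the cache and
helper names are hypothetical): user-provided dicts are deep-copied into the
cache, while readers get a zero-copy, read-only view of the cached entry.

```python
from copy import deepcopy
from types import MappingProxyType

_sp_cache = {}  # hypothetical cache: job id -> state point dict


def register(job_id, statepoint):
    # deepcopy only user-provided dicts so that later mutations by the
    # caller cannot corrupt the cached entry.
    _sp_cache[job_id] = deepcopy(statepoint)


def cached_statepoint(job_id):
    # MappingProxyType wraps the cached dict without copying it: reads
    # are cheap, and any attempt to write raises TypeError.
    return MappingProxyType(_sp_cache[job_id])


register("abc123", {"a": 0})
sp = cached_statepoint("abc123")
print(sp["a"])  # 0
# sp["a"] = 1  # would raise TypeError: mappingproxy is read-only
```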
* Read the cache from disk in `open_job`.

`open_job` uses the statepoint cache to improve performance. Read the cache
from disk in `open_job` (if it has not already been read). This provides
consistently high performance in cases where `open_job` is called before any
other method that may have triggered `_get_statepoint`.

* Restore cache miss logger level to debug.

Users may find the messages too verbose. At the same time, users might never
realize that they should run `signac update-cache` without this message...

* Instantiate Job by id directly when iterating over ids.

`open_job` is a user-facing function and performs error checking on the id.
This check involves a stat call to verify that the job directory exists. When
`Project` is looping over ids from `_get_job_ids`, the directory is known to
exist (subject to race conditions). `stat` calls are expensive, especially on
networked filesystems. Instantiating `Job` directly bypasses this check in
`open_job`.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add statepoint_mapping test.

* Pass information about the Job directory's existence from Project to Job.

This allows Job to avoid some `stat` calls, which greatly improves
performance on networked file systems.

* Populate _statepoint_mapping in additional code paths.

These missed opportunities to pre-populate _statepoint_mapping triggered slow
code paths.

* Increase test coverage.

* Update change log.

* Rename statepoint_mapping to cached_statepoint.

Also attempt to fix sphinx-doc links.

* Doc fixes.

* Update code comments

* Use cached_statepoint in to_dataframe.

* Restore iteration order.

Python sets have a randomized iteration order. Preserve the original
iteration order with a list and convert to a set only for __contains__
checks.

* Validate cached_statepoint when read from disk.

This has the added benefit of validating all statepoints that are added to
the cache. I needed to add a validate argument to update_cache because one of
the unit tests relies on adding invalid statepoints to the cache.

* Use cached_statepoint in groupby.

* Remove validate argument from update_cache.

Locally catch the JobsCorruptedError and ignore it in the test that needs to.

* Write state point as two words in doc strings

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Corwin Kerr
---
 changelog.txt                   |   5 +-
 doc/api.rst                     |   1 +
 signac/job.py                   | 110 +++++++++++++++++++-----
 signac/project.py               | 143 +++++++++++++++++++++++---------
 tests/test_job.py               |  39 ++++++++-
 tests/test_numpy_integration.py |   1 +
 tests/test_project.py           |   6 +-
 7 files changed, 241 insertions(+), 64 deletions(-)

diff --git a/changelog.txt b/changelog.txt
index 63c7af8cd..d9403b8cb 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -7,13 +7,15 @@ The **signac** package follows `semantic versioning `_.
 Version 2
 =========
 
-[2.2.0] -- 2023-xx-xx
+[2.2.0] -- 2024-xx-xx
 ---------------------
 
 Added
 +++++
 
 - Official support for Python 3.12 (#957).
+- ``Job.cached_statepoint`` - cached and read-only access to job state points. Faster than
+  ``Job.statepoint`` (#975).
 
 Changed
 +++++++
@@ -21,6 +23,7 @@ Changed
 - Restrict allowable tar file features in Python 3.12 (#957).
 - linked views now can contain spaces and other characters except directory separators (#926).
 - linked views now can be created on Windows, if 'Developer mode' is enabled (#430).
+- Increase performance for many usage patterns (#975).
 
 Fixed
 +++++
diff --git a/doc/api.rst b/doc/api.rst
index 4a89a5a55..41734ed39 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -78,6 +78,7 @@ The Job class
 
 .. autosummary::
 
+    Job.cached_statepoint
     Job.clear
     Job.close
     Job.data
diff --git a/signac/job.py b/signac/job.py
index c26e396b8..c3a637ff0 100644
--- a/signac/job.py
+++ b/signac/job.py
@@ -11,6 +11,7 @@
 import shutil
 from copy import deepcopy
 from threading import RLock
+from types import MappingProxyType
 from typing import FrozenSet
 
 from synced_collections.backends.collection_json import (
@@ -248,7 +249,8 @@ class Job:
 
     Jobs can be opened by ``statepoint`` or ``id_``. If both values are
     provided, it is the user's responsibility to ensure that the values
-    correspond.
+    correspond. Set ``directory_known`` to ``True`` when the job directory
+    is known to exist; this skips some expensive ``isdir`` checks.
 
     Parameters
     ----------
@@ -258,6 +260,8 @@ class Job:
         State point for the job. (Default value = None)
     id_ : str, optional
         The job identifier. (Default value = None)
+    directory_known : bool, optional
+        Set to ``True`` when the job directory is known to exist. (Default value = False)
 
     """
 
@@ -274,30 +278,34 @@ class Job:
     KEY_DATA = "signac_data"
     "The job's datastore key."
- def __init__(self, project, statepoint=None, id_=None): + def __init__(self, project, statepoint=None, id_=None, directory_known=False): self._project = project self._lock = RLock() self._initialize_lazy_properties() + self._directory_known = directory_known if statepoint is None and id_ is None: raise ValueError("Either statepoint or id_ must be provided.") elif statepoint is not None: - self._statepoint_requires_init = False try: self._id = calc_id(statepoint) if id_ is None else id_ except TypeError: raise KeyTypeError - self._statepoint = _StatePointDict( - jobs=[self], filename=self._statepoint_filename, data=statepoint - ) - # Update the project's state point cache immediately if opened by state point - self._project._register(self.id, statepoint) + self._cached_statepoint = statepoint + self._statepoint_requires_init = True else: # Only an id was provided. State point will be loaded lazily. self._id = id_ self._statepoint_requires_init = True + # Fetch the cached statepoint from the project's cache. Don't load it + # from disk on a cache miss (will be loaded on demand). + try: + self._cached_statepoint = project._sp_cache[id_] + except KeyError: + self._cached_statepoint = None + def _initialize_lazy_properties(self): """Initialize all properties that are designed to be loaded lazily.""" with self._lock: @@ -334,7 +342,7 @@ def __str__(self): def __repr__(self): return "{}(project={}, statepoint={})".format( - self.__class__.__name__, repr(self._project), self.statepoint + self.__class__.__name__, repr(self._project), self.cached_statepoint ) @property @@ -406,6 +414,33 @@ def update_statepoint(self, update, overwrite=False): statepoint.update(update) self.statepoint = statepoint + @property + def cached_statepoint(self): + """Get a copy of the job's state point as a read-only mapping. + + :py:attr:`cached_statepoint` uses the state point cache to provide fast access to + the job's state point for reading. + + .. note:: + + Create and update the state point cache by calling + :py:meth:`project.update_cache ` + or running ``signac update-cache`` on the command line. + + .. seealso:: + + Use :py:attr:`statepoint` to modify the job's state point. + + Returns + ------- + Mapping + Returns the job's state point. + """ + if self._cached_statepoint is None: + self._cached_statepoint = self._project._get_statepoint(self._id) + + return MappingProxyType(self._cached_statepoint) + @property def statepoint(self): """Get or set the job's state point. @@ -416,6 +451,11 @@ def statepoint(self): `Modifying the State Point `_. + .. tip:: + + Use :py:attr:`cached_statepoint` for fast read-only access to the + state point. + .. warning:: The state point object behaves like a dictionary in most cases, @@ -443,14 +483,25 @@ def statepoint(self): """ with self._lock: if self._statepoint_requires_init: - # Load state point data lazily (on access). - self._statepoint = _StatePointDict( - jobs=[self], filename=self._statepoint_filename - ) - statepoint = self._statepoint.load(self.id) + if self._cached_statepoint is None: + # Load state point data lazily (on access). + self._statepoint = _StatePointDict( + jobs=[self], + filename=self._statepoint_filename, + ) + statepoint = self._statepoint.load(self.id) + + # Update the project's state point cache when loaded lazily + self._project._register(self.id, statepoint) + self._cached_statepoint = statepoint + else: + # Create _StatePointDict lazily with a known statepoint dict. 
+ self._statepoint = _StatePointDict( + jobs=[self], + filename=self._statepoint_filename, + data=self._cached_statepoint, + ) - # Update the project's state point cache when loaded lazily - self._project._register(self.id, statepoint) self._statepoint_requires_init = False return self._statepoint @@ -510,7 +561,7 @@ def document(self): """ with self._lock: if self._document is None: - self.init() + self.init(validate_statepoint=False) fn_doc = os.path.join(self.path, self.FN_DOCUMENT) self._document = BufferedJSONAttrDict( filename=fn_doc, write_concern=True @@ -591,9 +642,9 @@ def stores(self): """ with self._lock: if self._stores is None: - self.init() + self.init(validate_statepoint=False) self._stores = H5StoreManager(self.path) - return self.init()._stores + return self._stores @property def data(self): @@ -640,7 +691,7 @@ def project(self): """ return self._project - def init(self, force=False): + def init(self, force=False, validate_statepoint=True): """Initialize the job's workspace directory. This function will do nothing if the directory and the job state point @@ -656,6 +707,10 @@ def init(self, force=False): Overwrite any existing state point files, e.g., to repair them if they got corrupted (Default value = False). + validate_statepoint : bool, optional + When True (the default), load the job state point and ensure that it matches + the id. When False, exit early when the job directory exists. + Returns ------- Job @@ -671,6 +726,15 @@ def init(self, force=False): """ with self._lock: try: + # Fast early exit when not validating. + if not validate_statepoint: + if self._directory_known: + return self + + if os.path.isdir(self.path): + self._directory_known = True + return self + # Attempt early exit if the state point file exists and is valid. try: statepoint = self.statepoint.load(self.id) @@ -687,6 +751,8 @@ def init(self, force=False): ) raise + self._directory_known = True + # The state point save will not overwrite an existing file on # disk unless force is True, so the subsequent load will catch # when a preexisting invalid file was present. @@ -760,6 +826,8 @@ def remove(self): self._document = None self._stores = None + self._directory_known = False + def move(self, project): """Move this job to project. @@ -899,7 +967,7 @@ def open(self): """ self._cwd.append(os.getcwd()) - self.init() + self.init(validate_statepoint=False) logger.info(f"Enter workspace '{self.path}'.") os.chdir(self.path) diff --git a/signac/project.py b/signac/project.py index d0cb27938..6ead1c356 100644 --- a/signac/project.py +++ b/signac/project.py @@ -15,6 +15,7 @@ from collections import defaultdict from collections.abc import Iterable from contextlib import contextmanager +from copy import deepcopy from datetime import timedelta from itertools import groupby from multiprocessing.pool import ThreadPool @@ -203,6 +204,7 @@ def __init__(self, path=None): # can be used to re-open a job by id as long as that id remains in the # cache. self._sp_cache = {} + self._sp_cache_read = False self._sp_cache_misses = 0 self._sp_cache_warned = False self._sp_cache_miss_warning_threshold = self.config.get( @@ -517,13 +519,18 @@ def open_job(self, statepoint=None, id=None): than one match. """ + if not self._sp_cache_read: + # Read the cache from disk on the first call. 
+ self._read_cache() + self._sp_cache_read = True + if statepoint is None and id is None: raise ValueError("Must provide statepoint or id.") elif statepoint is not None and id is not None: raise ValueError("Either statepoint or id must be provided, but not both.") elif statepoint is not None: # Second best case (Job will update self._sp_cache on init) - return Job(project=self, statepoint=statepoint) + return Job(project=self, statepoint=deepcopy(statepoint)) try: # Optimal case (id is in the state point cache) return Job(project=self, statepoint=self._sp_cache[id], id_=id) @@ -545,7 +552,7 @@ def open_job(self, statepoint=None, id=None): elif not self._contains_job_id(id): # id does not exist in the project data space raise KeyError(id) - return Job(project=self, id_=id) + return Job(project=self, id_=id, directory_known=True) def _job_dirs(self): """Generate ids of jobs in the workspace. @@ -712,6 +719,20 @@ def find_jobs(self, filter=None): See :ref:`signac find ` for the command line equivalent. + .. tip:: + + To find a single job given a state point, use `open_job` with O(1) cost. + + .. tip:: + + To find many groups of jobs, use your own code to loop through the project + once and build multiple matching lists. + + .. warning:: + + `find_jobs` costs O(N) each time it is called. It applies the filter to + every job in the workspace. + Parameters ---------- filter : Mapping, optional @@ -835,13 +856,20 @@ def _register(self, id_, statepoint): """ self._sp_cache[id_] = statepoint - def _get_statepoint_from_workspace(self, job_id): + def _get_statepoint_from_workspace(self, job_id, validate=True): """Attempt to read the state point from the workspace. Parameters ---------- job_id : str Identifier of the job. + validate : bool + When True, validate that any state point read from disk matches the job_id. + + Raises + ------ + :class:`signac.errors.JobsCorruptedError` + When one or more jobs are identified as corrupted. """ # Performance-critical path. We can rely on the project workspace, job @@ -850,7 +878,11 @@ def _get_statepoint_from_workspace(self, job_id): fn_statepoint = os.sep.join((self.workspace, job_id, Job.FN_STATE_POINT)) try: with open(fn_statepoint, "rb") as statepoint_file: - return json.loads(statepoint_file.read().decode()) + statepoint = json.loads(statepoint_file.read().decode()) + if validate and calc_id(statepoint) != job_id: + raise JobsCorruptedError([job_id]) + + return statepoint except (OSError, ValueError) as error: if os.path.isdir(os.sep.join((self.workspace, job_id))): logger.error( @@ -861,7 +893,7 @@ def _get_statepoint_from_workspace(self, job_id): raise JobsCorruptedError([job_id]) raise KeyError(job_id) - def _get_statepoint(self, job_id): + def _get_statepoint(self, job_id, validate=True): """Get the state point associated with a job id. The state point is retrieved from the internal cache, from @@ -871,6 +903,9 @@ def _get_statepoint(self, job_id): ---------- job_id : str A job id to get the state point for. + validate : bool + When True, validate that any state point read from disk matches the job_id. + Returns ------- @@ -886,10 +921,10 @@ def _get_statepoint(self, job_id): corrupted. """ - if not self._sp_cache: - # Triggers if no state points have been added to the cache, and all - # the values are None. + if not self._sp_cache_read: + # Read the cache from disk on the first call. 
self._read_cache() + self._sp_cache_read = True try: # State point cache hit return self._sp_cache[job_id] @@ -902,10 +937,10 @@ def _get_statepoint(self, job_id): ): logger.debug( "High number of state point cache misses. Consider " - "to update cache with the Project.update_cache() method." + "updating the cache by running `signac update-cache`." ) self._sp_cache_warned = True - statepoint = self._get_statepoint_from_workspace(job_id) + statepoint = self._get_statepoint_from_workspace(job_id, validate) # Update the project's state point cache from this cache miss self._sp_cache[job_id] = statepoint return statepoint @@ -1237,11 +1272,7 @@ def check(self): logger.info("Checking workspace for corruption...") for job_id in self._find_job_ids(): try: - statepoint = self._get_statepoint(job_id) - if calc_id(statepoint) != job_id: - corrupted.append(job_id) - else: - self.open_job(statepoint).init() + self._get_statepoint_from_workspace(job_id) except JobsCorruptedError as error: corrupted.extend(error.job_ids) if corrupted: @@ -1277,7 +1308,7 @@ def repair(self, job_ids=None): for job_id in job_ids: try: # First, check if we can look up the state point. - statepoint = self._get_statepoint(job_id) + statepoint = self._get_statepoint(job_id, validate=False) # Check if state point and id correspond. correct_id = calc_id(statepoint) if correct_id != job_id: @@ -1406,7 +1437,6 @@ def update_cache(self): including iteration and filtering or selection are expected to be significantly faster after calling this function, especially for large data spaces. - """ logger.info("Update cache...") start = time.time() @@ -1610,7 +1640,7 @@ def get_job(cls, path=None): project = cls.get_project(os.path.join(job_path, os.pardir)) # Return the matched job id from the found project - return project.open_job(id=job_id) + return Job(project=project, id_=job_id, directory_known=True) def __getstate__(self): state = dict(self.__dict__) @@ -1667,7 +1697,9 @@ def __init__(self, project, ids): self._ids_iterator = iter(ids) def __next__(self): - return self._project.open_job(id=next(self._ids_iterator)) + return Job( + project=self._project, id_=next(self._ids_iterator), directory_known=True + ) def __iter__(self): return type(self)(self._project, self._ids) @@ -1681,6 +1713,12 @@ class JobsCursor: Enables simple iteration and grouping operations. + .. warning:: + + `JobsCursor` caches the jobs that match the filter. Call `Project.find_jobs` + again to update the search after making changes to jobs or the workspace + that would change the result of the search. + Parameters ---------- project : :class:`~signac.Project` @@ -1700,6 +1738,42 @@ def __init__(self, project, filter=None): if self._filter == {}: self._filter = None + # Cache for matching ids. + self._id_cache = None + self._id_set_cache = None + + @property + def _ids(self): + """List of job ids that match the filter. + + Populated on first use, then cached in subsequent calls. + + Returns + ------- + list[str] + Job ids that match the filter. + """ + if self._id_cache is None: + self._id_cache = self._project._find_job_ids(self._filter) + + return self._id_cache + + @property + def _id_set(self): + """Set of job ids that match the filter. + + Populated on first use, then cached in subsequent calls. + + Returns + ------- + set[str] + Job ids that match the filter. 
+ """ + if self._id_set_cache is None: + self._id_set_cache = set(self._ids) + + return self._id_set_cache + def __eq__(self, other): return self._project == other._project and self._filter == other._filter @@ -1708,7 +1782,7 @@ def __len__(self): if self._filter: # We use the standard function for determining job ids if and only if # any of the two filter is provided. - return len(self._project._find_job_ids(self._filter)) + return len(self._ids) else: # Without filters, we can simply return the length of the whole project. return len(self._project) @@ -1727,25 +1801,14 @@ def __contains__(self, job): True if the job matches the filter criteria and is initialized for this project. """ - if job not in self._project: - # Exit early if the job is not in the project. This is O(1). - return False if self._filter: - # We use the standard function for determining job ids if a filter - # is provided. This is O(N) and could be optimized by caching the - # ids of state points that match a state point filter. Caching the - # matches for a document filter is not safe because the document - # can change. - return job.id in self._project._find_job_ids(self._filter) - # Without filters, we can simply check if the job is in the project. - # By the early-exit condition, we know the job must be contained. - return True + return job.id in self._id_set + + return job in self._project def __iter__(self): # Code duplication here for improved performance. - return _JobsCursorIterator( - self._project, self._project._find_job_ids(self._filter) - ) + return _JobsCursorIterator(self._project, self._ids) def groupby(self, key=None, default=None): """Group jobs according to one or more state point or document parameters. @@ -1845,7 +1908,7 @@ def keyfunction(job): else: def keyfunction(job): - return job.sp[stripped_key] + return job.cached_statepoint[stripped_key] else: if _is_doc_key(key): @@ -1856,7 +1919,7 @@ def keyfunction(job): else: def keyfunction(job): - return job.sp.get(stripped_key, default) + return job.cached_statepoint.get(stripped_key, default) elif isinstance(key, Iterable): sp_keys = [] @@ -1875,7 +1938,7 @@ def keyfunction(job): def keyfunction(job): return tuple( - [job.sp[k] for k in sp_keys] + [job.cached_statepoint[k] for k in sp_keys] + [job.document[k] for k in doc_keys] ) @@ -1883,7 +1946,7 @@ def keyfunction(job): def keyfunction(job): return tuple( - [job.sp.get(k, default) for k in sp_keys] + [job.cached_statepoint.get(k, default) for k in sp_keys] + [job.document.get(k, default) for k in doc_keys] ) @@ -2002,7 +2065,7 @@ def _export_sp_and_doc(job): tuple with prefixed state point or document key and values. 
""" - for key, value in _flatten(job.statepoint).items(): + for key, value in _flatten(job.cached_statepoint).items(): prefixed_key = sp_prefix + key if usecols(prefixed_key): yield prefixed_key, value diff --git a/tests/test_job.py b/tests/test_job.py index 7cf8d6aa6..20cf7bf69 100644 --- a/tests/test_job.py +++ b/tests/test_job.py @@ -308,7 +308,7 @@ def test_interface_job_identity_change(self): def test_interface_nested_kws(self): with pytest.raises(InvalidKeyError): - job = self.open_job({"a.b.c": 0}) + job = self.open_job({"a.b.c": 0}).statepoint job = self.open_job(dict(a=dict(b=dict(c=2)))) assert job.sp.a.b.c == 2 @@ -492,6 +492,43 @@ class A: with pytest.raises(TypeError): job.doc = {key: "test"} + def test_cached_statepoint_read_only(self): + statepoint = {"a": 0, "b": 1, "dict": {"value": "string"}} + job = self.open_job(statepoint=statepoint) + job.init() + + assert "a" in job.cached_statepoint + assert "b" in job.cached_statepoint + assert "c" not in job.cached_statepoint + assert "dict" in job.cached_statepoint + assert job.cached_statepoint["a"] == 0 + assert job.cached_statepoint["b"] == 1 + assert job.cached_statepoint["dict"] == {"value": "string"} + with pytest.raises(KeyError): + job.cached_statepoint["c"] + assert list(job.cached_statepoint.keys()) == ["a", "b", "dict"] + + with pytest.raises(TypeError): + job.cached_statepoint["c"] = 2 + + def test_cached_statepoint_lazy_init(self): + statepoint = {"a": 0} + job = self.project.open_job(statepoint=statepoint) + job.init() + id_ = job.id + + # Clear the cache to force a lazy load of the cached statepoint + self.project._sp_cache.clear() + job = self.project.open_job(id=id_) + job.cached_statepoint + + def test_no_args_error(self): + with pytest.raises(ValueError): + self.project.open_job() + + with pytest.raises(ValueError): + Job(project=self.project) + class TestConfig(TestJobBase): def test_config_str(self): diff --git a/tests/test_numpy_integration.py b/tests/test_numpy_integration.py index 28275fe85..f64e1ddca 100644 --- a/tests/test_numpy_integration.py +++ b/tests/test_numpy_integration.py @@ -22,6 +22,7 @@ def test_store_number_in_sp_and_doc(self): b = numpy.float64(i) if i % 2 else numpy.float32(i) with pytest.warns(NumpyConversionWarning): job = self.project.open_job(dict(a=a)) + job.statepoint with pytest.warns(NumpyConversionWarning): job.doc.b = b numpy.testing.assert_equal(job.doc.b, b) diff --git a/tests/test_project.py b/tests/test_project.py index da546ede0..2b75e2eb5 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -513,7 +513,11 @@ def test_rename_workspace(self): with pytest.raises(JobsCorruptedError): self.project.check() # ... we reinitialize the initial job, ... - job.init() + try: + job.init() + except JobsCorruptedError: + # ... which raises the JobsCorruptedError in update_cache + pass with pytest.raises(JobsCorruptedError): # ... which means the repair attempt must fail. self.project.repair()