Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance #975

Merged
merged 23 commits into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ceb8c9e
Make additional use of the statepoint cache.
joaander Feb 7, 2024
19cc47a
Make JobsCursor.__len__ and .__contains__ O(1).
joaander Feb 7, 2024
f7dc01f
Add validate_statepoint argument to Job.init()
joaander Feb 7, 2024
8e6e2fc
Rename statepoint_dict to statepoint_mapping.
joaander Feb 7, 2024
df2b495
Read the cache from disk in `open_job`.
joaander Feb 7, 2024
e20194f
Restore cache miss logger level to debug.
joaander Feb 7, 2024
712ed09
Instantiate Job by id directly when iterating over ids.
joaander Feb 7, 2024
15c2d60
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 7, 2024
99aaf28
Add statepoint_mapping test.
joaander Feb 8, 2024
57ef8a9
Pass information about the Job directories' existence from Project to …
joaander Feb 8, 2024
04fca6e
Populate _statepoint_mapping in additional code paths.
joaander Feb 8, 2024
4f135b6
Increase test coverage.
joaander Feb 8, 2024
c95ac0b
Update change log.
joaander Feb 8, 2024
f02beb5
Rename statepoint_mapping to cached_statepoint.
joaander Feb 9, 2024
32b8fa9
Doc fixes.
joaander Feb 9, 2024
e641333
Update code comments
cbkerr Feb 9, 2024
60e76a2
Use cached_statepoint in to_dataframe.
joaander Feb 9, 2024
7578993
Restore iteration order.
joaander Feb 12, 2024
4ebf74f
Validate cached_statepoint when read from disk.
joaander Feb 12, 2024
1de7155
Use cached_statepoint in groupby.
joaander Feb 12, 2024
d48d281
Remove validate argument from update_cache.
joaander Feb 12, 2024
9eb658f
Write state point as two words in doc strings
cbkerr Feb 13, 2024
6639566
Merge branch 'main' into improve-performance
cbkerr Feb 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion changelog.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,23 @@ The **signac** package follows `semantic versioning <https://semver.org/>`_.
Version 2
=========

[2.2.0] -- 2023-xx-xx
[2.2.0] -- 2024-xx-xx
---------------------

Added
+++++

- Official support for Python 3.12 (#957).
- ``Job.statepoint_mapping`` - cached and read-only access to job statepoints. Faster than
``Job.statepoint`` (#975).

Changed
+++++++

- Restrict allowable tar file features in Python 3.12 (#957).
- Linked views can now contain spaces and other characters except directory separators (#926).
- Linked views can now be created on Windows if 'Developer mode' is enabled (#430).
- Increase performance for many usage patterns (#975).

Fixed
+++++
Expand Down
108 changes: 87 additions & 21 deletions signac/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import shutil
from copy import deepcopy
from threading import RLock
from types import MappingProxyType
from typing import FrozenSet

from synced_collections.backends.collection_json import (
Expand Down Expand Up @@ -248,7 +249,8 @@ class Job:

Jobs can be opened by ``statepoint`` or ``id_``. If both values are
provided, it is the user's responsibility to ensure that the values
correspond.
correspond. Set ``directory_known`` to ``True`` when the job directory
is known to exist - this skips some expensive isdir checks.

Parameters
----------
Expand All @@ -258,6 +260,8 @@ class Job:
State point for the job. (Default value = None)
id_ : str, optional
The job identifier. (Default value = None)
directory_known : bool, optional
Set to true when the job directory is known to exist. (Default value = False)

"""

Expand All @@ -274,30 +278,34 @@ class Job:
KEY_DATA = "signac_data"
"The job's datastore key."

def __init__(self, project, statepoint=None, id_=None):
def __init__(self, project, statepoint=None, id_=None, directory_known=False):
self._project = project
self._lock = RLock()
self._initialize_lazy_properties()
self._directory_known = directory_known

if statepoint is None and id_ is None:
raise ValueError("Either statepoint or id_ must be provided.")
elif statepoint is not None:
self._statepoint_requires_init = False
try:
self._id = calc_id(statepoint) if id_ is None else id_
except TypeError:
raise KeyTypeError
self._statepoint = _StatePointDict(
jobs=[self], filename=self._statepoint_filename, data=statepoint
)

# Update the project's state point cache immediately if opened by state point
self._project._register(self.id, statepoint)
self._statepoint_mapping = statepoint
self._statepoint_requires_init = True
else:
# Only an id was provided. State point will be loaded lazily.
self._id = id_
self._statepoint_requires_init = True

# Fetch the statepoint mapping from the project's cache. Don't load it
# from disk on a cache miss (will be loaded on demand).
try:
self._statepoint_mapping = project._sp_cache[id_]
except KeyError:
self._statepoint_mapping = None

def _initialize_lazy_properties(self):
"""Initialize all properties that are designed to be loaded lazily."""
with self._lock:
Expand Down Expand Up @@ -334,7 +342,7 @@ def __str__(self):

def __repr__(self):
return "{}(project={}, statepoint={})".format(
self.__class__.__name__, repr(self._project), self.statepoint
self.__class__.__name__, repr(self._project), self.statepoint_mapping
)

@property
Expand Down Expand Up @@ -406,6 +414,32 @@ def update_statepoint(self, update, overwrite=False):
statepoint.update(update)
self.statepoint = statepoint

@property
def statepoint_mapping(self):
"""Get a read-only view of the job's statepoint as a mapping.
cbkerr marked this conversation as resolved.
Show resolved Hide resolved

`statepoint_mapping` uses the statepoint cache to provide fast access to the
job's statepoint for reading.

.. note::

Create and update the statepoint cache with ``signac update-cache`` on the
command line.
joaander marked this conversation as resolved.
Show resolved Hide resolved

.. seealso::

Use `statepoint` to modify the job's statepoint.

Returns
-------
Mapping
Returns the job's state point.
"""
if self._statepoint_mapping is None:
self._statepoint_mapping = self._project._get_statepoint(self._id)

return MappingProxyType(self._statepoint_mapping)

@property
def statepoint(self):
"""Get or set the job's state point.
Expand All @@ -416,6 +450,10 @@ def statepoint(self):
`Modifying the State Point
<https://docs.signac.io/en/latest/jobs.html#modifying-the-state-point>`_.

.. tip::

Use `statepoint_mapping` for faster access to read the statepoint.

.. warning::

The state point object behaves like a dictionary in most cases,
Expand Down Expand Up @@ -443,14 +481,25 @@ def statepoint(self):
"""
with self._lock:
if self._statepoint_requires_init:
# Load state point data lazily (on access).
self._statepoint = _StatePointDict(
jobs=[self], filename=self._statepoint_filename
)
statepoint = self._statepoint.load(self.id)
if self._statepoint_mapping is None:
# Load state point data lazily (on access).
self._statepoint = _StatePointDict(
jobs=[self],
filename=self._statepoint_filename,
)
statepoint = self._statepoint.load(self.id)

# Update the project's state point cache when loaded lazily
self._project._register(self.id, statepoint)
self._statepoint_mapping = statepoint
else:
# Create _StatePointDict lazily with a known statepoint dict.
self._statepoint = _StatePointDict(
jobs=[self],
filename=self._statepoint_filename,
data=self._statepoint_mapping,
)

# Update the project's state point cache when loaded lazily
self._project._register(self.id, statepoint)
self._statepoint_requires_init = False

return self._statepoint
Expand Down Expand Up @@ -510,7 +559,7 @@ def document(self):
"""
with self._lock:
if self._document is None:
self.init()
self.init(validate_statepoint=False)
fn_doc = os.path.join(self.path, self.FN_DOCUMENT)
self._document = BufferedJSONAttrDict(
filename=fn_doc, write_concern=True
Expand Down Expand Up @@ -591,9 +640,9 @@ def stores(self):
"""
with self._lock:
if self._stores is None:
self.init()
self.init(validate_statepoint=False)
self._stores = H5StoreManager(self.path)
return self.init()._stores
return self._stores

@property
def data(self):
Expand Down Expand Up @@ -640,7 +689,7 @@ def project(self):
"""
return self._project

def init(self, force=False):
def init(self, force=False, validate_statepoint=True):
"""Initialize the job's workspace directory.

This function will do nothing if the directory and the job state point
Expand All @@ -656,6 +705,10 @@ def init(self, force=False):
Overwrite any existing state point files, e.g., to repair them if
they got corrupted (Default value = False).

validate_statepoint : bool, optional
When True (the default), load the job statepoint and ensure that it matches
cbkerr marked this conversation as resolved.
Show resolved Hide resolved
the id. When False, exit early when the job directory exists.

Returns
-------
Job
Expand All @@ -671,6 +724,15 @@ def init(self, force=False):
"""
with self._lock:
try:
# Fast early exit when not validating.
if not validate_statepoint:
if self._directory_known:
return self

if os.path.isdir(self.path):
self._directory_known = True
return self

# Attempt early exit if the state point file exists and is valid.
try:
statepoint = self.statepoint.load(self.id)
Expand All @@ -687,6 +749,8 @@ def init(self, force=False):
)
raise

self._directory_known = True

# The state point save will not overwrite an existing file on
# disk unless force is True, so the subsequent load will catch
# when a preexisting invalid file was present.
Expand Down Expand Up @@ -760,6 +824,8 @@ def remove(self):
self._document = None
self._stores = None

self._directory_known = False

def move(self, project):
"""Move this job to project.

Expand Down Expand Up @@ -899,7 +965,7 @@ def open(self):

"""
self._cwd.append(os.getcwd())
self.init()
self.init(validate_statepoint=False)
logger.info(f"Enter workspace '{self.path}'.")
os.chdir(self.path)

Expand Down
Loading
Loading