Skip to content

Commit

Permalink
Store failed calculations with a dedicated naming scheme (#2151)
Browse files Browse the repository at this point in the history
## Summary of Changes

Closes #2016 by storing failed calculations in a renamed
`failed-quacc-...` directory. The only user-facing change here is the
`tmp-quacc-12345` becomes `failed-quacc-12345` upon failure.

### Checklist

- [X] I have read the ["Guidelines"
section](https://quantum-accelerators.github.io/quacc/dev/contributing.html#guidelines)
of the contributing guide. Don't lie! 😉
- [X] My PR is on a custom branch and is _not_ named `main`.
- [X] I have added relevant, comprehensive [unit
tests](https://quantum-accelerators.github.io/quacc/dev/contributing.html#unit-tests).

### Notes

- Your PR will likely not be merged without proper and thorough tests.
- If you are an external contributor, you will see a comment from
[@buildbot-princeton](https://github.com/buildbot-princeton). This is
solely for the maintainers.
- When your code is ready for review, ping one of the [active
maintainers](https://quantum-accelerators.github.io/quacc/about/contributors.html#active-maintainers).

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
Andrew-S-Rosen and pre-commit-ci[bot] authored May 16, 2024
1 parent 7c9dba5 commit 6f1c19e
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 51 deletions.
24 changes: 22 additions & 2 deletions docs/user/settings/file_management.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,14 @@ RESULTS_DIR

### Job Failure

If the job fails or does not complete, then the `tmp-quacc-2023-12-08-67890` directory will remain in `RESULTS_DIR` so you can inspect the files.
If the job fails or does not complete, then the file structure looks like:

```text
RESULTS_DIR
├── failed-quacc-2023-12-08-67890
│ ├── INPUT
└── OUTPUT
```

## Scenario 2: Specifying a `SCRATCH_DIR`

Expand Down Expand Up @@ -83,4 +90,17 @@ SCRATCH_DIR

### Job Failure

If the job fails or does not complete, then the `tmp-quacc-2023-12-08-67890` directory will remain in `SCRATCH_DIR` so you can inspect the files. The symbolic link in `RESULTS_DIR` will also remain.
If the job fails or does not complete, then the file structure looks like:

```text
RESULTS_DIR
├── symlink-failed-quacc-2023-12-08-67890
```

```text
SCRATCH_DIR
├── failed-quacc-2023-12-08-67890
│ ├── INPUT
└── OUTPUT
```
59 changes: 35 additions & 24 deletions src/quacc/runners/ase.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from quacc import SETTINGS
from quacc.atoms.core import copy_atoms, get_final_atoms_from_dynamics
from quacc.runners.prep import calc_cleanup, calc_setup
from quacc.runners.prep import calc_cleanup, calc_setup, terminate
from quacc.utils.dicts import recursive_dict_merge

try:
Expand Down Expand Up @@ -106,10 +106,13 @@ def run_calc(
tmpdir, job_results_dir = calc_setup(atoms, copy_files=copy_files)

# Run calculation
if get_forces:
atoms.get_forces()
else:
atoms.get_potential_energy()
try:
if get_forces:
atoms.get_forces()
else:
atoms.get_potential_energy()
except Exception as exception:
terminate(tmpdir, exception)

# Most ASE calculators do not update the atoms object in-place with a call
# to .get_potential_energy(), which is important if an internal optimizer is
Expand Down Expand Up @@ -233,24 +236,29 @@ def run_opt(
atoms = FrechetCellFilter(atoms)

# Run optimization
with traj, optimizer(atoms, **optimizer_kwargs) as dyn:
if optimizer.__name__.startswith("SciPy"):
# https://gitlab.com/ase/ase/-/issues/1475
dyn.run(fmax=fmax, steps=max_steps, **run_kwargs)
else:
for i, _ in enumerate(dyn.irun(fmax=fmax, steps=max_steps, **run_kwargs)):
if store_intermediate_results:
_copy_intermediate_files(
tmpdir,
i,
files_to_ignore=[
traj_file,
optimizer_kwargs["restart"],
optimizer_kwargs["logfile"],
],
)
if fn_hook:
fn_hook(dyn)
try:
with traj, optimizer(atoms, **optimizer_kwargs) as dyn:
if optimizer.__name__.startswith("SciPy"):
# https://gitlab.com/ase/ase/-/issues/1475
dyn.run(fmax=fmax, steps=max_steps, **run_kwargs)
else:
for i, _ in enumerate(
dyn.irun(fmax=fmax, steps=max_steps, **run_kwargs)
):
if store_intermediate_results:
_copy_intermediate_files(
tmpdir,
i,
files_to_ignore=[
traj_file,
optimizer_kwargs["restart"],
optimizer_kwargs["logfile"],
],
)
if fn_hook:
fn_hook(dyn)
except Exception as exception:
terminate(tmpdir, exception)

# Store the trajectory atoms
dyn.traj_atoms = read(traj_file, index=":")
Expand Down Expand Up @@ -299,7 +307,10 @@ def run_vib(

# Run calculation
vib = Vibrations(atoms, name=str(tmpdir / "vib"), **vib_kwargs)
vib.run()
try:
vib.run()
except Exception as exception:
terminate(tmpdir, exception)

# Summarize run
vib.summary(log=sys.stdout if SETTINGS.DEBUG else str(tmpdir / "vib_summary.log"))
Expand Down
53 changes: 40 additions & 13 deletions src/quacc/runners/prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,8 @@ def calc_setup(

# Create a symlink to the tmpdir
if os.name != "nt" and SETTINGS.SCRATCH_DIR:
symlink = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}"
symlink.unlink(missing_ok=True)
symlink.symlink_to(tmpdir, target_is_directory=True)
symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}"
symlink_path.symlink_to(tmpdir, target_is_directory=True)

# Copy files to tmpdir and decompress them if needed
if copy_files:
Expand Down Expand Up @@ -101,15 +100,13 @@ def calc_cleanup(
deleted after the calculation is complete.
job_results_dir
The path to the job_results_dir, where the files will ultimately be
stored. A symlink to the tmpdir will be made here during the calculation
for convenience.
stored.
Returns
-------
None
"""
job_results_dir, tmpdir = Path(job_results_dir), Path(tmpdir)
logger.info(f"Calculation results stored at {job_results_dir}")

# Safety check
if "tmp-" not in str(tmpdir):
Expand All @@ -120,21 +117,51 @@ def calc_cleanup(
if atoms is not None:
atoms.calc.directory = job_results_dir

# Make the results directory
job_results_dir.mkdir(parents=True, exist_ok=True)

# Gzip files in tmpdir
if SETTINGS.GZIP_FILES:
gzip_dir(tmpdir)

# Move files from tmpdir to job_results_dir
for file_name in os.listdir(tmpdir):
move(tmpdir / file_name, job_results_dir / file_name)
if SETTINGS.CREATE_UNIQUE_DIR:
move(tmpdir, job_results_dir)
else:
for file_name in os.listdir(tmpdir):
move(tmpdir / file_name, job_results_dir / file_name)
rmtree(tmpdir)
logger.info(f"Calculation results stored at {job_results_dir}")

# Remove symlink to tmpdir
if os.name != "nt" and SETTINGS.SCRATCH_DIR:
symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}"
symlink_path.unlink(missing_ok=True)

# Remove the tmpdir
rmtree(tmpdir, ignore_errors=True)

def terminate(tmpdir: Path | str, exception: Exception) -> Exception:
"""
Terminate a calculation and move files to a failed directory.
Parameters
----------
tmpdir
The path to the tmpdir, where the calculation was run.
exception
The exception that caused the calculation to fail.
Raises
-------
Exception
The exception that caused the calculation to fail.
"""
job_failed_dir = tmpdir.with_name(tmpdir.name.replace("tmp-", "failed-"))
tmpdir.rename(job_failed_dir)

msg = f"Calculation failed! Files stored at {job_failed_dir}"
logging.info(msg)

if os.name != "nt" and SETTINGS.SCRATCH_DIR:
old_symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}"
symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{job_failed_dir.name}"
old_symlink_path.unlink(missing_ok=True)
symlink_path.symlink_to(job_failed_dir, target_is_directory=True)

raise exception
39 changes: 31 additions & 8 deletions tests/core/recipes/dftb_recipes/test_dftb_recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,18 @@
DFTBPLUS_EXISTS = bool(which("dftb+"))

pytestmark = pytest.mark.skipif(not DFTBPLUS_EXISTS, reason="Needs DFTB+")
import logging
import os

import numpy as np
from ase.build import bulk, molecule

from quacc import change_settings
from quacc.recipes.dftb.core import relax_job, static_job

LOGGER = logging.getLogger(__name__)
LOGGER.propagate = True


def test_static_job_water(tmp_path, monkeypatch):
monkeypatch.chdir(tmp_path)
Expand Down Expand Up @@ -160,12 +166,29 @@ def test_relax_job_cu_supercell_errors(tmp_path, monkeypatch):
relax_job(atoms, kpts=(3, 3, 3), MaxSteps=1, Hamiltonian_MaxSccIterations=100)


def test_child_errors(tmp_path, monkeypatch):
@pytest.mark.skipif(os.name == "nt", reason="symlinking not possible on Windows")
def test_child_errors(tmp_path, monkeypatch, caplog):
monkeypatch.chdir(tmp_path)
with pytest.raises(RuntimeError, match="failed with command"):
atoms = bulk("Cu")
static_job(atoms)

with pytest.raises(RuntimeError, match="failed with command"):
atoms = bulk("Cu")
relax_job(atoms)
atoms = bulk("Cu")
with (
caplog.at_level(logging.INFO),
change_settings({"SCRATCH_DIR": tmp_path / "scratch"}),
):
with pytest.raises(RuntimeError, match="failed with command"):
static_job(atoms)
assert "Calculation failed" in caplog.text
assert "failed-quacc-" in " ".join(os.listdir(tmp_path / "scratch"))


@pytest.mark.skipif(os.name == "nt", reason="symlinking not possible on Windows")
def test_child_errors2(tmp_path, monkeypatch, caplog):
monkeypatch.chdir(tmp_path)
atoms = bulk("Cu")
with (
caplog.at_level(logging.INFO),
change_settings({"SCRATCH_DIR": tmp_path / "scratch"}),
):
with pytest.raises(RuntimeError, match="failed with command"):
relax_job(atoms)
assert "Calculation failed" in caplog.text
assert "failed-quacc-" in " ".join(os.listdir(tmp_path / "scratch"))
11 changes: 10 additions & 1 deletion tests/core/runners/test_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ase.calculators.emt import EMT

from quacc import change_settings
from quacc.runners.prep import calc_cleanup, calc_setup
from quacc.runners.prep import calc_cleanup, calc_setup, terminate


def make_files():
Expand Down Expand Up @@ -176,3 +176,12 @@ def test_calc_cleanup(tmp_path, monkeypatch):

with pytest.raises(ValueError):
calc_cleanup(atoms, "quacc", SETTINGS.RESULTS_DIR)


def test_terminate(tmp_path):
p = tmp_path / "tmp-quacc-1234"
os.mkdir(p)
with pytest.raises(ValueError, match="moo"):
terminate(p, ValueError("moo"))
assert not p.exists()
assert Path(tmp_path, "failed-quacc-1234").exists()
6 changes: 3 additions & 3 deletions tests/core/settings/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ def test_file(tmp_path, monkeypatch):

def test_store(tmp_path, monkeypatch):
monkeypatch.chdir(tmp_path)
SETTINGS.STORE = MemoryStore()
atoms = bulk("Cu")
static_job(atoms)
with change_settings({"STORE": MemoryStore()}):
atoms = bulk("Cu")
static_job(atoms)


def test_results_dir(tmp_path, monkeypatch):
Expand Down

0 comments on commit 6f1c19e

Please sign in to comment.