From 6f1c19e851fa0b35dad20663940952e36ea0e566 Mon Sep 17 00:00:00 2001 From: "Andrew S. Rosen" Date: Wed, 15 May 2024 18:42:44 -0700 Subject: [PATCH] Store failed calculations with a dedicated naming scheme (#2151) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary of Changes Closes #2016 by storing failed calculations in a renamed `failed-quacc-...` directory. The only user-facing change here is the `tmp-quacc-12345` becomes `failed-quacc-12345` upon failure. ### Checklist - [X] I have read the ["Guidelines" section](https://quantum-accelerators.github.io/quacc/dev/contributing.html#guidelines) of the contributing guide. Don't lie! 😉 - [X] My PR is on a custom branch and is _not_ named `main`. - [X] I have added relevant, comprehensive [unit tests](https://quantum-accelerators.github.io/quacc/dev/contributing.html#unit-tests). ### Notes - Your PR will likely not be merged without proper and thorough tests. - If you are an external contributor, you will see a comment from [@buildbot-princeton](https://github.com/buildbot-princeton). This is solely for the maintainers. - When your code is ready for review, ping one of the [active maintainers](https://quantum-accelerators.github.io/quacc/about/contributors.html#active-maintainers). --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/user/settings/file_management.md | 24 +++++++- src/quacc/runners/ase.py | 59 +++++++++++-------- src/quacc/runners/prep.py | 53 +++++++++++++---- .../recipes/dftb_recipes/test_dftb_recipes.py | 39 +++++++++--- tests/core/runners/test_prep.py | 11 +++- tests/core/settings/test_settings.py | 6 +- 6 files changed, 141 insertions(+), 51 deletions(-) diff --git a/docs/user/settings/file_management.md b/docs/user/settings/file_management.md index 562dcfad81..7aed734cfc 100644 --- a/docs/user/settings/file_management.md +++ b/docs/user/settings/file_management.md @@ -38,7 +38,14 @@ RESULTS_DIR ### Job Failure -If the job fails or does not complete, then the `tmp-quacc-2023-12-08-67890` directory will remain in `RESULTS_DIR` so you can inspect the files. +If the job fails or does not complete, then the file structure looks like: + +```text +RESULTS_DIR +├── failed-quacc-2023-12-08-67890 +│ ├── INPUT + └── OUTPUT +``` ## Scenario 2: Specifying a `SCRATCH_DIR` @@ -83,4 +90,17 @@ SCRATCH_DIR ### Job Failure -If the job fails or does not complete, then the `tmp-quacc-2023-12-08-67890` directory will remain in `SCRATCH_DIR` so you can inspect the files. The symbolic link in `RESULTS_DIR` will also remain. +If the job fails or does not complete, then the file structure looks like: + +```text +RESULTS_DIR +├── symlink-failed-quacc-2023-12-08-67890 +│ +``` + +```text +SCRATCH_DIR +├── failed-quacc-2023-12-08-67890 +│ ├── INPUT + └── OUTPUT +``` diff --git a/src/quacc/runners/ase.py b/src/quacc/runners/ase.py index 79bd5b8a46..9c5ddad73d 100644 --- a/src/quacc/runners/ase.py +++ b/src/quacc/runners/ase.py @@ -16,7 +16,7 @@ from quacc import SETTINGS from quacc.atoms.core import copy_atoms, get_final_atoms_from_dynamics -from quacc.runners.prep import calc_cleanup, calc_setup +from quacc.runners.prep import calc_cleanup, calc_setup, terminate from quacc.utils.dicts import recursive_dict_merge try: @@ -106,10 +106,13 @@ def run_calc( tmpdir, job_results_dir = calc_setup(atoms, copy_files=copy_files) # Run calculation - if get_forces: - atoms.get_forces() - else: - atoms.get_potential_energy() + try: + if get_forces: + atoms.get_forces() + else: + atoms.get_potential_energy() + except Exception as exception: + terminate(tmpdir, exception) # Most ASE calculators do not update the atoms object in-place with a call # to .get_potential_energy(), which is important if an internal optimizer is @@ -233,24 +236,29 @@ def run_opt( atoms = FrechetCellFilter(atoms) # Run optimization - with traj, optimizer(atoms, **optimizer_kwargs) as dyn: - if optimizer.__name__.startswith("SciPy"): - # https://gitlab.com/ase/ase/-/issues/1475 - dyn.run(fmax=fmax, steps=max_steps, **run_kwargs) - else: - for i, _ in enumerate(dyn.irun(fmax=fmax, steps=max_steps, **run_kwargs)): - if store_intermediate_results: - _copy_intermediate_files( - tmpdir, - i, - files_to_ignore=[ - traj_file, - optimizer_kwargs["restart"], - optimizer_kwargs["logfile"], - ], - ) - if fn_hook: - fn_hook(dyn) + try: + with traj, optimizer(atoms, **optimizer_kwargs) as dyn: + if optimizer.__name__.startswith("SciPy"): + # https://gitlab.com/ase/ase/-/issues/1475 + dyn.run(fmax=fmax, steps=max_steps, **run_kwargs) + else: + for i, _ in enumerate( + dyn.irun(fmax=fmax, steps=max_steps, **run_kwargs) + ): + if store_intermediate_results: + _copy_intermediate_files( + tmpdir, + i, + files_to_ignore=[ + traj_file, + optimizer_kwargs["restart"], + optimizer_kwargs["logfile"], + ], + ) + if fn_hook: + fn_hook(dyn) + except Exception as exception: + terminate(tmpdir, exception) # Store the trajectory atoms dyn.traj_atoms = read(traj_file, index=":") @@ -299,7 +307,10 @@ def run_vib( # Run calculation vib = Vibrations(atoms, name=str(tmpdir / "vib"), **vib_kwargs) - vib.run() + try: + vib.run() + except Exception as exception: + terminate(tmpdir, exception) # Summarize run vib.summary(log=sys.stdout if SETTINGS.DEBUG else str(tmpdir / "vib_summary.log")) diff --git a/src/quacc/runners/prep.py b/src/quacc/runners/prep.py index f5a0189218..3f328729d2 100644 --- a/src/quacc/runners/prep.py +++ b/src/quacc/runners/prep.py @@ -68,9 +68,8 @@ def calc_setup( # Create a symlink to the tmpdir if os.name != "nt" and SETTINGS.SCRATCH_DIR: - symlink = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}" - symlink.unlink(missing_ok=True) - symlink.symlink_to(tmpdir, target_is_directory=True) + symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}" + symlink_path.symlink_to(tmpdir, target_is_directory=True) # Copy files to tmpdir and decompress them if needed if copy_files: @@ -101,15 +100,13 @@ def calc_cleanup( deleted after the calculation is complete. job_results_dir The path to the job_results_dir, where the files will ultimately be - stored. A symlink to the tmpdir will be made here during the calculation - for convenience. + stored. Returns ------- None """ job_results_dir, tmpdir = Path(job_results_dir), Path(tmpdir) - logger.info(f"Calculation results stored at {job_results_dir}") # Safety check if "tmp-" not in str(tmpdir): @@ -120,21 +117,51 @@ def calc_cleanup( if atoms is not None: atoms.calc.directory = job_results_dir - # Make the results directory - job_results_dir.mkdir(parents=True, exist_ok=True) - # Gzip files in tmpdir if SETTINGS.GZIP_FILES: gzip_dir(tmpdir) # Move files from tmpdir to job_results_dir - for file_name in os.listdir(tmpdir): - move(tmpdir / file_name, job_results_dir / file_name) + if SETTINGS.CREATE_UNIQUE_DIR: + move(tmpdir, job_results_dir) + else: + for file_name in os.listdir(tmpdir): + move(tmpdir / file_name, job_results_dir / file_name) + rmtree(tmpdir) + logger.info(f"Calculation results stored at {job_results_dir}") # Remove symlink to tmpdir if os.name != "nt" and SETTINGS.SCRATCH_DIR: symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}" symlink_path.unlink(missing_ok=True) - # Remove the tmpdir - rmtree(tmpdir, ignore_errors=True) + +def terminate(tmpdir: Path | str, exception: Exception) -> Exception: + """ + Terminate a calculation and move files to a failed directory. + + Parameters + ---------- + tmpdir + The path to the tmpdir, where the calculation was run. + exception + The exception that caused the calculation to fail. + + Raises + ------- + Exception + The exception that caused the calculation to fail. + """ + job_failed_dir = tmpdir.with_name(tmpdir.name.replace("tmp-", "failed-")) + tmpdir.rename(job_failed_dir) + + msg = f"Calculation failed! Files stored at {job_failed_dir}" + logging.info(msg) + + if os.name != "nt" and SETTINGS.SCRATCH_DIR: + old_symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}" + symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{job_failed_dir.name}" + old_symlink_path.unlink(missing_ok=True) + symlink_path.symlink_to(job_failed_dir, target_is_directory=True) + + raise exception diff --git a/tests/core/recipes/dftb_recipes/test_dftb_recipes.py b/tests/core/recipes/dftb_recipes/test_dftb_recipes.py index e737b60efa..ebbca498ab 100644 --- a/tests/core/recipes/dftb_recipes/test_dftb_recipes.py +++ b/tests/core/recipes/dftb_recipes/test_dftb_recipes.py @@ -7,12 +7,18 @@ DFTBPLUS_EXISTS = bool(which("dftb+")) pytestmark = pytest.mark.skipif(not DFTBPLUS_EXISTS, reason="Needs DFTB+") +import logging +import os import numpy as np from ase.build import bulk, molecule +from quacc import change_settings from quacc.recipes.dftb.core import relax_job, static_job +LOGGER = logging.getLogger(__name__) +LOGGER.propagate = True + def test_static_job_water(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) @@ -160,12 +166,29 @@ def test_relax_job_cu_supercell_errors(tmp_path, monkeypatch): relax_job(atoms, kpts=(3, 3, 3), MaxSteps=1, Hamiltonian_MaxSccIterations=100) -def test_child_errors(tmp_path, monkeypatch): +@pytest.mark.skipif(os.name == "nt", reason="symlinking not possible on Windows") +def test_child_errors(tmp_path, monkeypatch, caplog): monkeypatch.chdir(tmp_path) - with pytest.raises(RuntimeError, match="failed with command"): - atoms = bulk("Cu") - static_job(atoms) - - with pytest.raises(RuntimeError, match="failed with command"): - atoms = bulk("Cu") - relax_job(atoms) + atoms = bulk("Cu") + with ( + caplog.at_level(logging.INFO), + change_settings({"SCRATCH_DIR": tmp_path / "scratch"}), + ): + with pytest.raises(RuntimeError, match="failed with command"): + static_job(atoms) + assert "Calculation failed" in caplog.text + assert "failed-quacc-" in " ".join(os.listdir(tmp_path / "scratch")) + + +@pytest.mark.skipif(os.name == "nt", reason="symlinking not possible on Windows") +def test_child_errors2(tmp_path, monkeypatch, caplog): + monkeypatch.chdir(tmp_path) + atoms = bulk("Cu") + with ( + caplog.at_level(logging.INFO), + change_settings({"SCRATCH_DIR": tmp_path / "scratch"}), + ): + with pytest.raises(RuntimeError, match="failed with command"): + relax_job(atoms) + assert "Calculation failed" in caplog.text + assert "failed-quacc-" in " ".join(os.listdir(tmp_path / "scratch")) diff --git a/tests/core/runners/test_prep.py b/tests/core/runners/test_prep.py index b065b3adf5..ae86fb7b8f 100644 --- a/tests/core/runners/test_prep.py +++ b/tests/core/runners/test_prep.py @@ -8,7 +8,7 @@ from ase.calculators.emt import EMT from quacc import change_settings -from quacc.runners.prep import calc_cleanup, calc_setup +from quacc.runners.prep import calc_cleanup, calc_setup, terminate def make_files(): @@ -176,3 +176,12 @@ def test_calc_cleanup(tmp_path, monkeypatch): with pytest.raises(ValueError): calc_cleanup(atoms, "quacc", SETTINGS.RESULTS_DIR) + + +def test_terminate(tmp_path): + p = tmp_path / "tmp-quacc-1234" + os.mkdir(p) + with pytest.raises(ValueError, match="moo"): + terminate(p, ValueError("moo")) + assert not p.exists() + assert Path(tmp_path, "failed-quacc-1234").exists() diff --git a/tests/core/settings/test_settings.py b/tests/core/settings/test_settings.py index bf1f6284be..f3ff803d45 100644 --- a/tests/core/settings/test_settings.py +++ b/tests/core/settings/test_settings.py @@ -28,9 +28,9 @@ def test_file(tmp_path, monkeypatch): def test_store(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) - SETTINGS.STORE = MemoryStore() - atoms = bulk("Cu") - static_job(atoms) + with change_settings({"STORE": MemoryStore()}): + atoms = bulk("Cu") + static_job(atoms) def test_results_dir(tmp_path, monkeypatch):