From 6f1c19e851fa0b35dad20663940952e36ea0e566 Mon Sep 17 00:00:00 2001
From: "Andrew S. Rosen" <asrosen93@gmail.com>
Date: Wed, 15 May 2024 18:42:44 -0700
Subject: [PATCH] Store failed calculations with a dedicated naming scheme
 (#2151)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary of Changes

Closes #2016 by storing failed calculations in a renamed
`failed-quacc-...` directory. The only user-facing change here is the
`tmp-quacc-12345` becomes `failed-quacc-12345` upon failure.

### Checklist

- [X] I have read the ["Guidelines"
section](https://quantum-accelerators.github.io/quacc/dev/contributing.html#guidelines)
of the contributing guide. Don't lie! 😉
- [X] My PR is on a custom branch and is _not_ named `main`.
- [X] I have added relevant, comprehensive [unit
tests](https://quantum-accelerators.github.io/quacc/dev/contributing.html#unit-tests).

### Notes

- Your PR will likely not be merged without proper and thorough tests.
- If you are an external contributor, you will see a comment from
[@buildbot-princeton](https://github.com/buildbot-princeton). This is
solely for the maintainers.
- When your code is ready for review, ping one of the [active
maintainers](https://quantum-accelerators.github.io/quacc/about/contributors.html#active-maintainers).

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 docs/user/settings/file_management.md         | 24 +++++++-
 src/quacc/runners/ase.py                      | 59 +++++++++++--------
 src/quacc/runners/prep.py                     | 53 +++++++++++++----
 .../recipes/dftb_recipes/test_dftb_recipes.py | 39 +++++++++---
 tests/core/runners/test_prep.py               | 11 +++-
 tests/core/settings/test_settings.py          |  6 +-
 6 files changed, 141 insertions(+), 51 deletions(-)

diff --git a/docs/user/settings/file_management.md b/docs/user/settings/file_management.md
index 562dcfad81..7aed734cfc 100644
--- a/docs/user/settings/file_management.md
+++ b/docs/user/settings/file_management.md
@@ -38,7 +38,14 @@ RESULTS_DIR
 
 ### Job Failure
 
-If the job fails or does not complete, then the `tmp-quacc-2023-12-08-67890` directory will remain in `RESULTS_DIR` so you can inspect the files.
+If the job fails or does not complete, then the file structure looks like:
+
+```text
+RESULTS_DIR
+├── failed-quacc-2023-12-08-67890
+│   ├── INPUT
+    └── OUTPUT
+```
 
 ## Scenario 2: Specifying a `SCRATCH_DIR`
 
@@ -83,4 +90,17 @@ SCRATCH_DIR
 
 ### Job Failure
 
-If the job fails or does not complete, then the `tmp-quacc-2023-12-08-67890` directory will remain in `SCRATCH_DIR` so you can inspect the files. The symbolic link in `RESULTS_DIR` will also remain.
+If the job fails or does not complete, then the file structure looks like:
+
+```text
+RESULTS_DIR
+├── symlink-failed-quacc-2023-12-08-67890
+│
+```
+
+```text
+SCRATCH_DIR
+├── failed-quacc-2023-12-08-67890
+│   ├── INPUT
+    └── OUTPUT
+```
diff --git a/src/quacc/runners/ase.py b/src/quacc/runners/ase.py
index 79bd5b8a46..9c5ddad73d 100644
--- a/src/quacc/runners/ase.py
+++ b/src/quacc/runners/ase.py
@@ -16,7 +16,7 @@
 
 from quacc import SETTINGS
 from quacc.atoms.core import copy_atoms, get_final_atoms_from_dynamics
-from quacc.runners.prep import calc_cleanup, calc_setup
+from quacc.runners.prep import calc_cleanup, calc_setup, terminate
 from quacc.utils.dicts import recursive_dict_merge
 
 try:
@@ -106,10 +106,13 @@ def run_calc(
     tmpdir, job_results_dir = calc_setup(atoms, copy_files=copy_files)
 
     # Run calculation
-    if get_forces:
-        atoms.get_forces()
-    else:
-        atoms.get_potential_energy()
+    try:
+        if get_forces:
+            atoms.get_forces()
+        else:
+            atoms.get_potential_energy()
+    except Exception as exception:
+        terminate(tmpdir, exception)
 
     # Most ASE calculators do not update the atoms object in-place with a call
     # to .get_potential_energy(), which is important if an internal optimizer is
@@ -233,24 +236,29 @@ def run_opt(
         atoms = FrechetCellFilter(atoms)
 
     # Run optimization
-    with traj, optimizer(atoms, **optimizer_kwargs) as dyn:
-        if optimizer.__name__.startswith("SciPy"):
-            # https://gitlab.com/ase/ase/-/issues/1475
-            dyn.run(fmax=fmax, steps=max_steps, **run_kwargs)
-        else:
-            for i, _ in enumerate(dyn.irun(fmax=fmax, steps=max_steps, **run_kwargs)):
-                if store_intermediate_results:
-                    _copy_intermediate_files(
-                        tmpdir,
-                        i,
-                        files_to_ignore=[
-                            traj_file,
-                            optimizer_kwargs["restart"],
-                            optimizer_kwargs["logfile"],
-                        ],
-                    )
-                if fn_hook:
-                    fn_hook(dyn)
+    try:
+        with traj, optimizer(atoms, **optimizer_kwargs) as dyn:
+            if optimizer.__name__.startswith("SciPy"):
+                # https://gitlab.com/ase/ase/-/issues/1475
+                dyn.run(fmax=fmax, steps=max_steps, **run_kwargs)
+            else:
+                for i, _ in enumerate(
+                    dyn.irun(fmax=fmax, steps=max_steps, **run_kwargs)
+                ):
+                    if store_intermediate_results:
+                        _copy_intermediate_files(
+                            tmpdir,
+                            i,
+                            files_to_ignore=[
+                                traj_file,
+                                optimizer_kwargs["restart"],
+                                optimizer_kwargs["logfile"],
+                            ],
+                        )
+                    if fn_hook:
+                        fn_hook(dyn)
+    except Exception as exception:
+        terminate(tmpdir, exception)
 
     # Store the trajectory atoms
     dyn.traj_atoms = read(traj_file, index=":")
@@ -299,7 +307,10 @@ def run_vib(
 
     # Run calculation
     vib = Vibrations(atoms, name=str(tmpdir / "vib"), **vib_kwargs)
-    vib.run()
+    try:
+        vib.run()
+    except Exception as exception:
+        terminate(tmpdir, exception)
 
     # Summarize run
     vib.summary(log=sys.stdout if SETTINGS.DEBUG else str(tmpdir / "vib_summary.log"))
diff --git a/src/quacc/runners/prep.py b/src/quacc/runners/prep.py
index f5a0189218..3f328729d2 100644
--- a/src/quacc/runners/prep.py
+++ b/src/quacc/runners/prep.py
@@ -68,9 +68,8 @@ def calc_setup(
 
     # Create a symlink to the tmpdir
     if os.name != "nt" and SETTINGS.SCRATCH_DIR:
-        symlink = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}"
-        symlink.unlink(missing_ok=True)
-        symlink.symlink_to(tmpdir, target_is_directory=True)
+        symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}"
+        symlink_path.symlink_to(tmpdir, target_is_directory=True)
 
     # Copy files to tmpdir and decompress them if needed
     if copy_files:
@@ -101,15 +100,13 @@ def calc_cleanup(
         deleted after the calculation is complete.
     job_results_dir
         The path to the job_results_dir, where the files will ultimately be
-        stored. A symlink to the tmpdir will be made here during the calculation
-        for convenience.
+        stored.
 
     Returns
     -------
     None
     """
     job_results_dir, tmpdir = Path(job_results_dir), Path(tmpdir)
-    logger.info(f"Calculation results stored at {job_results_dir}")
 
     # Safety check
     if "tmp-" not in str(tmpdir):
@@ -120,21 +117,51 @@ def calc_cleanup(
     if atoms is not None:
         atoms.calc.directory = job_results_dir
 
-    # Make the results directory
-    job_results_dir.mkdir(parents=True, exist_ok=True)
-
     # Gzip files in tmpdir
     if SETTINGS.GZIP_FILES:
         gzip_dir(tmpdir)
 
     # Move files from tmpdir to job_results_dir
-    for file_name in os.listdir(tmpdir):
-        move(tmpdir / file_name, job_results_dir / file_name)
+    if SETTINGS.CREATE_UNIQUE_DIR:
+        move(tmpdir, job_results_dir)
+    else:
+        for file_name in os.listdir(tmpdir):
+            move(tmpdir / file_name, job_results_dir / file_name)
+        rmtree(tmpdir)
+    logger.info(f"Calculation results stored at {job_results_dir}")
 
     # Remove symlink to tmpdir
     if os.name != "nt" and SETTINGS.SCRATCH_DIR:
         symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}"
         symlink_path.unlink(missing_ok=True)
 
-    # Remove the tmpdir
-    rmtree(tmpdir, ignore_errors=True)
+
+def terminate(tmpdir: Path | str, exception: Exception) -> Exception:
+    """
+    Terminate a calculation and move files to a failed directory.
+
+    Parameters
+    ----------
+    tmpdir
+        The path to the tmpdir, where the calculation was run.
+    exception
+        The exception that caused the calculation to fail.
+
+    Raises
+    -------
+    Exception
+        The exception that caused the calculation to fail.
+    """
+    job_failed_dir = tmpdir.with_name(tmpdir.name.replace("tmp-", "failed-"))
+    tmpdir.rename(job_failed_dir)
+
+    msg = f"Calculation failed! Files stored at {job_failed_dir}"
+    logging.info(msg)
+
+    if os.name != "nt" and SETTINGS.SCRATCH_DIR:
+        old_symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{tmpdir.name}"
+        symlink_path = SETTINGS.RESULTS_DIR / f"symlink-{job_failed_dir.name}"
+        old_symlink_path.unlink(missing_ok=True)
+        symlink_path.symlink_to(job_failed_dir, target_is_directory=True)
+
+    raise exception
diff --git a/tests/core/recipes/dftb_recipes/test_dftb_recipes.py b/tests/core/recipes/dftb_recipes/test_dftb_recipes.py
index e737b60efa..ebbca498ab 100644
--- a/tests/core/recipes/dftb_recipes/test_dftb_recipes.py
+++ b/tests/core/recipes/dftb_recipes/test_dftb_recipes.py
@@ -7,12 +7,18 @@
 DFTBPLUS_EXISTS = bool(which("dftb+"))
 
 pytestmark = pytest.mark.skipif(not DFTBPLUS_EXISTS, reason="Needs DFTB+")
+import logging
+import os
 
 import numpy as np
 from ase.build import bulk, molecule
 
+from quacc import change_settings
 from quacc.recipes.dftb.core import relax_job, static_job
 
+LOGGER = logging.getLogger(__name__)
+LOGGER.propagate = True
+
 
 def test_static_job_water(tmp_path, monkeypatch):
     monkeypatch.chdir(tmp_path)
@@ -160,12 +166,29 @@ def test_relax_job_cu_supercell_errors(tmp_path, monkeypatch):
         relax_job(atoms, kpts=(3, 3, 3), MaxSteps=1, Hamiltonian_MaxSccIterations=100)
 
 
-def test_child_errors(tmp_path, monkeypatch):
+@pytest.mark.skipif(os.name == "nt", reason="symlinking not possible on Windows")
+def test_child_errors(tmp_path, monkeypatch, caplog):
     monkeypatch.chdir(tmp_path)
-    with pytest.raises(RuntimeError, match="failed with command"):
-        atoms = bulk("Cu")
-        static_job(atoms)
-
-    with pytest.raises(RuntimeError, match="failed with command"):
-        atoms = bulk("Cu")
-        relax_job(atoms)
+    atoms = bulk("Cu")
+    with (
+        caplog.at_level(logging.INFO),
+        change_settings({"SCRATCH_DIR": tmp_path / "scratch"}),
+    ):
+        with pytest.raises(RuntimeError, match="failed with command"):
+            static_job(atoms)
+        assert "Calculation failed" in caplog.text
+        assert "failed-quacc-" in " ".join(os.listdir(tmp_path / "scratch"))
+
+
+@pytest.mark.skipif(os.name == "nt", reason="symlinking not possible on Windows")
+def test_child_errors2(tmp_path, monkeypatch, caplog):
+    monkeypatch.chdir(tmp_path)
+    atoms = bulk("Cu")
+    with (
+        caplog.at_level(logging.INFO),
+        change_settings({"SCRATCH_DIR": tmp_path / "scratch"}),
+    ):
+        with pytest.raises(RuntimeError, match="failed with command"):
+            relax_job(atoms)
+        assert "Calculation failed" in caplog.text
+        assert "failed-quacc-" in " ".join(os.listdir(tmp_path / "scratch"))
diff --git a/tests/core/runners/test_prep.py b/tests/core/runners/test_prep.py
index b065b3adf5..ae86fb7b8f 100644
--- a/tests/core/runners/test_prep.py
+++ b/tests/core/runners/test_prep.py
@@ -8,7 +8,7 @@
 from ase.calculators.emt import EMT
 
 from quacc import change_settings
-from quacc.runners.prep import calc_cleanup, calc_setup
+from quacc.runners.prep import calc_cleanup, calc_setup, terminate
 
 
 def make_files():
@@ -176,3 +176,12 @@ def test_calc_cleanup(tmp_path, monkeypatch):
 
     with pytest.raises(ValueError):
         calc_cleanup(atoms, "quacc", SETTINGS.RESULTS_DIR)
+
+
+def test_terminate(tmp_path):
+    p = tmp_path / "tmp-quacc-1234"
+    os.mkdir(p)
+    with pytest.raises(ValueError, match="moo"):
+        terminate(p, ValueError("moo"))
+    assert not p.exists()
+    assert Path(tmp_path, "failed-quacc-1234").exists()
diff --git a/tests/core/settings/test_settings.py b/tests/core/settings/test_settings.py
index bf1f6284be..f3ff803d45 100644
--- a/tests/core/settings/test_settings.py
+++ b/tests/core/settings/test_settings.py
@@ -28,9 +28,9 @@ def test_file(tmp_path, monkeypatch):
 
 def test_store(tmp_path, monkeypatch):
     monkeypatch.chdir(tmp_path)
-    SETTINGS.STORE = MemoryStore()
-    atoms = bulk("Cu")
-    static_job(atoms)
+    with change_settings({"STORE": MemoryStore()}):
+        atoms = bulk("Cu")
+        static_job(atoms)
 
 
 def test_results_dir(tmp_path, monkeypatch):