From 65ff6408f67b288c905c1413fb4908c4e58a5116 Mon Sep 17 00:00:00 2001
From: Sebastiaan Huber
Date: Mon, 23 Jan 2023 17:40:13 +0100
Subject: [PATCH] `SlurmScheduler`: Parse the `NODE_FAIL` state

If a job fails due to a node failure, SLURM will set the job's state to
`NODE_FAIL`. The `SlurmScheduler.parse_output` method is updated to
check for this state, in which case the `ERROR_SCHEDULER_NODE_FAILURE`
exit code is returned. This is a new exit code defined on the `CalcJob`
base class.
---
 aiida/engine/processes/calcjobs/calcjob.py |  3 +++
 aiida/schedulers/plugins/slurm.py          |  3 +++
 tests/schedulers/test_slurm.py             | 14 ++++++++++++++
 3 files changed, 20 insertions(+)

diff --git a/aiida/engine/processes/calcjobs/calcjob.py b/aiida/engine/processes/calcjobs/calcjob.py
index 1a9a331a47..8833074023 100644
--- a/aiida/engine/processes/calcjobs/calcjob.py
+++ b/aiida/engine/processes/calcjobs/calcjob.py
@@ -455,6 +455,9 @@ def define(cls, spec: CalcJobProcessSpec) -> None:  # type: ignore[override]
         spec.exit_code(
             131, 'ERROR_SCHEDULER_INVALID_ACCOUNT', invalidates_cache=True, message='The specified account is invalid.'
         )
+        spec.exit_code(
+            140, 'ERROR_SCHEDULER_NODE_FAILURE', invalidates_cache=True, message='The node running the job failed.'
+        )
         spec.exit_code(150, 'STOPPED_BY_MONITOR', invalidates_cache=True, message='{message}')
 
     @classproperty
diff --git a/aiida/schedulers/plugins/slurm.py b/aiida/schedulers/plugins/slurm.py
index 8be252e3dc..0292e430e2 100644
--- a/aiida/schedulers/plugins/slurm.py
+++ b/aiida/schedulers/plugins/slurm.py
@@ -767,6 +767,9 @@ def parse_output(self, detailed_job_info=None, stdout=None, stderr=None):
             if data['State'] == 'TIMEOUT':
                 return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME
 
+            if data['State'] == 'NODE_FAIL':
+                return CalcJob.exit_codes.ERROR_SCHEDULER_NODE_FAILURE
+
         # Alternatively, if the ``detailed_job_info`` is not defined or hasn't already determined an error, try to match
         # known error messages from the output written to the ``stderr`` descriptor.
         if stderr is not None:
diff --git a/tests/schedulers/test_slurm.py b/tests/schedulers/test_slurm.py
index e661e76a2b..8835a15342 100644
--- a/tests/schedulers/test_slurm.py
+++ b/tests/schedulers/test_slurm.py
@@ -431,6 +431,20 @@ def test_parse_out_of_memory():
     assert exit_code == CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY  # pylint: disable=no-member
 
 
+def test_parse_node_failure():
+    """Test that `ERROR_SCHEDULER_NODE_FAILURE` code is returned if `STATE == NODE_FAIL`."""
+    scheduler = SlurmScheduler()
+    detailed_job_info = {
+        'retval': 0,
+        'stderr': '',
+        'stdout': """||||||||||||||||||||||||||||||||||||||||||||||||||
+        |||||||||||||||||||||||||||||||||||||||||NODE_FAIL|||||||||"""
+    }  # yapf: disable
+
+    exit_code = scheduler.parse_output(detailed_job_info, '', '')
+    assert exit_code == CalcJob.exit_codes.ERROR_SCHEDULER_NODE_FAILURE  # pylint: disable=no-member
+
+
 @pytest.mark.parametrize('detailed_job_info, expected', [
     ('string', TypeError),  # Not a dictionary
     ({'stderr': ''}, ValueError),  # Key `stdout` missing