Skip to content

Commit

Permalink
SlurmScheduler: Parse the NODE_FAIL state (#5866)
Browse files Browse the repository at this point in the history
If a job fails due to a node failure, SLURM will set the job's state to
`NODE_FAIL`. The `SlurmScheduler.parse_output` method is updated to
check for this state, in which case the `ERROR_SCHEDULER_NODE_FAILURE`
exit code is returned. This is a new exit code defined on the `CalcJob`
base class.
  • Loading branch information
sphuber authored Jan 25, 2023
1 parent e83bacc commit 65c1b32
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 0 deletions.
3 changes: 3 additions & 0 deletions aiida/engine/processes/calcjobs/calcjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,9 @@ def define(cls, spec: CalcJobProcessSpec) -> None: # type: ignore[override]
spec.exit_code(
131, 'ERROR_SCHEDULER_INVALID_ACCOUNT', invalidates_cache=True, message='The specified account is invalid.'
)
spec.exit_code(
140, 'ERROR_SCHEDULER_NODE_FAILURE', invalidates_cache=True, message='The node running the job failed.'
)
spec.exit_code(150, 'STOPPED_BY_MONITOR', invalidates_cache=True, message='{message}')

@classproperty
Expand Down
3 changes: 3 additions & 0 deletions aiida/schedulers/plugins/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,9 @@ def parse_output(self, detailed_job_info=None, stdout=None, stderr=None):
if data['State'] == 'TIMEOUT':
return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME

if data['State'] == 'NODE_FAIL':
return CalcJob.exit_codes.ERROR_SCHEDULER_NODE_FAILURE

# Alternatively, if the ``detailed_job_info`` is not defined or hasn't already determined an error, try to match
# known error messages from the output written to the ``stderr`` descriptor.
if stderr is not None:
Expand Down
14 changes: 14 additions & 0 deletions tests/schedulers/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,20 @@ def test_parse_out_of_memory():
assert exit_code == CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY # pylint: disable=no-member


def test_parse_node_failure():
    """Check that a ``NODE_FAIL`` job state is parsed into ``ERROR_SCHEDULER_NODE_FAILURE``.

    The mocked ``stdout`` consists of two ``|``-separated lines in which every field is empty, except for a single
    field on the second line that is set to ``NODE_FAIL``.
    """
    # The second line of the literal starts in the first column on purpose: any indentation would become part of
    # the string that is handed to the parser.
    mock_stdout = """||||||||||||||||||||||||||||||||||||||||||||||||||
|||||||||||||||||||||||||||||||||||||||||NODE_FAIL|||||||||"""  # yapf: disable
    job_info = {'retval': 0, 'stderr': '', 'stdout': mock_stdout}

    parsed = SlurmScheduler().parse_output(job_info, '', '')
    expected = CalcJob.exit_codes.ERROR_SCHEDULER_NODE_FAILURE  # pylint: disable=no-member
    assert parsed == expected


@pytest.mark.parametrize('detailed_job_info, expected', [
('string', TypeError), # Not a dictionary
({'stderr': ''}, ValueError), # Key `stdout` missing
Expand Down

0 comments on commit 65c1b32

Please sign in to comment.