Skip to content

Commit

Permalink
SlurmScheduler: use stderr in parse_stdout for OOW and OOM
Browse files Browse the repository at this point in the history
The `CalcJob` implementation was changed to always call the
`parse_output` method of the scheduler, even if `detailed_info` is
`None`. This means that now we can attempt to parse errors from the
`stderr` as well.

Here we add simple regexes to try and detect OOM and OOW errors. They
return the exact same exit code as if they would have been detected from
the `detailed_info`. Note that since this is done with regexes, this
opens the door to false positives. It is not known how likely these are
to occur.
  • Loading branch information
sphuber committed Mar 21, 2022
1 parent 720c5e1 commit 85b78c3
Showing 1 changed file with 50 additions and 32 deletions.
82 changes: 50 additions & 32 deletions aiida/schedulers/plugins/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,53 +707,71 @@ def _parse_kill_output(self, retval, stdout, stderr):

return True

def parse_output(self, detailed_job_info=None, stdout=None, stderr=None):
    """Parse the output of the scheduler and try to detect known failure modes.

    Two sources are inspected, in order:

    1. The ``detailed_job_info`` (output of the accounting command), whose ``State`` field is compared against
       ``OUT_OF_MEMORY`` and ``TIMEOUT``.
    2. The ``stderr`` written by the scheduler, which is searched (case-insensitively) for known out-of-memory and
       out-of-walltime messages. Since this relies on regexes, false positives are possible.

    The ``stdout`` argument is accepted for interface compatibility but is currently not inspected.

    :param detailed_job_info: dictionary with the output returned by the `Scheduler.get_detailed_job_info` command.
        This should contain the keys `retval`, `stdout` and `stderr` corresponding to the return value, stdout and
        stderr returned by the accounting command executed for a specific job id.
    :param stdout: string with the output written by the scheduler to stdout.
    :param stderr: string with the output written by the scheduler to stderr.
    :return: None or an instance of :class:`aiida.engine.processes.exit_code.ExitCode`.
    :raises TypeError or ValueError: if the passed arguments have incorrect type or value.
    """
    from aiida.engine import CalcJob

    if detailed_job_info is not None:

        type_check(detailed_job_info, dict)

        try:
            detailed_stdout = detailed_job_info['stdout']
        except KeyError as exception:
            raise ValueError(
                'the `detailed_job_info` does not contain the required key `stdout`.'
            ) from exception

        type_check(detailed_stdout, str)

        # The format of the detailed job info should be a multiline string, where the first line is the header, with
        # the labels of the projected attributes. The following line should be the values of those attributes for
        # the entire job. Any additional lines correspond to those values for any additional tasks that were run.
        lines = detailed_stdout.splitlines()

        try:
            master = lines[1]
        except IndexError as exception:
            raise ValueError('the `detailed_job_info.stdout` contained less than two lines.') from exception

        attributes = master.split('|')

        # Pop the last element if it is empty. This happens if the `master` string just finishes with a pipe
        if not attributes[-1]:
            attributes.pop()

        if len(self._detailed_job_info_fields) != len(attributes):
            raise ValueError(
                'second line in `detailed_job_info.stdout` differs in length with the `_detailed_job_info_fields` '
                'attribute of the scheduler.'
            )

        data = dict(zip(self._detailed_job_info_fields, attributes))

        if data['State'] == 'OUT_OF_MEMORY':
            return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY

        if data['State'] == 'TIMEOUT':
            return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME

    # Alternatively, if the ``detailed_job_info`` is not defined or hasn't already determined an error, try to match
    # known error messages from the output written to the ``stderr`` descriptor.
    if stderr is not None:

        type_check(stderr, str)
        stderr_lower = stderr.lower()

        # Use ``re.search`` rather than ``re.match``: the latter only matches at the very start of the string and,
        # since ``.`` does not match newlines, would only ever inspect the first line of a multi-line ``stderr``.
        if re.search(r'exceeded.*memory limit', stderr_lower):
            return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY

        # A job cancelled due to the time limit ran out of walltime, not out of memory.
        if re.search(r'cancelled at.*due to time limit', stderr_lower):
            return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME

    return None

0 comments on commit 85b78c3

Please sign in to comment.