Skip to content

Commit

Permalink
SlurmScheduler: always raise for non-zero exit code (#4332)
Browse files Browse the repository at this point in the history
The `SlurmScheduler` intentionally ignored non-zero exit codes returned
by SLURM when asking the status for a number of job ids. This was put in
place because SLURM will return a non-zero exit code not only in case of
actual errors in attempting to retrieve the status of the requested jobs
but also when specifying just a single job that is no longer active.

Since the latter is not really an error, yet is difficult to distinguish
from a "real" error, the exit code was ignored. However, this could
lead to the plugin sometimes incorrectly ignoring a real problem and
assuming a job was completed when it was in fact still active.

The solution is to exploit a quirk of SLURM: when asked for the status of
more than one job, it will never return a non-zero exit code, even when
one or more of those jobs have finished. That is why, when asking for the
status of a single job, we duplicate the job id, such that even when it is
no longer active, the exit status will still be zero.
  • Loading branch information
ltalirz authored Aug 31, 2020
1 parent 0345e61 commit 44fe2a7
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 16 deletions.
44 changes: 29 additions & 15 deletions aiida/schedulers/plugins/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,22 @@ def _get_joblist_command(self, jobs=None, user=None):
if not isinstance(jobs, (tuple, list)):
raise TypeError("If provided, the 'jobs' variable must be a string or a list of strings")
joblist = jobs

# Trick: When asking for a single job, append the same job once more.
# This helps provide a reliable way of knowing whether the squeue command failed (if its exit code is
# non-zero, _parse_joblist_output assumes that an error has occurred and raises an exception).
# When asking for a single job, squeue also returns a non-zero exit code if the corresponding job is
# no longer in the queue (stderr: "slurm_load_jobs error: Invalid job id specified"), which typically
# happens once in the lifetime of an AiiDA job.
# However, when providing two or more jobids via `squeue --jobs=123,234`, squeue stops caring whether
# the jobs are still in the queue and returns exit code zero irrespectively (allowing AiiDA to rely on the
# exit code for detection of real issues).
# Duplicating job ids has no other effect on the output.
# Verified on slurm versions 17.11.2, 19.05.3-2 and 20.02.2.
# See also https://github.com/aiidateam/aiida-core/issues/4326
if len(joblist) == 1:
joblist += [joblist[0]]

command.append('--jobs={}'.format(','.join(joblist)))

comm = ' '.join(command)
Expand Down Expand Up @@ -482,21 +498,19 @@ def _parse_joblist_output(self, retval, stdout, stderr):
# pylint: disable=too-many-branches,too-many-statements
num_fields = len(self.fields)

# I don't raise because if I pass a list of jobs,
# I get a non-zero status
# if one of the job is not in the list anymore
# retval should be zero
# if retval != 0:
# self.logger.warning("Error in _parse_joblist_output: retval={}; "
# "stdout={}; stderr={}".format(retval, stdout, stderr))

# issue a warning if there is any stderr output and
# there is no line containing "Invalid job id specified", that happens
# when I ask for specific calculations, and they are all finished
if stderr.strip() and 'Invalid job id specified' not in stderr:
self.logger.warning("Warning in _parse_joblist_output, non-empty stderr='{}'".format(stderr.strip()))
if retval != 0:
raise SchedulerError('Error during squeue parsing (_parse_joblist_output function)')
# See discussion in _get_joblist_command on how we ensure that AiiDA can expect exit code 0 here.
if retval != 0:
raise SchedulerError(
"""squeue returned exit code {} (_parse_joblist_output function)
stdout='{}'
stderr='{}'""".format(retval, stdout.strip(), stderr.strip())
)
if stderr.strip():
self.logger.warning(
"squeue returned exit code 0 (_parse_joblist_output function) but non-empty stderr='{}'".format(
stderr.strip()
)
)

# will contain raw data parsed from output: only lines with the
# separator, and already split in fields
Expand Down
37 changes: 36 additions & 1 deletion tests/schedulers/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import datetime

from aiida.schedulers.plugins.slurm import SlurmScheduler, JobState
from aiida.schedulers import SchedulerError

# pylint: disable=line-too-long
# job_id, state_raw, annotation, executing_host, username, number_nodes, number_cpus, allocated_machines, partition, time_limit, time_used, dispatch_time, job_name, submission_time
Expand Down Expand Up @@ -42,7 +43,7 @@ class TestParserSqueue(unittest.TestCase):

def test_parse_common_joblist_output(self):
"""
Test whether _parse_joblist can parse the qstat -f output
Test whether _parse_joblist_output can parse the squeue output
"""
scheduler = SlurmScheduler()

Expand Down Expand Up @@ -98,6 +99,19 @@ def test_parse_common_joblist_output(self):
#
# self.assertTrue( j.num_machines==num_machines )
# self.assertTrue( j.num_mpiprocs==num_mpiprocs )
def test_parse_failed_squeue_output(self):
    """
    Check how `_parse_joblist_output` responds to a failing `squeue` call.

    A non-zero exit code has to raise a `SchedulerError`, whereas an exit code
    of zero accompanied by output on stderr only has to be logged as a warning.
    """
    # pylint: disable=protected-access
    slurm = SlurmScheduler()

    # Case 1: squeue reports failure through its exit code -> exception.
    with self.assertRaises(SchedulerError):
        slurm._parse_joblist_output(1, TEXT_SQUEUE_TO_TEST, '')

    # Case 2: squeue succeeds but wrote to stderr -> warning on the scheduler logger.
    with self.assertLogs(slurm.logger, 'WARNING'):
        slurm._parse_joblist_output(0, TEXT_SQUEUE_TO_TEST, 'error message')


class TestTimes(unittest.TestCase):
Expand Down Expand Up @@ -332,3 +346,24 @@ def test_submit_script_with_num_cores_per_machine_and_mpiproc2(self): # pylint:
job_tmpl.job_resource = scheduler.create_job_resource(
num_machines=1, num_mpiprocs_per_machine=1, num_cores_per_machine=24, num_cores_per_mpiproc=23
)


class TestJoblistCommand(unittest.TestCase):
    """
    Checks on the `squeue` command line assembled by the SLURM scheduler plugin.
    """

    def test_joblist_single(self):
        """A lone job id has to show up twice in the generated command."""
        # pylint: disable=protected-access
        squeue_command = SlurmScheduler()._get_joblist_command(jobs=['123'])

        self.assertIn('123,123', squeue_command)

    def test_joblist_multi(self):
        """Several job ids have to be passed through without any duplication."""
        # pylint: disable=protected-access
        squeue_command = SlurmScheduler()._get_joblist_command(jobs=['123', '456'])

        self.assertIn('123,456', squeue_command)
        self.assertNotIn('456,456', squeue_command)

0 comments on commit 44fe2a7

Please sign in to comment.