Skip to content

Commit

Permalink
SlurmScheduler: Make detailed job info fields dynamic (#6270)
Browse files Browse the repository at this point in the history
The `SlurmScheduler` plugin uses SLURM's `sacct` command to retrieve
detailed information for a given job. The command allows specifying
which fields should be included in the output using the `--format` option.
Previously, the fields to use were hardcoded in the plugin.

This approach made the plugin susceptible to breaking whenever the supported
fields changed. This happened, for example, in SLURM v23.02, where
the `Reserved` field was renamed to `Planned`; see this change log:

https://github.com/SchedMD/slurm/blob/863ead570d450e25022f04cc5c9cfb379aa8ae4d/RELEASE_NOTES#L181C1-L182C40

This caused the `sacct` command to return an error and the detailed job
info would not be retrieved.

To make the plugin more robust with respect to these kinds of changes,
the fields are no longer hardcoded, but they are determined dynamically
by calling `sacct --helpformat` in a subshell. This prints a table of
the fields supported by the SLURM version in use. Using
`tr`, this table is transformed into a single comma-delimited list,
which is the format expected by `--format`.

There is also the `--long` option, which would provide a large number of
fields; however, it is not complete, and more than 50 fields are not
included. So although it would be a more robust solution, we would be
losing a lot of information, some of which could be important for later
debugging and analysis. We therefore stick with explicitly specifying the
fields through `--format`.
  • Loading branch information
sphuber authored Feb 9, 2024
1 parent 9524cda commit 4f9774a
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 75 deletions.
75 changes: 8 additions & 67 deletions src/aiida/schedulers/plugins/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,59 +150,6 @@ class SlurmScheduler(Scheduler):
'can_query_by_user': False,
}

_detailed_job_info_fields = [
'AllocCPUS',
'Account',
'AssocID',
'AveCPU',
'AvePages',
'AveRSS',
'AveVMSize',
'Cluster',
'Comment',
'CPUTime',
'CPUTimeRAW',
'DerivedExitCode',
'Elapsed',
'Eligible',
'End',
'ExitCode',
'GID',
'Group',
'JobID',
'JobName',
'MaxRSS',
'MaxRSSNode',
'MaxRSSTask',
'MaxVMSize',
'MaxVMSizeNode',
'MaxVMSizeTask',
'MinCPU',
'MinCPUNode',
'MinCPUTask',
'NCPUS',
'NNodes',
'NodeList',
'NTasks',
'Priority',
'Partition',
'QOSRAW',
'ReqCPUS',
'Reserved',
'ResvCPU',
'ResvCPURAW',
'Start',
'State',
'Submit',
'Suspended',
'SystemCPU',
'Timelimit',
'TotalCPU',
'UID',
'User',
'UserCPU',
]

# The class to be used for the job resource.
_job_resource_class = SlurmJobResource

Expand Down Expand Up @@ -288,8 +235,7 @@ def _get_detailed_job_info_command(self, job_id):
--parsable split the fields with a pipe (|), adding a pipe also at
the end.
"""
fields = ','.join(self._detailed_job_info_fields)
return f'sacct --format={fields} --parsable --jobs={job_id}'
return f"sacct --format=$(sacct --helpformat | tr -s '\n' ' ' | tr ' ' ',') --parsable --jobs={job_id}"

def _get_submit_script_header(self, job_tmpl):
"""Return the submit script header, using the parameters from the
Expand Down Expand Up @@ -770,24 +716,19 @@ def parse_output(self, detailed_job_info=None, stdout=None, stderr=None):
# the entire job. Any additional lines correspond to those values for any additional tasks that were run.
lines = detailed_stdout.splitlines()

try:
master = lines[1]
except IndexError:
if len(lines) < 2:
raise ValueError('the `detailed_job_info.stdout` contained less than two lines.')

attributes = master.split('|')

# Pop the last element if it is empty. This happens if the `master` string just finishes with a pipe
if not attributes[-1]:
attributes.pop()
fields = lines[0].split('|')
attributes = lines[1].split('|')

if len(self._detailed_job_info_fields) != len(attributes):
if len(fields) != len(attributes):
raise ValueError(
'second line in `detailed_job_info.stdout` differs in length with the `_detailed_job_info_fields '
'attribute of the scheduler.'
'first and second line in `detailed_job_info.stdout` differ in length: '
f'{len(fields)} vs {len(attributes)}'
)

data = dict(zip(self._detailed_job_info_fields, attributes))
data = dict(zip(fields, attributes))

if data['State'] == 'OUT_OF_MEMORY':
return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY
Expand Down
15 changes: 7 additions & 8 deletions tests/schedulers/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,8 +415,7 @@ def test_parse_out_of_memory():
detailed_job_info = {
'retval': 0,
'stderr': '',
'stdout': """||||||||||||||||||||||||||||||||||||||||||||||||||
|||||||||||||||||||||||||||||||||||||||||OUT_OF_MEMORY|||||||||""",
'stdout': 'Account|State|\nroot|OUT_OF_MEMORY|\n',
}

exit_code = scheduler.parse_output(detailed_job_info, stdout, stderr)
Expand All @@ -429,8 +428,7 @@ def test_parse_node_failure():
detailed_job_info = {
'retval': 0,
'stderr': '',
'stdout': """||||||||||||||||||||||||||||||||||||||||||||||||||
|||||||||||||||||||||||||||||||||||||||||NODE_FAIL|||||||||""",
'stdout': 'Account|State|\nroot|NODE_FAIL|\n',
}

exit_code = scheduler.parse_output(detailed_job_info, '', '')
Expand All @@ -444,7 +442,10 @@ def test_parse_node_failure():
({'stderr': ''}, ValueError), # Key `stdout` missing
({'stdout': None}, TypeError), # `stdout` is not a string
({'stdout': ''}, ValueError), # `stdout` does not contain at least two lines
({'stdout': 'Header\nValue'}, ValueError), # `stdout` second line contains too few elements separated by pipe
(
{'stdout': 'Account|State|\nValue|'},
ValueError,
), # `stdout` second line contains too few elements separated by pipe
],
)
def test_parse_output_invalid(detailed_job_info, expected):
Expand All @@ -457,10 +458,8 @@ def test_parse_output_invalid(detailed_job_info, expected):

def test_parse_output_valid():
"""Test `SlurmScheduler.parse_output` for valid arguments."""
number_of_fields = len(SlurmScheduler._detailed_job_info_fields)
detailed_job_info = {'stdout': f"Header\n{'|' * number_of_fields}"}
detailed_job_info = {'stdout': 'State|Account|\n||\n'}
scheduler = SlurmScheduler()

assert scheduler.parse_output(detailed_job_info, '', '') is None


Expand Down

0 comments on commit 4f9774a

Please sign in to comment.