SlurmScheduler: Make detailed job info fields dynamic (#6270)

The `SlurmScheduler` plugin uses SLURM's `sacct` command to retrieve detailed information for a given job. The command allows specifying which fields should be projected using the `--format` option. Previously, the fields to use were hardcoded by the plugin. This approach made the plugin susceptible to breaking whenever the supported fields changed. This happened, for example, with SLURM v23.02, where the `Reserved` field was renamed to `Planned` (see the release notes: https://github.com/SchedMD/slurm/blob/863ead570d450e25022f04cc5c9cfb379aa8ae4d/RELEASE_NOTES#L181C1-L182C40 ). This caused the `sacct` command to return an error, and the detailed job info would not be retrieved. To make the plugin more robust with respect to these kinds of changes, the fields are no longer hardcoded; instead, they are determined dynamically by calling `sacct --helpformat` in a subshell. This prints a table of the fields supported by the SLURM version that is being interacted with. Using `tr`, this table is transformed into a single comma-delimited list, which is the format expected by `--format`. There is also the `--long` option, which would provide a large number of fields; however, it is not complete, and more than 50 fields are not included. So although it would be a more robust solution, we would be losing a lot of information, some of which could be important for later debugging and analysis, so we stick with the `sacct --helpformat` approach.
aiidateam · Feb 9, 2024 · 4f9774a · 4f9774a
1 parent 9524cda
commit 4f9774a
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 75 deletions.
diff --git a/src/aiida/schedulers/plugins/slurm.py b/src/aiida/schedulers/plugins/slurm.py
@@ -150,59 +150,6 @@ class SlurmScheduler(Scheduler):
         'can_query_by_user': False,
     }
 
-    _detailed_job_info_fields = [
-        'AllocCPUS',
-        'Account',
-        'AssocID',
-        'AveCPU',
-        'AvePages',
-        'AveRSS',
-        'AveVMSize',
-        'Cluster',
-        'Comment',
-        'CPUTime',
-        'CPUTimeRAW',
-        'DerivedExitCode',
-        'Elapsed',
-        'Eligible',
-        'End',
-        'ExitCode',
-        'GID',
-        'Group',
-        'JobID',
-        'JobName',
-        'MaxRSS',
-        'MaxRSSNode',
-        'MaxRSSTask',
-        'MaxVMSize',
-        'MaxVMSizeNode',
-        'MaxVMSizeTask',
-        'MinCPU',
-        'MinCPUNode',
-        'MinCPUTask',
-        'NCPUS',
-        'NNodes',
-        'NodeList',
-        'NTasks',
-        'Priority',
-        'Partition',
-        'QOSRAW',
-        'ReqCPUS',
-        'Reserved',
-        'ResvCPU',
-        'ResvCPURAW',
-        'Start',
-        'State',
-        'Submit',
-        'Suspended',
-        'SystemCPU',
-        'Timelimit',
-        'TotalCPU',
-        'UID',
-        'User',
-        'UserCPU',
-    ]
-
     # The class to be used for the job resource.
     _job_resource_class = SlurmJobResource
 
@@ -288,8 +235,7 @@ def _get_detailed_job_info_command(self, job_id):
         --parsable split the fields with a pipe (|), adding a pipe also at
         the end.
         """
-        fields = ','.join(self._detailed_job_info_fields)
-        return f'sacct --format={fields} --parsable --jobs={job_id}'
+        return f"sacct --format=$(sacct --helpformat | tr -s '\n' ' ' | tr ' ' ',') --parsable --jobs={job_id}"
 
     def _get_submit_script_header(self, job_tmpl):
         """Return the submit script header, using the parameters from the
@@ -770,24 +716,19 @@ def parse_output(self, detailed_job_info=None, stdout=None, stderr=None):
             # the entire job. Any additional lines correspond to those values for any additional tasks that were run.
             lines = detailed_stdout.splitlines()
 
-            try:
-                master = lines[1]
-            except IndexError:
+            if len(lines) < 2:
                 raise ValueError('the `detailed_job_info.stdout` contained less than two lines.')
 
-            attributes = master.split('|')
-
-            # Pop the last element if it is empty. This happens if the `master` string just finishes with a pipe
-            if not attributes[-1]:
-                attributes.pop()
+            fields = lines[0].split('|')
+            attributes = lines[1].split('|')
 
-            if len(self._detailed_job_info_fields) != len(attributes):
+            if len(fields) != len(attributes):
                 raise ValueError(
-                    'second line in `detailed_job_info.stdout` differs in length with the `_detailed_job_info_fields '
-                    'attribute of the scheduler.'
+                    'first and second line in `detailed_job_info.stdout` differ in length: '
+                    f'{len(fields)} vs {len(attributes)}'
                 )
 
-            data = dict(zip(self._detailed_job_info_fields, attributes))
+            data = dict(zip(fields, attributes))
 
             if data['State'] == 'OUT_OF_MEMORY':
                 return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY

diff --git a/tests/schedulers/test_slurm.py b/tests/schedulers/test_slurm.py
@@ -415,8 +415,7 @@ def test_parse_out_of_memory():
     detailed_job_info = {
         'retval': 0,
         'stderr': '',
-        'stdout': """||||||||||||||||||||||||||||||||||||||||||||||||||
-        |||||||||||||||||||||||||||||||||||||||||OUT_OF_MEMORY|||||||||""",
+        'stdout': 'Account|State|\nroot|OUT_OF_MEMORY|\n',
     }
 
     exit_code = scheduler.parse_output(detailed_job_info, stdout, stderr)
@@ -429,8 +428,7 @@ def test_parse_node_failure():
     detailed_job_info = {
         'retval': 0,
         'stderr': '',
-        'stdout': """||||||||||||||||||||||||||||||||||||||||||||||||||
-        |||||||||||||||||||||||||||||||||||||||||NODE_FAIL|||||||||""",
+        'stdout': 'Account|State|\nroot|NODE_FAIL|\n',
     }
 
     exit_code = scheduler.parse_output(detailed_job_info, '', '')
@@ -444,7 +442,10 @@ def test_parse_node_failure():
         ({'stderr': ''}, ValueError),  # Key `stdout` missing
         ({'stdout': None}, TypeError),  # `stdout` is not a string
         ({'stdout': ''}, ValueError),  # `stdout` does not contain at least two lines
-        ({'stdout': 'Header\nValue'}, ValueError),  # `stdout` second line contains too few elements separated by pipe
+        (
+            {'stdout': 'Account|State|\nValue|'},
+            ValueError,
+        ),  # `stdout` second line contains too few elements separated by pipe
     ],
 )
 def test_parse_output_invalid(detailed_job_info, expected):
@@ -457,10 +458,8 @@ def test_parse_output_invalid(detailed_job_info, expected):
 
 def test_parse_output_valid():
     """Test `SlurmScheduler.parse_output` for valid arguments."""
-    number_of_fields = len(SlurmScheduler._detailed_job_info_fields)
-    detailed_job_info = {'stdout': f"Header\n{'|' * number_of_fields}"}
+    detailed_job_info = {'stdout': 'State|Account|\n||\n'}
     scheduler = SlurmScheduler()
-
     assert scheduler.parse_output(detailed_job_info, '', '') is None