SlurmScheduler: use stderr in parse_output for OOW and OOM

The `CalcJob` implementation was changed to always call the `parse_output` method of the scheduler, even if `detailed_info` is `None`. This means that we can now attempt to parse errors from the `stderr` as well. Here we add simple regexes to try to detect out-of-memory (OOM) and out-of-walltime (OOW) errors. They return the exact same exit codes as if the errors had been detected from the `detailed_info`. Note that since this is done with regexes, this opens the door to false positives. It is not known how likely these are to occur.
aiidateam · Mar 21, 2022 · 85b78c3 · 85b78c3
1 parent 720c5e1
commit 85b78c3
Showing 1 changed file with 50 additions and 32 deletions.
diff --git a/aiida/schedulers/plugins/slurm.py b/aiida/schedulers/plugins/slurm.py
@@ -707,53 +707,71 @@ def _parse_kill_output(self, retval, stdout, stderr):
 
         return True
 
-    def parse_output(self, detailed_job_info, stdout, stderr):  # pylint: disable=inconsistent-return-statements
+    def parse_output(self, detailed_job_info=None, stdout=None, stderr=None):
         """Parse the output of the scheduler.
 
         :param detailed_job_info: dictionary with the output returned by the `Scheduler.get_detailed_job_info` command.
             This should contain the keys `retval`, `stdout` and `stderr` corresponding to the return value, stdout and
             stderr returned by the accounting command executed for a specific job id.
-        :param stdout: string with the output written by the scheduler to stdout
-        :param stderr: string with the output written by the scheduler to stderr
-        :return: None or an instance of `aiida.engine.processes.exit_code.ExitCode`
-        :raises TypeError or ValueError: if the passed arguments have incorrect type or value
+        :param stdout: string with the output written by the scheduler to stdout.
+        :param stderr: string with the output written by the scheduler to stderr.
+        :return: None or an instance of :class:`aiida.engine.processes.exit_code.ExitCode`.
+        :raises TypeError or ValueError: if the passed arguments have incorrect type or value.
         """
         from aiida.engine import CalcJob
 
-        type_check(detailed_job_info, dict)
+        if detailed_job_info is not None:
 
-        try:
-            detailed_stdout = detailed_job_info['stdout']
-        except KeyError:
-            raise ValueError('the `detailed_job_info` does not contain the required key `stdout`.')
+            type_check(detailed_job_info, dict)
 
-        type_check(detailed_stdout, str)
+            try:
+                detailed_stdout = detailed_job_info['stdout']
+            except KeyError:
+                raise ValueError('the `detailed_job_info` does not contain the required key `stdout`.')
 
-        # The format of the detailed job info should be a multiline string, where the first line is the header, with
-        # the labels of the projected attributes. The following line should be the values of those attributes for the
-        # entire job. Any additional lines correspond to those values for any additional tasks that were run.
-        lines = detailed_stdout.splitlines()
+            type_check(detailed_stdout, str)
 
-        try:
-            master = lines[1]
-        except IndexError:
-            raise ValueError('the `detailed_job_info.stdout` contained less than two lines.')
+            # The format of the detailed job info should be a multiline string, where the first line is the header, with
+            # the labels of the projected attributes. The following line should be the values of those attributes for
+            # the entire job. Any additional lines correspond to those values for any additional tasks that were run.
+            lines = detailed_stdout.splitlines()
 
-        attributes = master.split('|')
+            try:
+                master = lines[1]
+            except IndexError:
+                raise ValueError('the `detailed_job_info.stdout` contained less than two lines.')
 
-        # Pop the last element if it is empty. This happens if the `master` string just finishes with a pipe
-        if not attributes[-1]:
-            attributes.pop()
+            attributes = master.split('|')
 
-        if len(self._detailed_job_info_fields) != len(attributes):
-            raise ValueError(
-                'second line in `detailed_job_info.stdout` differs in length with schedulers `_detailed_job_info_fields'
-            )
+            # Pop the last element if it is empty. This happens if the `master` string just finishes with a pipe
+            if not attributes[-1]:
+                attributes.pop()
+
+            if len(self._detailed_job_info_fields) != len(attributes):
+                raise ValueError(
+                    'second line in `detailed_job_info.stdout` differs in length with the `_detailed_job_info_fields '
+                    'attribute of the scheduler.'
+                )
+
+            data = dict(zip(self._detailed_job_info_fields, attributes))
+
+            if data['State'] == 'OUT_OF_MEMORY':
+                return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY
+
+            if data['State'] == 'TIMEOUT':
+                return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME
+
+        # Alternatively, if the ``detailed_job_info`` is not defined or hasn't already determined an error, try to match
+        # known error messages from the output written to the ``stderr`` descriptor.
+        if stderr is not None:
+
+            type_check(stderr, str)
+            stderr_lower = stderr.lower()
 
-        data = dict(zip(self._detailed_job_info_fields, attributes))
+            if re.match(r'.*exceeded.*memory limit.*', stderr_lower):
+                return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY
 
-        if data['State'] == 'OUT_OF_MEMORY':
-            return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY  # pylint: disable=no-member
+            if re.match(r'.*cancelled at.*due to time limit.*', stderr_lower):
+                return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_MEMORY
 
-        if data['State'] == 'TIMEOUT':
-            return CalcJob.exit_codes.ERROR_SCHEDULER_OUT_OF_WALLTIME  # pylint: disable=no-member
+        return None