Skip to content

Commit

Permalink
Merge pull request #2691 from matthewrmshin/handle-poll-when-pbs-not-…
Browse files Browse the repository at this point in the history
…avail

pbs: handle poll if PBS client cannot connect
  • Loading branch information
hjoliver authored Jun 26, 2018
2 parents 662c697 + fa1c7b6 commit a36d9a8
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 13 deletions.
1 change: 1 addition & 0 deletions lib/cylc/batch_sys_handlers/pbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class PBSHandler(object):
# N.B. The "qstat JOB_ID" command returns 1 if JOB_ID is no longer in the
# system, so there is no need to filter its output.
POLL_CMD = "qstat"
POLL_CANT_CONNECT_ERR = "Connection refused"
REC_ID_FROM_SUBMIT_OUT = re.compile(r"""\A\s*(?P<id>\S+)\s*\Z""")
SUBMIT_CMD_TMPL = "qsub '%(job)s'"

Expand Down
33 changes: 20 additions & 13 deletions lib/cylc/batch_sys_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,6 @@
Each batch system handler class should instantiate with no argument, and may
have the following constants and methods:
batch_sys.filter_poll_output(out, job_id) => boolean
* If this method is available, it will be called after the batch system's
poll command is called and returns zero. The method should read the
output to see if job_id is still alive in the batch system, and return
True if so.
batch_sys.filter_poll_many_output(out) => job_ids
* Called after the batch system's poll many command. The method should read
the output and return a list of job IDs that are still in the batch
Expand Down Expand Up @@ -69,6 +63,19 @@
beyond just running a system or shell command. See also
"batch_sys.SUBMIT_CMD".
batch_sys.KILL_CMD_TMPL
* A Python string template for getting the batch system command to remove
and terminate a job ID. The command is formed using the logic:
batch_sys.KILL_CMD_TMPL % {"job_id": job_id}
batch_sys.POLL_CANT_CONNECT_ERR
* A string containing an error message. If this is defined, when a poll
command returns a non-zero return code and its STDERR contains this
string, then the poll result will not be trusted, because it is assumed
that the batch system is currently unavailable. Jobs submitted to the
batch system will be assumed OK until we are able to connect to the batch
system again.
batch_sys.SHOULD_KILL_PROC_GROUP
* A boolean to indicate whether it is necessary to kill a job by sending
a signal to its Unix process group.
Expand All @@ -77,11 +84,6 @@
* A boolean to indicate whether it is necessary to poll a job by its PID
as well as the job ID.
batch_sys.KILL_CMD_TMPL
* A Python string template for getting the batch system command to remove
and terminate a job ID. The command is formed using the logic:
batch_sys.KILL_CMD_TMPL % {"job_id": job_id}
batch_sys.REC_ID_FROM_SUBMIT_ERR
batch_sys.REC_ID_FROM_SUBMIT_OUT
* A regular expression (compiled) to extract the job "id" from the standard
Expand Down Expand Up @@ -537,10 +539,15 @@ def _jobs_poll_batch_sys(self, job_log_root, batch_sys_name, my_ctx_list):
exc.filename = cmd[0]
sys.stderr.write(str(exc) + "\n")
return
proc.wait()
ret_code = proc.wait()
out, err = proc.communicate()
sys.stderr.write(err)
if hasattr(batch_sys, "filter_poll_many_output"):
if (ret_code and hasattr(batch_sys, "POLL_CANT_CONNECT_ERR") and
batch_sys.POLL_CANT_CONNECT_ERR in err):
# Poll command failed because it cannot connect to batch system
# Assume jobs are still healthy until the batch system is back.
bad_ids[:] = []
elif hasattr(batch_sys, "filter_poll_many_output"):
# Allow custom filter
for id_ in batch_sys.filter_poll_many_output(out):
try:
Expand Down
Empty file modified tests/cylc-poll/16-execution-time-limit.t
100644 → 100755
Empty file.
56 changes: 56 additions & 0 deletions tests/cylc-poll/17-pbs-cant-connect.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
# THIS FILE IS PART OF THE CYLC SUITE ENGINE.
# Copyright (C) 2008-2018 NIWA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#-------------------------------------------------------------------------------
# Test poll PBS connection refused
#
# Runs a one-task suite and then checks the suite log for two things: the
# "Connection refused" poll error, and a poll result that still reports the
# task as started (i.e. the failed poll did not mark the job as dead).
# NOTE(review): skip_all, set_test_number, install_suite, run_ok,
# suite_run_ok, contains_ok, purge_suite and purge_suite_remote are not
# defined here; presumably they come from the sourced "test_header".
CYLC_TEST_IS_GENERIC=false
. "$(dirname "$0")/test_header"

# Derive the batch system name from the test name: drop a leading 3-character
# "NN-" style prefix, then the trailing "-cant-connect" suffix.
BATCH_SYS_NAME="${TEST_NAME_BASE##??-}"
BATCH_SYS_NAME="${BATCH_SYS_NAME%-cant-connect}"
# Look up the test host and site directives for this batch system in the
# global configuration; errors from the lookup are discarded.
RC_PREF="[test battery][batch systems][${BATCH_SYS_NAME}]"
export CYLC_TEST_BATCH_TASK_HOST=$( \
cylc get-global-config -i "${RC_PREF}host" 2>'/dev/null')
export CYLC_TEST_BATCH_SITE_DIRECTIVES=$( \
cylc get-global-config -i "${RC_PREF}[directives]" 2>'/dev/null')
# Without a configured host (empty or the literal "None") the test is skipped.
if [[ -z "${CYLC_TEST_BATCH_TASK_HOST}" || "${CYLC_TEST_BATCH_TASK_HOST}" == None ]]
then
skip_all "\"[test battery][batch systems][${BATCH_SYS_NAME}]host\" not defined"
fi

# Presumably one test each for: validate, run, and the log content check.
set_test_number 3
install_suite "${TEST_NAME_BASE}" "${TEST_NAME_BASE}"
# For a remote job host, create the suite run directory there and copy the
# local "lib" directory into it -- presumably so that the custom batch system
# handler is also available on that host (TODO confirm).
if [[ "${CYLC_TEST_BATCH_TASK_HOST}" != 'localhost' ]]; then
ssh -n "${CYLC_TEST_BATCH_TASK_HOST}" "mkdir -p 'cylc-run/${SUITE_NAME}/'"
rsync -a 'lib' "${CYLC_TEST_BATCH_TASK_HOST}:cylc-run/${SUITE_NAME}/"
fi

run_ok "${TEST_NAME_BASE}-validate" cylc validate "${SUITE_NAME}"
suite_run_ok "${TEST_NAME_BASE}-run" \
cylc run --reference-test --debug --no-detach "${SUITE_NAME}"
# Reduce the suite log to just the two messages of interest (timestamps and
# any other surrounding text are stripped), then check both are present.
sed -n 's/^.*\(\[jobs-poll err\] Connection refused\).*$/\1/p;
s/^.*\(INFO - \[t1.1\] -(current:running)(polled) started\).*$/\1/p' \
"${SUITE_RUN_DIR}/log/suite/log" >'sed-log.out'
contains_ok 'sed-log.out' <<'__LOG__'
[jobs-poll err] Connection refused
INFO - [t1.1] -(current:running)(polled) started
__LOG__

# Clean up: the remote copy first (if any), then the local suite.
if [[ "${CYLC_TEST_BATCH_TASK_HOST}" != 'localhost' ]]; then
purge_suite_remote "${CYLC_TEST_BATCH_TASK_HOST}" "${SUITE_NAME}"
fi
purge_suite "${SUITE_NAME}"
exit
3 changes: 3 additions & 0 deletions tests/cylc-poll/17-pbs-cant-connect/lib/python/badqstat
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Stand-in poll command that always fails: it writes the message
# "Connection refused" to STDERR and exits with status 1.
printf '%s\n' 'Connection refused' 1>&2
exit 1
33 changes: 33 additions & 0 deletions tests/cylc-poll/17-pbs-cant-connect/lib/python/my_pbs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env python2

# THIS FILE IS PART OF THE CYLC SUITE ENGINE.
# Copyright (C) 2008-2018 NIWA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os


from cylc.batch_sys_handlers.pbs import PBSHandler


class MyPBSHandler(PBSHandler):
    """PBS handler whose poll command can never reach the batch system.

    For testing poll command connection refused: everything is inherited from
    PBSHandler except the command used to poll jobs.
    """

    @staticmethod
    def get_poll_many_cmd(_):
        """Return the poll command as an argument list.

        The job IDs argument is ignored. The command is the "badqstat" script
        that lives in the same directory as this module, which always prints
        PBSHandler.POLL_CANT_CONNECT_ERR to STDERR.
        """
        # Return a list rather than a bare string: the command is indexed
        # (cmd[0]) to name the executable when it cannot be launched, and
        # indexing a string would yield only its first character.
        return [os.path.join(os.path.dirname(__file__), 'badqstat')]


BATCH_SYS_HANDLER = MyPBSHandler()
3 changes: 3 additions & 0 deletions tests/cylc-poll/17-pbs-cant-connect/reference.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
2015-12-17T16:00:01+13 INFO - Initial point: 1
2015-12-17T16:00:01+13 INFO - Final point: 1
2015-12-17T16:00:01+13 INFO - [t1.1] -triggered off []
26 changes: 26 additions & 0 deletions tests/cylc-poll/17-pbs-cant-connect/suite.rc
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!Jinja2
# Single-task suite for the "PBS cannot connect" poll test. The task is
# submitted through the test-only "my_pbs" batch system handler.
[cylc]
[[reference test]]
required run mode = live
live mode suite timeout = PT5M

[scheduling]
[[dependencies]]
graph = t1

[runtime]
[[t1]]
# Presumably long enough for the job to be polled while it is still running
# (see the PT20S execution polling interval below).
script = sleep 60
# Run on the test batch host when one is provided through the environment.
{% if "CYLC_TEST_BATCH_TASK_HOST" in environ and environ["CYLC_TEST_BATCH_TASK_HOST"] %}
[[[remote]]]
host={{environ["CYLC_TEST_BATCH_TASK_HOST"]}}
{% endif %}
[[[job]]]
# Custom handler from this suite's lib/python/my_pbs.py.
batch system = my_pbs
execution time limit = PT2M
execution polling intervals = PT20S
[[[directives]]]
{% if "CYLC_TEST_BATCH_SITE_DIRECTIVES" in environ and
environ["CYLC_TEST_BATCH_SITE_DIRECTIVES"] %}
{{environ["CYLC_TEST_BATCH_SITE_DIRECTIVES"]}}
{% endif %}

0 comments on commit a36d9a8

Please sign in to comment.