Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Some lightly modified patches for better integration/interaction with
Grid Engine from Dave Love <d.love@liverpool.ac.uk>, current author
and maintainer of the open source Son of Grid Engine project at the
University of Liverpool (see https://arc.liv.ac.uk/trac/SGE for
project info).

This significantly improves compatibility with SoGE, UGE, OGS, and
other derived works based on the original Sun Grid Engine.
  • Loading branch information
Michael Jennings committed Oct 2, 2014
1 parent 030a750 commit 46899ea
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 48 deletions.
88 changes: 42 additions & 46 deletions nhc
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ function die() {
if [[ -n "$NHC_DETACHED" ]]; then
echo "$RET $*" > $RESULTFILE
elif [[ "$NHC_RM" == "sge" ]]; then
echo "begin"
echo "$HOSTNAME:healthy:false"
echo "$HOSTNAME:diagnosis:NHC: $*"
echo "end"
echo "begin" >$NHC_FD_OUT
echo "$HOSTNAME:healthy:false" >$NHC_FD_OUT
echo "$HOSTNAME:diagnosis:NHC: $*" >$NHC_FD_OUT
echo "end" >$NHC_FD_OUT
return 77
elif [[ -n "$LOGFILE" ]]; then
oecho "ERROR: $NAME: Health check failed: $*"
Expand All @@ -51,7 +51,7 @@ function die() {
return 0
fi
kill_watchdog
[[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
[[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
exit $RET
}

Expand Down Expand Up @@ -91,11 +91,7 @@ function oecho() {

if [[ "$SILENT" == "0" ]]; then
[[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
if [[ -n "$LOGFILE" ]]; then
echo "$PREFIX$@" >&3
else
echo "$PREFIX$@"
fi
echo "$PREFIX$@" >&$NHC_FD_OUT
fi
}

Expand All @@ -105,11 +101,7 @@ function eecho() {

if [[ "$SILENT" == "0" ]]; then
[[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
if [[ -n "$LOGFILE" ]]; then
echo "$PREFIX$@" >&4
else
echo "$PREFIX$@"
fi
echo "$PREFIX$@" >&$NHC_FD_ERR
fi
}

Expand All @@ -119,11 +111,7 @@ function vecho() {

if [[ "$VERBOSE" == "1" ]]; then
[[ $TS -ne 0 ]] && PREFIX="[$SECONDS] - "
if [[ -n "$LOGFILE" ]]; then
echo "$PREFIX$@" >&3
else
echo "$PREFIX$@"
fi
echo "$PREFIX$@" >&$NHC_FD_OUT
fi
}

Expand Down Expand Up @@ -171,8 +159,10 @@ function nhcmain_init_env() {
WATCHDOG_PID=0
FAIL_CNT=0
FORCE_SETSID=0
NHC_FD_OUT=1
NHC_FD_ERR=2
export PATH SYSCONFIGDIR LIBEXECDIR HOSTNAME HOSTNAME_S RET LOGGER_TEXT
export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID
export NHC_PID NHC_START_TS WATCHDOG_PID FAIL_CNT FORCE_SETSID NHC_FD_OUT NHC_FD_ERR

# Users may override this in /etc/sysconfig/nhc.
NAME=${0/#*\/}
Expand Down Expand Up @@ -294,12 +284,15 @@ function nhcmain_finalize_env() {
DETACHED_MODE=${DETACHED_MODE:-0}
DETACHED_MODE_FAIL_NODATA=${DETACHED_MODE_FAIL_NODATA:-0}
TIMEOUT=${TIMEOUT:-10}
MAX_SYS_UID=${MAX_SYS_UID:-99}
NHC_CHECK_ALL=${NHC_CHECK_ALL:-0}
NHC_CHECK_FORKED=${NHC_CHECK_FORKED:-0}
FORCE_SETSID=${FORCE_SETSID:-0}
export NHC_SID=0

# Set from system defaults if present.
[[ -z "$MAX_SYS_UID" ]] && nhc_common_get_max_sys_uid
MAX_SYS_UID=${MAX_SYS_UID:-99}

# Check for session leader.
kill -s 0 -- -$NHC_PID >/dev/null 2>&1
if [[ $? -eq 0 ]]; then
Expand Down Expand Up @@ -369,26 +362,27 @@ function nhcmain_find_rm() {
if [[ -d /var/spool/torque ]]; then
NHC_RM="pbs"
return 0
elif [[ -n "$SGE_ROOT" && -x "$SGE_ROOT/util/arch" ]]; then
# SGE binaries typically won't be on the path defined above in the
# load sensor environment, but SGE_ROOT will be there.
NHC_RM="sge"
fi

# Search PATH for commands
if type -a -p -f -P pbsnodes >&/dev/null ; then
NHC_RM="pbs"
return 0
elif type -a -p -f -P scontrol >&/dev/null ; then
NHC_RM="slurm"
return 0
elif type -a -p -f -P badmin >&/dev/null ; then
NHC_RM="lsf"
return 0
elif type -a -p -f -P qselect >&/dev/null ; then
NHC_RM="sge"
return 0
fi

IFS=':'
DIRLIST=( $PATH )
IFS=$' \t\n'
for DIR in "${DIRLIST[@]}" ; do
if [[ -x "$DIR/pbsnodes" ]]; then
NHC_RM="pbs"
return 0
elif [[ -x "$DIR/scontrol" ]]; then
NHC_RM="slurm"
return 0
elif [[ -x "$DIR/badmin" ]]; then
NHC_RM="lsf"
return 0
elif [[ -x "$DIR/qselect" ]]; then
NHC_RM="sge"
return 0
fi
done
if [[ -z "$NHC_RM" ]]; then
log "Unable to detect resource manager."
return 1
Expand All @@ -407,6 +401,8 @@ function nhcmain_redirect_output() {
exit 1
else
dbg "Output redirected per LOGFILE variable $LOGFILE"
NHC_FD_OUT=3
NHC_FD_ERR=4
fi
fi
}
Expand Down Expand Up @@ -506,7 +502,7 @@ function nhcmain_detach() {
nhcmain_redirect_output
ELAPSED=$((SECONDS-NHC_START_TS))
vlog "Node Health Check detached parent completed successfully (${ELAPSED}s)."
[[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
[[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
exit 0
}

Expand Down Expand Up @@ -565,14 +561,14 @@ function nhcmain_finish() {
ELAPSED=$((SECONDS-NHC_START_TS))
vlog "Node Health Check completed successfully (${ELAPSED}s)."
if [[ "$NHC_RM" == "sge" ]]; then
echo "begin"
echo "$HOSTNAME:healthy:true"
echo "$HOSTNAME:diagnosis:HEALTHY"
echo "end"
echo "begin" >$NHC_FD_OUT
echo "$HOSTNAME:healthy:true" >$NHC_FD_OUT
echo "$HOSTNAME:diagnosis:HEALTHY" >$NHC_FD_OUT
echo "end" >$NHC_FD_OUT
return 0
fi
kill_watchdog
[[ -n "$LOGFILE" ]] && exec 1>&3- 2>&4-
[[ $NHC_FD_OUT -eq 3 ]] && exec 1>&3- 2>&4-
exit 0
}

Expand Down
15 changes: 15 additions & 0 deletions scripts/common.nhc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#

PASSWD_DATA_SRC="${PASSWD_DATA_SRC:-/etc/passwd}"
LOGIN_DEFS_SRC="${LOGIN_DEFS_SRC:-/etc/login.defs}"

RANGE_MATCH_REGEXP1='^[-a-zA-Z0-9_]+[0-9]+[-\.a-zA-Z0-9]*$'
RANGE_MATCH_REGEXP2='^([-a-zA-Z0-9_]+)\[([0-9]+)\-([0-9]+)\]([-\.a-zA-Z0-9]*)$'
Expand Down Expand Up @@ -538,3 +539,17 @@ function nhc_cmd_with_timeout() {
#exec 2>&3 3>&-
return $RET
}

# Find system definition for UID range.
#
# Scans the file named by $LOGIN_DEFS_SRC for the first line that starts with
# "UID_MIN", followed by whitespace and a decimal number, and stores that
# number in the global MAX_SYS_UID.
#
# Globals:   LOGIN_DEFS_SRC (read), MAX_SYS_UID (written on a match only)
# Arguments: none
# Returns:   status of the last command run; callers inspect MAX_SYS_UID.
#
# If the file does not exist, or contains no well-formed UID_MIN line,
# MAX_SYS_UID is left untouched so whatever value it already had still stands.
function nhc_common_get_max_sys_uid() {
    local LINE
    # Held in a variable so the pattern is always treated as a regexp, no
    # matter how the running bash version handles quoting on the right of =~.
    # Only the first number after the keyword is captured, so digits inside a
    # trailing comment (e.g. "UID_MIN 1000 # was 500") are not glued on.
    local UID_MIN_RE='^UID_MIN[[:space:]]+([0-9]+)'

    if [[ -e "$LOGIN_DEFS_SRC" ]]; then
        # -r keeps backslashes literal.  The "|| [[ -n ... ]]" clause makes
        # sure a final line without a terminating newline is still examined.
        # read (default IFS) strips leading/trailing whitespace from LINE.
        while read -r LINE || [[ -n "$LINE" ]]; do
            if [[ "$LINE" =~ $UID_MIN_RE ]]; then
                MAX_SYS_UID="${BASH_REMATCH[1]}"
                break
            fi
        done < "$LOGIN_DEFS_SRC"
    fi
}
7 changes: 6 additions & 1 deletion scripts/ww_ps.nhc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,12 @@ function nhc_ps_gather_data() {
elif [[ "$NHC_RM" == "slurm" ]]; then
RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bslurmstepd\b/}"
elif [[ "$NHC_RM" == "sge" ]]; then
RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bsge_execd\b/}"
# If you limit this to execd, you lose when it's been restarted,
# and the shepherd is detached. Even if execd is safe because of
# system uids, it can spawn mail commands as the job owner, at
# least. (The shepherd process name is normally
# sge_shepherd-<jobnum>, but maybe not if you change shepherd_cmd.)
RM_DAEMON_MATCH="${RM_DAEMON_MATCH:-/\bsge_(execd|shepherd)\b/}"
else
dbg "Unsupported RM detected in ${FUNCNAME}(): \"$NHC_RM\""
fi
Expand Down
19 changes: 18 additions & 1 deletion test/test_common.nhc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# $Id$
#

plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
plan $((11+5+8+5+4+6+8+7+9+7)) "common.nhc" && {
is "`type -t mcheck_regexp 2>&1`" 'function' 'mcheck_regexp() loaded properly'
is "`type -t mcheck_range 2>&1`" 'function' 'mcheck_regexp() loaded properly'
is "`type -t mcheck_glob 2>&1`" 'function' 'mcheck_glob() loaded properly'
Expand All @@ -13,6 +13,8 @@ plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
is "`type -t nhc_common_get_uid 2>&1`" 'function' 'nhc_common_get_uid() loaded properly'
is "`type -t nhc_common_parse_size 2>&1`" 'function' 'nhc_common_parse_size() loaded properly'
is "`type -t nhc_common_unparse_size 2>&1`" 'function' 'nhc_common_unparse_size() loaded properly'
is "`type -t nhc_common_get_unix_time 2>&1`" 'function' 'nhc_common_get_unix_time() loaded properly'
is "`type -t nhc_common_get_max_sys_uid 2>&1`" 'function' 'nhc_common_get_max_sys_uid() loaded properly'

mcheck "This is a test." '/test/'
is $? 0 "Basic regexp match via mcheck()"
Expand Down Expand Up @@ -156,4 +158,19 @@ plan $((9+5+8+5+4+6+8+7+9)) "common.nhc" && {
nhc_common_unparse_count $OSIZE NSIZE
is "$NSIZE" 999 "nhc_common_unparse_count(): $OSIZE -> 999"

LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t\t\t 500") nhc_common_get_max_sys_uid
is "$MAX_SYS_UID" 500 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 500"
LOGIN_DEFS_SRC=<(echo -e "UID_MIN 999") nhc_common_get_max_sys_uid
is "$MAX_SYS_UID" 999 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 999"
LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t0\t") nhc_common_get_max_sys_uid
is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): \$MAX_SYS_UID <- 0"
LOGIN_DEFS_SRC=<(echo -e "GID_MIN\t\t\t 1234") nhc_common_get_max_sys_uid
is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): Bad syntax"
LOGIN_DEFS_SRC=<(echo -e "2345") nhc_common_get_max_sys_uid
is "$MAX_SYS_UID" 0 "nhc_common_get_max_sys_uid(): Ignore plain number"
LOGIN_DEFS_SRC=<(echo -e "stuff\nGID_MIN 1\nGID_MAX 4\nUID_MIN 3\nUID_MAX 7\nblah blah blah\n") nhc_common_get_max_sys_uid
is "$MAX_SYS_UID" 3 "nhc_common_get_max_sys_uid(): Multiline input"
LOGIN_DEFS_SRC=<(echo -e "UID_MIN\t\t\t 500") nhc_common_get_max_sys_uid
is "$MAX_SYS_UID" 500 "nhc_common_get_max_sys_uid(): Reset default"

} ; unplan

0 comments on commit 46899ea

Please sign in to comment.