Skip to content

Commit

Permalink
use log class in list_down_nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
adammoody committed Jul 23, 2021
1 parent 11e3415 commit 63c742c
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 24 deletions.
34 changes: 13 additions & 21 deletions scripts/pyfe/pyfe/list_down_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from time import time
from pyfe import scr_const, scr_common
from pyfe.list_dir import list_dir
from pyfe.scr_common import runproc, pipeproc, scr_prefix
from pyfe.scr_common import runproc, pipeproc

# mark any nodes specified on the command line
def remove_argument_excluded_nodes(nodes=[],nodeset_down=[]):
Expand All @@ -18,7 +18,7 @@ def remove_argument_excluded_nodes(nodes=[],nodeset_down=[]):

# The main scr_list_down_nodes method.
# this method takes an scr_env; its contained resource manager determines which of the methods above to use
def list_down_nodes(reason=False, free=False, nodeset_down='', log_nodes=False, runtime_secs=None, nodeset=None, scr_env=None):
def list_down_nodes(reason=False, free=False, nodeset_down='', runtime_secs=None, nodeset=None, scr_env=None, log=None):
if scr_env is None or scr_env.resmgr is None or scr_env.param is None:
return 1
bindir = scr_const.X_BINDIR
Expand All @@ -40,14 +40,6 @@ def list_down_nodes(reason=False, free=False, nodeset_down='', log_nodes=False,
# get list of nodes from nodeset
nodes = scr_env.resmgr.expand_hosts(nodeset)

# get prefix directory
prefix = scr_env.get_prefix()

# get jobid
jobid = resourcemgr.getjobid()
#if jobid == 'defjobid': # job id could not be determined
# print('Could not determine the job id') # the only place this is used here is in the logging below

### In each of the scr_list_down_nodes.in files,
### these nodes are marked as unavailable and also removed from the list to log.
### There is no need to keep track of them in the unavailable dictionary.
Expand All @@ -66,23 +58,23 @@ def list_down_nodes(reason=False, free=False, nodeset_down='', log_nodes=False,
# TODO: read exclude list from a file, as well?

# print any failed nodes to stdout and exit with non-zero
if len(unavailable)>0:
if len(unavailable) > 0:
# log each newly failed node, along with the reason
if log_nodes:
# scr_common.log calls the external program: scr_log_event
# the method will also accept a dictionary (instead of a string)
# for the event_note argument, this moves the loop closer to the runproc call
scr_common.log(bindir=bindir, prefix=prefix, jobid=jobid, event_type='NODE_FAIL', event_note=unavailable, event_start=start_time, event_secs=runtime_secs)
if log:
for node in unavailable:
note = node + ": " + unavailable[node]
log.event('NODE_FAIL', note=note, start=start_time, secs=runtime_secs)

# now output info to the user
ret=''
if reason:
# list each node and the reason each is down
reasons = []
for node in unavailable:
ret += node+': '+unavailable[node]+'\n'
ret = ret[:-1] ### take off the final trailing newline (?)
reasons.append(node + ': ' + unavailable[node])
return "\n".join(reasons)
else:
# simply print the list of down nodes in range syntax
ret = scr_env.resmgr.compress_hosts(list(unavailable))
return ret
return scr_env.resmgr.compress_hosts(list(unavailable))

# otherwise, don't print anything and exit with 0
return 0
12 changes: 11 additions & 1 deletion scripts/pyfe/pyfe/scr_list_down_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pyfe.scr_environment import SCR_Env
from pyfe.scr_param import SCR_Param
from pyfe.resmgr import AutoResourceManager
from pyfe.cli import SCRLog

if __name__=='__main__':
parser = argparse.ArgumentParser(add_help=False, argument_default=argparse.SUPPRESS, prog='scr_list_down_nodes')
Expand All @@ -31,5 +32,14 @@
scr_env = SCR_Env()
scr_env.resmgr = AutoResourceManager()
scr_env.param = SCR_Param()
ret = list_down_nodes(reason=args['reason'], free=args['free'], nodeset_down=args['down'], log_nodes=args['log'], runtime_secs=args['secs'], nodeset=args['[nodeset]'], scr_env=scr_env)

# create log object if asked to log down nodes
log = None
if args['log']:
prefix = scr_env.get_prefix()
jobid = scr_env.resmgr.getjobid()
user = scr_env.get_user()
log = SCRLog(prefix, jobid, user=user)

ret = list_down_nodes(reason=args['reason'], free=args['free'], nodeset_down=args['down'], runtime_secs=args['secs'], nodeset=args['[nodeset]'], scr_env=scr_env, log=log)
print(str(ret),end='') ### remove trailing newlines?
4 changes: 2 additions & 2 deletions scripts/pyfe/pyfe/scr_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def scr_run(launcher='',launcher_args=[],run_cmd='',restart_cmd='',restart_args=
down_nodes = ''
if down_nodes!='':
# print the reason for the down nodes, and log them
list_down_nodes(reason=True, free=free_flag, nodeset_down=down_nodes, log_nodes=True, runtime_secs='0', scr_env=scr_env)
list_down_nodes(reason=True, free=free_flag, nodeset_down=down_nodes, runtime_secs='0', scr_env=scr_env, log=log)

# if this is the first run and we hit down nodes right off the bat, make a record of them
if attempts==0:
Expand Down Expand Up @@ -290,7 +290,7 @@ def scr_run(launcher='',launcher_args=[],run_cmd='',restart_cmd='',restart_args=
run_secs = end_secs - start_secs

# check for and log any down nodes
list_down_nodes(reason=True, nodeset_down=keep_down, log_nodes=True, runtime_secs=str(run_secs), scr_env=scr_env)
list_down_nodes(reason=True, nodeset_down=keep_down, runtime_secs=str(run_secs), scr_env=scr_env, log=log)

# log stats on the latest run attempt
log.event('RUN_END', note='run=' + str(attempts), start=str(end_secs), secs=str(run_secs))
Expand Down

0 comments on commit 63c742c

Please sign in to comment.