From 47b3b121b344c2e9bf709ba200432fb1e253c579 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Wed, 6 Mar 2019 09:53:32 -0800 Subject: [PATCH 1/7] [rabit harden] fix rabit model recovery unit test failure --- tracker/dmlc_tracker/local.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tracker/dmlc_tracker/local.py b/tracker/dmlc_tracker/local.py index dff7c1797f..6d73f9559f 100644 --- a/tracker/dmlc_tracker/local.py +++ b/tracker/dmlc_tracker/local.py @@ -8,12 +8,13 @@ import logging from threading import Thread from . import tracker +import pdb def exec_cmd(cmd, role, taskid, pass_env): """Execute the command line command.""" if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt': cmd[0] = './' + cmd[0] - cmd = ' '.join(cmd) + cmdline = ' '.join(cmd) env = os.environ.copy() for k, v in pass_env.items(): env[k] = str(v) @@ -21,25 +22,39 @@ def exec_cmd(cmd, role, taskid, pass_env): env['DMLC_TASK_ID'] = str(taskid) env['DMLC_ROLE'] = role env['DMLC_JOB_CLUSTER'] = 'local' - num_retry = env.get('DMLC_NUM_ATTEMPT', 0) + #overwrite default num of retry with commandline value + for parm in cmd: + if parm.startswith('DMLC_NUM_ATTEMPT'): + num_retry = int(parm.split('=')[1]) + logging.debug('num of retry %d',num_retry) + while True: if os.name == 'nt': - ret = subprocess.call(cmd, shell=True, env=env) + ret = subprocess.call(cmdline, shell=True, env=env) else: - ret = subprocess.call(cmd, shell=True, executable='bash', env=env) + ret = subprocess.call(cmdline, shell=True, executable='bash', env=env) if ret == 0: logging.debug('Thread %d exit with 0', taskid) return else: num_retry -= 1 + newcmd = [] if num_retry >= 0: + # failure trail increase by 1 and restart failed worker + for arg in cmd: + if arg.startswith('rabit_num_trial'): + val = arg.split('=')[1] + arg = arg.replace(val, str(int(val)+1)) + newcmd.append(arg) + cmdline = ' '.join(newcmd) + cmd = newcmd continue if os.name == 'nt': sys.exit(-1) else: - raise RuntimeError('Get nonzero return code=%d on %s %s' % (ret, cmd, env)) + raise RuntimeError('Get nonzero return code=%d on %s' % (ret, cmd)) def submit(args): From 74780eef48a9c8ba373b2c60454c2ac2de8e2b47 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 7 Mar 2019 08:52:44 -0800 Subject: [PATCH 2/7] fix cpplint isssue in python3 --- include/dmlc/concurrency.h | 3 ++- include/dmlc/memory.h | 2 ++ include/dmlc/optional.h | 2 +- include/dmlc/thread_group.h | 2 ++ 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/dmlc/concurrency.h b/include/dmlc/concurrency.h index 754cf5aa28..576f6389ce 100644 --- a/include/dmlc/concurrency.h +++ b/include/dmlc/concurrency.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "dmlc/base.h" @@ -24,7 +25,7 @@ namespace dmlc { class Spinlock { public: #ifdef _MSC_VER - Spinlock() { + Spinlock() { lock_.clear(); } #else diff --git a/include/dmlc/memory.h b/include/dmlc/memory.h index 3a2b9b0798..00801122e8 100644 --- a/include/dmlc/memory.h +++ b/include/dmlc/memory.h @@ -7,6 +7,8 @@ #define DMLC_MEMORY_H_ #include +#include +#include #include "./base.h" #include "./logging.h" #include "./thread_local.h" diff --git a/include/dmlc/optional.h b/include/dmlc/optional.h index dedbc74781..b02c5e967c 100644 --- a/include/dmlc/optional.h +++ b/include/dmlc/optional.h @@ -25,7 +25,7 @@ struct nullopt_t { explicit nullopt_t(int a) {} #else /*! \brief dummy constructor */ - constexpr nullopt_t(int a) {} + explicit nullopt_t(int a) {} #endif }; diff --git a/include/dmlc/thread_group.h b/include/dmlc/thread_group.h index 626142f302..3d2aa3bb64 100644 --- a/include/dmlc/thread_group.h +++ b/include/dmlc/thread_group.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include From eef04109dfd0009f87284538c0c56d672f2ac644 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 7 Mar 2019 10:47:55 -0800 Subject: [PATCH 3/7] per feedback cleanup --- include/dmlc/concurrency.h | 2 +- include/dmlc/optional.h | 2 +- tracker/dmlc_tracker/local.py | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/dmlc/concurrency.h b/include/dmlc/concurrency.h index 576f6389ce..15ff32f03d 100644 --- a/include/dmlc/concurrency.h +++ b/include/dmlc/concurrency.h @@ -25,7 +25,7 @@ namespace dmlc { class Spinlock { public: #ifdef _MSC_VER - Spinlock() { + Spinlock() { lock_.clear(); } #else diff --git a/include/dmlc/optional.h b/include/dmlc/optional.h index b02c5e967c..85da2b827a 100644 --- a/include/dmlc/optional.h +++ b/include/dmlc/optional.h @@ -25,7 +25,7 @@ struct nullopt_t { explicit nullopt_t(int a) {} #else /*! \brief dummy constructor */ - explicit nullopt_t(int a) {} + constexpr explicit nullopt_t(int a) {} #endif }; diff --git a/tracker/dmlc_tracker/local.py b/tracker/dmlc_tracker/local.py index 6d73f9559f..0fa00faeec 100644 --- a/tracker/dmlc_tracker/local.py +++ b/tracker/dmlc_tracker/local.py @@ -8,7 +8,6 @@ import logging from threading import Thread from . import tracker -import pdb def exec_cmd(cmd, role, taskid, pass_env): """Execute the command line command.""" @@ -25,9 +24,9 @@ def exec_cmd(cmd, role, taskid, pass_env): num_retry = env.get('DMLC_NUM_ATTEMPT', 0) #overwrite default num of retry with commandline value - for parm in cmd: - if parm.startswith('DMLC_NUM_ATTEMPT'): - num_retry = int(parm.split('=')[1]) + for param in cmd: + if param.startswith('DMLC_NUM_ATTEMPT'): + num_retry = int(param.split('=')[1]) logging.debug('num of retry %d',num_retry) while True: @@ -54,7 +53,7 @@ def exec_cmd(cmd, role, taskid, pass_env): if os.name == 'nt': sys.exit(-1) else: - raise RuntimeError('Get nonzero return code=%d on %s' % (ret, cmd)) + raise RuntimeError('Get nonzero return code=%d on %s %s' % (ret, cmd, env)) def submit(args): From 4c84bafd91e1071dbb6b30e8cec73160280a7d70 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 7 Mar 2019 13:35:28 -0800 Subject: [PATCH 4/7] make travis osx CXX setting consistent with cmakeetst --- scripts/travis/travis_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/travis/travis_script.sh b/scripts/travis/travis_script.sh index d9ddf48b1c..9100104e08 100755 --- a/scripts/travis/travis_script.sh +++ b/scripts/travis/travis_script.sh @@ -20,7 +20,7 @@ if [ ${TASK} == "unittest_gtest" ]; then make -f scripts/packages.mk gtest if [ ${TRAVIS_OS_NAME} != "osx" ]; then echo "USE_S3=1" >> config.mk - echo "export CXX = g++-4.8" >> config.mk + echo "export CXX = g++-7" >> config.mk else echo "USE_S3=0" >> config.mk echo "USE_OPENMP=0" >> config.mk From 1ae784bff7424ed4497b2a7a0d975a986b2f0abc Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 7 Mar 2019 14:03:01 -0800 Subject: [PATCH 5/7] Revert "make travis osx CXX setting consistent with cmakeetst" This reverts commit 4c84bafd91e1071dbb6b30e8cec73160280a7d70. --- scripts/travis/travis_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/travis/travis_script.sh b/scripts/travis/travis_script.sh index 9100104e08..d9ddf48b1c 100755 --- a/scripts/travis/travis_script.sh +++ b/scripts/travis/travis_script.sh @@ -20,7 +20,7 @@ if [ ${TASK} == "unittest_gtest" ]; then make -f scripts/packages.mk gtest if [ ${TRAVIS_OS_NAME} != "osx" ]; then echo "USE_S3=1" >> config.mk - echo "export CXX = g++-7" >> config.mk + echo "export CXX = g++-4.8" >> config.mk else echo "USE_S3=0" >> config.mk echo "USE_OPENMP=0" >> config.mk From 3c7f055a04ad8df89b1b0fd3085fa1f8efb45485 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 7 Mar 2019 20:21:08 -0800 Subject: [PATCH 6/7] remove rabit setting from dmlc-core --- tracker/dmlc_tracker/local.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tracker/dmlc_tracker/local.py b/tracker/dmlc_tracker/local.py index 0fa00faeec..868775e1f2 100644 --- a/tracker/dmlc_tracker/local.py +++ b/tracker/dmlc_tracker/local.py @@ -21,12 +21,14 @@ def exec_cmd(cmd, role, taskid, pass_env): env['DMLC_TASK_ID'] = str(taskid) env['DMLC_ROLE'] = role env['DMLC_JOB_CLUSTER'] = 'local' + # keep use DMLC_NUM_ATTEMPT for backward compatible reason num_retry = env.get('DMLC_NUM_ATTEMPT', 0) - #overwrite default num of retry with commandline value + # overwrite default num of retry with commandline value for param in cmd: - if param.startswith('DMLC_NUM_ATTEMPT'): + if param.startswith('DMLC_MAX_RETRY'): num_retry = int(param.split('=')[1]) + logging.debug('num of retry %d',num_retry) while True: @@ -43,7 +45,7 @@ def exec_cmd(cmd, role, taskid, pass_env): if num_retry >= 0: # failure trail increase by 1 and restart failed worker for arg in cmd: - if arg.startswith('rabit_num_trial'): + if arg.startswith('DMLC_NUM_ATTEMPT'): val = arg.split('=')[1] arg = arg.replace(val, str(int(val)+1)) newcmd.append(arg) From 7e7922004ddfec354b2f7a60611dfbf2cc3fadb8 Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 7 Mar 2019 21:54:12 -0800 Subject: [PATCH 7/7] per feedback --- tracker/dmlc_tracker/local.py | 26 ++++++++------------------ tracker/dmlc_tracker/opts.py | 2 ++ 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/tracker/dmlc_tracker/local.py b/tracker/dmlc_tracker/local.py index 4ecdcbf55a..6e4af12575 100644 --- a/tracker/dmlc_tracker/local.py +++ b/tracker/dmlc_tracker/local.py @@ -9,7 +9,7 @@ from threading import Thread from . import tracker -def exec_cmd(cmd, role, taskid, pass_env): +def exec_cmd(cmd, num_attempt, role, taskid, pass_env): """Execute the command line command.""" if cmd[0].find('/') == -1 and os.path.exists(cmd[0]) and os.name != 'nt': cmd[0] = './' + cmd[0] @@ -22,13 +22,9 @@ def exec_cmd(cmd, role, taskid, pass_env): env['DMLC_ROLE'] = role env['DMLC_JOB_CLUSTER'] = 'local' - # keep use DMLC_NUM_ATTEMPT for backward compatible reason - num_retry = env.get('DMLC_NUM_ATTEMPT', 0) - - # overwrite default num of retry with commandline value - for param in cmd: - if param.startswith('DMLC_MAX_RETRY'): - num_retry = int(param.split('=')[1]) + # backward compatibility + num_retry = env.get('DMLC_NUM_ATTEMPT', num_attempt) + num_trial = 0 logging.debug('num of retry %d',num_retry) @@ -41,17 +37,11 @@ def exec_cmd(cmd, role, taskid, pass_env): logging.debug('Thread %d exit with 0', taskid) return else: + num_trial += 1 num_retry -= 1 - newcmd = [] + if num_retry >= 0: - # failure trail increase by 1 and restart failed worker - for arg in cmd: - if arg.startswith('DMLC_NUM_ATTEMPT'): - val = arg.split('=')[1] - arg = arg.replace(val, str(int(val)+1)) - newcmd.append(arg) - cmdline = ' '.join(newcmd) - cmd = newcmd + cmdline = ' '.join(cmd + ['DMLC_NUM_ATTEMPT=' + str(num_trial)]) continue if os.name == 'nt': sys.exit(-1) @@ -78,7 +68,7 @@ def mthread_submit(nworker, nserver, envs): role = 'worker' else: role = 'server' - procs[i] = Thread(target=exec_cmd, args=(args.command, role, i, envs)) + procs[i] = Thread(target=exec_cmd, args=(args.command, args.local_num_attempt, role, i, envs)) procs[i].setDaemon(True) procs[i].start() diff --git a/tracker/dmlc_tracker/opts.py b/tracker/dmlc_tracker/opts.py index 443c92a179..d642b004fb 100644 --- a/tracker/dmlc_tracker/opts.py +++ b/tracker/dmlc_tracker/opts.py @@ -162,6 +162,8 @@ def get_opts(args=None): parser.add_argument('--kube-server-template', default=None, type=str, help=('Manifest template for servers. Used only in Kubernetes mode.' + 'Can be used to override defaults.')) + parser.add_argument('--local-num-attempt', default=0, type=int, + help=('Number of attempt local tracker can restart slave.')) (args, unknown) = parser.parse_known_args(args) args.command += unknown