Skip to content

Commit

Permalink
add support of xgb worker fail recovery for approx
Browse files Browse the repository at this point in the history
  • Loading branch information
chenqin committed Aug 28, 2019
1 parent 3dfa7e4 commit 6734f9a
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
url = https://github.com/dmlc/dmlc-core
[submodule "rabit"]
path = rabit
url = https://github.com/chenqin/rabit
url = https://github.com/dmlc/rabit
[submodule "cub"]
path = cub
url = https://github.com/NVlabs/cub
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ env:
- TASK=java_test
# cmake test
# - TASK=cmake_test
# xgb worker fail recovery
- TASK=xgb_recovery

# dependent apt packages
addons:
Expand Down
19 changes: 14 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
option(RABIT_MOCK "Build rabit with mock" OFF)
## CUDA
option(USE_CUDA "Build with GPU acceleration" OFF)
option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
Expand Down Expand Up @@ -93,11 +94,19 @@ if(MINGW OR R_LIB OR WIN32)
rabit/src/engine_empty.cc
rabit/src/c_api.cc)
else ()
set(RABIT_SOURCES
rabit/src/allreduce_base.cc
rabit/src/allreduce_robust.cc
rabit/src/engine.cc
rabit/src/c_api.cc)
# Assemble the rabit source list. Only the engine translation unit depends on
# RABIT_MOCK: engine_mock.cc when the option is ON, engine.cc otherwise.
# The resulting order matches the explicit per-branch lists this replaces.
set(RABIT_SOURCES
  rabit/src/allreduce_base.cc
  rabit/src/allreduce_robust.cc)
if(RABIT_MOCK)
  list(APPEND RABIT_SOURCES rabit/src/engine_mock.cc)
else()
  list(APPEND RABIT_SOURCES rabit/src/engine.cc)
endif(RABIT_MOCK)
list(APPEND RABIT_SOURCES rabit/src/c_api.cc)
endif (MINGW OR R_LIB OR WIN32)
add_library(rabit STATIC ${RABIT_SOURCES})
target_include_directories(rabit PRIVATE
Expand Down
2 changes: 1 addition & 1 deletion rabit
5 changes: 3 additions & 2 deletions src/learner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -338,9 +338,10 @@ class LearnerImpl : public Learner {
}
}
{
std::vector<std::string> saved_params{"predictor", "n_gpus", "gpu_id"};
std::copy(saved_configs_.begin(), saved_configs_.end(), std::back_inserter(saved_params));
// Write `predictor`, `n_gpus`, `gpu_id` parameters as extra attributes
for (const auto& key : std::vector<std::string>{
"predictor", "n_gpus", "gpu_id"}) {
for (const auto& key : saved_params) {
auto it = cfg_.find(key);
if (it != cfg_.end()) {
mparam.contain_extra_attrs = 1;
Expand Down
12 changes: 12 additions & 0 deletions tests/cli/approx.conf.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Originally an example in demo/regression/
# CLI configuration for the xgb worker fail-recovery test:
# tests/travis/run_test.sh copies this file to approx.conf and passes it to
# tests/cli/runxgb.sh as the config-file argument.
tree_method=approx
eta = 0.5
gamma = 1.0
seed = 0
min_child_weight = 0
max_depth = 5

# NOTE(review): the mock= arguments in runxgb.sh use 10 and 11 as their second
# field; presumably those are iteration numbers, which would be why num_round
# is 12 -- confirm before changing either side.
num_round = 12
# NOTE(review): save_period (100) is larger than num_round; presumably this
# keeps intermediate models from being written -- confirm.
save_period = 100
# Paths are relative to tests/cli, the directory the test is launched from.
data = "../../demo/data/agaricus.txt.train"
eval[test] = "../../demo/data/agaricus.txt.test"
11 changes: 11 additions & 0 deletions tests/cli/hist.conf.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# CLI configuration using tree_method=hist. Same layout as approx.conf.in,
# with a different tree_method, eta and num_round.
# NOTE(review): the xgb_recovery task in tests/travis/run_test.sh only runs
# approx.conf; this file is presumably intended for manual runs of
# runxgb.sh -- confirm.
tree_method=hist
eta = 0.1
gamma = 1.0
seed = 0
min_child_weight = 0
max_depth = 5

num_round = 11
# NOTE(review): save_period (100) is larger than num_round; presumably this
# keeps intermediate models from being written -- confirm.
save_period = 100
# Paths are relative to tests/cli.
data = "../../demo/data/agaricus.txt.train"
eval[test] = "../../demo/data/agaricus.txt.test"
13 changes: 13 additions & 0 deletions tests/cli/runxgb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#
# Fault-recovery test for distributed xgboost.
#
# Usage: ./runxgb.sh <xgboost-config-file>
#
# Must be run from tests/cli: the tracker script and the xgboost binary are
# referenced relative to the current directory (../../...).
#
# Prerequisites:
#   - run make in rabit/test to generate librabit_mock
#   - update config.mk and build xgboost using mock

# Without a config file the first mock= argument would silently take its
# place on the xgboost command line, so fail early with a clear message.
if [ "$#" -lt 1 ]; then
  echo "usage: $0 <xgboost-config-file>" >&2
  exit 1
fi

export DMLC_SUBMIT_CLUSTER=local

submit="python3 ../../dmlc-core/tracker/dmlc-submit"
# build xgboost with librabit mock
# define max worker retry with dmlc-core local num attempt
# instrument worker failure with mock=xxxx
# check if host recovered from expected iteration
echo "====== 1. Fault recovery distributed test ======"
# $submit is intentionally unquoted so it splits into interpreter + script;
# "$1" is quoted so a config path containing spaces or glob characters is
# passed through as a single argument.
$submit --cluster=local --num-workers=10 --local-num-attempt=10 \
  ../../xgboost "$1" \
  mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 \
  mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 \
  rabit_bootstrap_cache=1
13 changes: 13 additions & 0 deletions tests/travis/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,16 @@ if [ ${TASK} == "cmake_test" ]; then
cd ..
rm -rf build
fi

# xgb worker fail recovery: build xgboost with the RABIT_MOCK CMake option
# enabled, then run the CLI fault-recovery test with the approx config.
if [ ${TASK} == "xgb_recovery" ]; then
# Abort on the first failing command from here on.
set -e
mkdir build
cd build
# RABIT_MOCK=ON makes CMakeLists.txt compile rabit/src/engine_mock.cc instead
# of rabit/src/engine.cc; OpenMP is switched off for this build.
cmake -DRABIT_MOCK=ON -DUSE_OPENMP=OFF ..
make
cd ..
# NOTE(review): the build tree is removed before the test runs, and runxgb.sh
# invokes ../../xgboost, so the binary is presumably emitted in the repository
# root rather than in build/ -- confirm.
rm -rf build
cd tests/cli
# runxgb.sh takes the config file as its first argument.
cp approx.conf.in approx.conf
./runxgb.sh approx.conf
fi
10 changes: 10 additions & 0 deletions tests/travis/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,13 @@ fi
if [ ${TASK} == "cmake_test" ] && [ ${TRAVIS_OS_NAME} == "osx" ]; then
sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3"
fi

# Dependencies for the xgb_recovery task: python3/pip3 plus the Python
# packages installed below.
# NOTE(review): the pip packages are presumably needed by the dmlc-submit
# tracker that tests/cli/runxgb.sh launches -- confirm which are required.
if [ ${TASK} == "xgb_recovery" ]; then
  # -y answers the confirmation prompt so the install cannot stall or abort
  # in a non-interactive CI session.
  if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install -y python3-pip; fi
  if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then
    brew install python3;
    sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3"
  fi
  sudo pip3 install cpplint pylint urllib3 numpy
  sudo pip3 install websocket-client kubernetes
fi

0 comments on commit 6734f9a

Please sign in to comment.