Skip to content

Commit

Permalink
per feedback, move distributed xgboost recovery to jenkins
Browse files Browse the repository at this point in the history
  • Loading branch information
chenqin committed Sep 13, 2019
1 parent 0a38691 commit b9f5b7d
Show file tree
Hide file tree
Showing 9 changed files with 52 additions and 38 deletions.
2 changes: 0 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ env:
- TASK=java_test
# cmake test
# - TASK=cmake_test
# xgb worker fail recovery
- TASK=xgb_recovery

# dependent apt packages
addons:
Expand Down
41 changes: 40 additions & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ pipeline {
script {
parallel ([
'build-cpu': { BuildCPU() },
'build-cpu-rabit-mock': { BuildCPUMock() },
'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
Expand All @@ -76,6 +77,7 @@ pipeline {
'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
'test-cpp-rabit': {TestCppRabit()},
'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
Expand Down Expand Up @@ -178,13 +180,36 @@ def BuildCPU() {
def docker_args = "--build-arg CMAKE_VERSION=3.12"
sh """
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address" \
-DCMAKE_BUILD_TYPE=Debug -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/
-DCMAKE_BUILD_TYPE=Debug -DRABIT_MOCK=${args.mock} -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} build/testxgboost
"""
deleteDir()
}
}

/**
 * Build XGBoost on a CPU node with the rabit mock engine enabled.
 *
 * Steps:
 *   1. Build via tests/ci_build/build_mock_cmake.sh (configures with -DRABIT_MOCK=ON)
 *      and run build/testxgboost.
 *   2. Stash the resulting binary so the 'test-cpp-rabit' stage can unstash it.
 *   3. Rebuild with AddressSanitizer (still with the rabit mock on) and run
 *      build/testxgboost again.
 *
 * Takes no arguments; relies on the script-level `dockerRun` helper.
 */
def BuildCPUMock() {
  node('linux && cpu') {
    unstash name: 'srcs'
    echo "Build CPU with rabit mock"
    def container_type = "cpu"
    def docker_binary = "docker"
    sh """
    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
    ${dockerRun} ${container_type} ${docker_binary} build/testxgboost
    """
    // TestCppRabit() unstashes 'xgboost_rabit_cpp_tests'; publish it here, before the
    // sanitizer rebuild below replaces the build outputs.
    // NOTE(review): assumes the CMake build writes the `xgboost` CLI binary to the
    // repository root (runxgb.sh refers to it as ../../xgboost) -- confirm the path.
    echo 'Stashing rabit mock xgboost binary...'
    stash name: 'xgboost_rabit_cpp_tests', includes: 'xgboost'
    // Sanitizer test
    def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer -e ASAN_OPTIONS=symbolize=1 --cap-add SYS_PTRACE'"
    def docker_args = "--build-arg CMAKE_VERSION=3.12"
    // This function has no `args` parameter, so the mock flag is hard-wired to ON
    // (the previous `${args.mock}` interpolation would fail at runtime).
    sh """
    ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address" \
      -DCMAKE_BUILD_TYPE=Debug -DRABIT_MOCK=ON -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/
    ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} build/testxgboost
    """
    deleteDir()
  }
}


def BuildCUDA(args) {
node('linux && cpu') {
unstash name: 'srcs'
Expand Down Expand Up @@ -279,6 +304,20 @@ def TestPythonGPU(args) {
}
}

/**
 * Run the distributed fault-recovery test against the rabit-mock build.
 *
 * Unstashes the artifacts published under 'xgboost_rabit_cpp_tests' plus the
 * sources, then runs tests/ci_build/runxgb.sh with the approx config inside
 * the CPU container.
 */
def TestCppRabit() {
  // `nodeReq` is only assigned inside TestCppGPU(); in this function it is either
  // undefined or holds a GPU label set by a parallel branch. This test uses the
  // CPU container, so request a CPU node explicitly.
  node('linux && cpu') {
    unstash name: 'xgboost_rabit_cpp_tests'
    unstash name: 'srcs'
    echo "Test C++, rabit mock on"
    def container_type = "cpu"
    def docker_binary = "docker"
    sh """
    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh tests/ci_build/approx.conf.in
    """
    deleteDir()
  }
}

def TestCppGPU(args) {
nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
node(nodeReq) {
Expand Down
3 changes: 2 additions & 1 deletion src/common/hist_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,8 @@ void DenseCuts::Init
}
CHECK_EQ(summary_array.size(), in_sketchs->size());
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
// TODO(chenqin): after remove is_bootstrap, hist break rabit assumption
// TODO(chenqin): rabit failure recovery assumes no one-time bootstrap call after loadcheckpoint;
// we need to move this allreduce before the loadcheckpoint call in the future
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
p_cuts_->min_vals_.resize(sketchs.size());

Expand Down
File renamed without changes.
9 changes: 9 additions & 0 deletions tests/ci_build/build_mock_cmake.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Configure and compile XGBoost with the rabit mock engine (-DRABIT_MOCK=ON)
# in a fresh ./build directory. Any failing step aborts the script.
set -o errexit

build_dir=build

# Start from an empty build tree every time.
rm -rf "${build_dir}"
mkdir "${build_dir}"
cd "${build_dir}"

# Verbose makefiles so the full compiler command lines show up in the CI log.
cmake .. -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON
make clean
# One make job per available processor.
make -j"$(nproc)"

# Return to the directory the script was started from.
cd ..
2 changes: 1 addition & 1 deletion tests/cli/runxgb.sh → tests/ci_build/runxgb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ submit="python3 ../../dmlc-core/tracker/dmlc-submit"
# instrument worker failure with mock=xxxx
# check if host recovered from expected iteration
echo "====== 1. Fault recovery distributed test ======"
$submit --cluster=local --num-workers=10 --local-num-attempt=10 ../../xgboost $1 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
# `exec` is a shell builtin, not a function: `exec(...)` is a bash syntax error.
# Replace this shell with the tracker process so its exit status becomes the script's.
# $submit is deliberately unquoted so it splits into interpreter + script path.
exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 ../../xgboost $1 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
11 changes: 0 additions & 11 deletions tests/cli/hist.conf.in

This file was deleted.

12 changes: 0 additions & 12 deletions tests/travis/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,3 @@ if [ ${TASK} == "cmake_test" ]; then
cd ..
rm -rf build
fi

if [ ${TASK} == "xgb_recovery" ]; then
set -e
mkdir build
cd build
cmake -DRABIT_MOCK=ON -DUSE_OPENMP=OFF ..
make
cd ..
rm -rf build
cd tests/cli
./runxgb.sh approx.conf.in
fi
10 changes: 0 additions & 10 deletions tests/travis/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,3 @@ fi
if [ ${TASK} == "cmake_test" ] && [ ${TRAVIS_OS_NAME} == "osx" ]; then
sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3"
fi

if [ ${TASK} == "xgb_recovery" ]; then
if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi
if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then
brew install python3;
sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3"
fi
sudo pip3 install cpplint pylint urllib3 numpy
sudo pip3 install websocket-client kubernetes
fi

0 comments on commit b9f5b7d

Please sign in to comment.