From b9f5b7d28f0c8beff774c3241075ad68a97ec2fa Mon Sep 17 00:00:00 2001 From: Chen Qin Date: Thu, 12 Sep 2019 19:19:12 -0700 Subject: [PATCH] per feedback, move distributed xgboost recovery to jenkins --- .travis.yml | 2 -- Jenkinsfile | 41 +++++++++++++++++++++++++- src/common/hist_util.cc | 3 +- tests/{cli => ci_build}/approx.conf.in | 0 tests/ci_build/build_mock_cmake.sh | 9 ++++++ tests/{cli => ci_build}/runxgb.sh | 2 +- tests/cli/hist.conf.in | 11 ------- tests/travis/run_test.sh | 12 -------- tests/travis/setup.sh | 10 ------- 9 files changed, 52 insertions(+), 38 deletions(-) rename tests/{cli => ci_build}/approx.conf.in (100%) create mode 100644 tests/ci_build/build_mock_cmake.sh rename tests/{cli => ci_build}/runxgb.sh (65%) delete mode 100644 tests/cli/hist.conf.in diff --git a/.travis.yml b/.travis.yml index 5425363cd989..513bff0b8ae5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,8 +16,6 @@ env: - TASK=java_test # cmake test # - TASK=cmake_test - # xgb worker fail recovery - - TASK=xgb_recovery # dependent apt packages addons: diff --git a/Jenkinsfile b/Jenkinsfile index 2ea9863725a0..adb4a83222d1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -56,6 +56,7 @@ pipeline { script { parallel ([ 'build-cpu': { BuildCPU() }, + 'build-cpu-rabit-mock': { BuildCPUMock() }, 'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') }, 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, @@ -76,6 +77,7 @@ pipeline { 'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') }, 'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') }, 'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) }, + 'test-cpp-rabit': {TestCppRabit()}, 'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') }, 'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) }, 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') }, @@ -178,13 +180,36 @@ def 
BuildCPU() { def docker_args = "--build-arg CMAKE_VERSION=3.12" sh """ ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address" \ - -DCMAKE_BUILD_TYPE=Debug -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ + -DCMAKE_BUILD_TYPE=Debug -DRABIT_MOCK=OFF -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} build/testxgboost """ deleteDir() } } +def BuildCPUMock() { + node('linux && cpu') { + unstash name: 'srcs' + echo "Build CPU with rabit mock" + def container_type = "cpu" + def docker_binary = "docker" + sh """ + ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh + ${dockerRun} ${container_type} ${docker_binary} build/testxgboost + """ + // Sanitizer test + def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer -e ASAN_OPTIONS=symbolize=1 --cap-add SYS_PTRACE'" + def docker_args = "--build-arg CMAKE_VERSION=3.12" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address" \ + -DCMAKE_BUILD_TYPE=Debug -DRABIT_MOCK=ON -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ + ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} build/testxgboost + """ + deleteDir() + } +} + + def BuildCUDA(args) { node('linux && cpu') { unstash name: 'srcs' @@ -279,6 +304,20 @@ def TestPythonGPU(args) { } } +def TestCppRabit() { + node('linux && cpu') { + unstash name: 'xgboost_rabit_cpp_tests' + unstash name: 'srcs' + echo "Test C++, rabit mock on" + def container_type = "cpu" + def docker_binary = "docker" + sh """ + ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh tests/ci_build/approx.conf.in + """ + deleteDir() + } +} + def TestCppGPU(args) { nodeReq = (args.multi_gpu) ?
'linux && mgpu' : 'linux && gpu' node(nodeReq) { diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index d7ac820fc388..678bee018d26 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -303,7 +303,8 @@ void DenseCuts::Init } CHECK_EQ(summary_array.size(), in_sketchs->size()); size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor); - // TODO(chenqin): after remove is_bootstrap, hist break rabit assumption + // TODO(chenqin): rabit failure recovery assumes no bootstrap one-time call after loadcheckpoint + // we need to move this allreduce before the loadcheckpoint call in the future sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size()); p_cuts_->min_vals_.resize(sketchs.size()); diff --git a/tests/cli/approx.conf.in b/tests/ci_build/approx.conf.in similarity index 100% rename from tests/cli/approx.conf.in rename to tests/ci_build/approx.conf.in diff --git a/tests/ci_build/build_mock_cmake.sh b/tests/ci_build/build_mock_cmake.sh new file mode 100644 index 000000000000..5280f18c0ead --- /dev/null +++ b/tests/ci_build/build_mock_cmake.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -e +rm -rf build +mkdir build +cd build +cmake .. -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON +make clean +make -j$(nproc) +cd .. \ No newline at end of file diff --git a/tests/cli/runxgb.sh b/tests/ci_build/runxgb.sh similarity index 65% rename from tests/cli/runxgb.sh rename to tests/ci_build/runxgb.sh index d6ae2d88e85b..2672e6601f73 100755 --- a/tests/cli/runxgb.sh +++ b/tests/ci_build/runxgb.sh @@ -10,4 +10,4 @@ submit="python3 ../../dmlc-core/tracker/dmlc-submit" # instrument worker failure with mock=xxxx # check if host recovered from expectected iteration echo "====== 1. 
Fault recovery distributed test ======" -$submit --cluster=local --num-workers=10 --local-num-attempt=10 ../../xgboost $1 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1 +exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 ../../xgboost $1 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1 diff --git a/tests/cli/hist.conf.in b/tests/cli/hist.conf.in deleted file mode 100644 index 5632c1544304..000000000000 --- a/tests/cli/hist.conf.in +++ /dev/null @@ -1,11 +0,0 @@ -tree_method=hist -eta = 0.1 -gamma = 1.0 -seed = 0 -min_child_weight = 0 -max_depth = 5 - -num_round = 11 -save_period = 100 -data = "../../demo/data/agaricus.txt.train" -eval[test] = "../../demo/data/agaricus.txt.test" diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index 81c35b1b4e43..1fa0f513abe5 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -51,15 +51,3 @@ if [ ${TASK} == "cmake_test" ]; then cd .. rm -rf build fi - -if [ ${TASK} == "xgb_recovery" ]; then - set -e - mkdir build - cd build - cmake -DRABIT_MOCK=ON -DUSE_OPENMP=OFF .. - make - cd .. 
- rm -rf build - cd tests/cli - ./runxgb.sh approx.conf.in -fi diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh index 48e53a599c09..8288b9c9ff67 100755 --- a/tests/travis/setup.sh +++ b/tests/travis/setup.sh @@ -19,13 +19,3 @@ fi if [ ${TASK} == "cmake_test" ] && [ ${TRAVIS_OS_NAME} == "osx" ]; then sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3" fi - -if [ ${TASK} == "xgb_recovery" ]; then - if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then sudo apt-get install python3-pip; fi - if [[ ${TRAVIS_OS_NAME} == "osx" ]]; then - brew install python3; - sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3" - fi - sudo pip3 install cpplint pylint urllib3 numpy - sudo pip3 install websocket-client kubernetes -fi