[rabit_bootstrap_cache] Failed xgb worker recovery from other workers #4808

Merged Sep 17, 2019 (39 commits)

Commits
cfa48ff  rebase and pull rabit with latest feature (chenqin, Aug 25, 2019)
c763e68  avoid pulling fault-tolerant rabit on the windows platform (Aug 25, 2019)
fb484fb  detect windows 64 (chenqin, Aug 25, 2019)
4889c2a  Revert "detect windows 64" (chenqin, Aug 26, 2019)
df6f715  patch rabit bootstrap cache to xgb (except checkpoint) (chenqin, Aug 26, 2019)
d8a8a3e  add max_depth, tree_method into checkpoint (Aug 27, 2019)
1ec7dc7  pass configs when loading model (Aug 27, 2019)
3dfa7e4  save missed configs to checkpoint (Aug 27, 2019)
6734f9a  add support for xgb worker failure recovery for approx (chenqin, Aug 28, 2019)
b96888a  Merge branch 'master' into rabit_dist (chenqin, Aug 28, 2019)
6e1bdca  work around booster save/load inconsistency, leave fix to item 3 in ht… (chenqin, Aug 29, 2019)
e2369d3  Merge branch 'rabit_dist' of github.com:chenqin/xgboost into rabit_dist (chenqin, Aug 29, 2019)
2bde716  check if rabit_bootstrap_cache is set before writing to checkpoint (Aug 29, 2019)
e30e45e  fix clang-tidy (Aug 30, 2019)
82bbfbe  remove unnecessary changes, add hist test (Aug 30, 2019)
98034e2  revert CMake file change on win32 check (Aug 30, 2019)
4e134f9  remove n_gpus (Sep 5, 2019)
61615c0  point to latest rabit, remove is_bootstrap (Sep 5, 2019)
ece34c9  visual studio doesn't support latest openmp (Sep 5, 2019)
fc1a39f  clean up xgb_recovery test scripts (Sep 5, 2019)
d31fe75  include linux environment (chenqin, Sep 6, 2019)
283ed5b  avoid signature issue on osx (chenqin, Sep 6, 2019)
d77ad53  remove linux environment (chenqin, Sep 6, 2019)
a006266  fix openmp vs support (chenqin, Sep 6, 2019)
884736d  misc (chenqin, Sep 6, 2019)
15fd6a6  update rabit (chenqin, Sep 7, 2019)
4e90a6e  Revert "update rabit" (Sep 7, 2019)
a65f065  apply memory access optimized customized reduction (chenqin, Sep 8, 2019)
7865e6b  Revert "apply memory access optimized customized reduction" (chenqin, Sep 8, 2019)
656d43c  update rabit (chenqin, Sep 8, 2019)
c34de2f  update rabit with is_bootstrap parameter removed (Sep 9, 2019)
337f1a8  switch base to dmlc/rabit master (Sep 10, 2019)
ac90063  per feedback (chenqin, Sep 11, 2019)
0a38691  Merge branch 'rabit_dist' of github.com:chenqin/xgboost into rabit_dist (chenqin, Sep 11, 2019)
3246613  per feedback, move distributed xgboost recovery to jenkins (chenqin, Sep 13, 2019)
03c790a  try fix jenkins (chenqin, Sep 13, 2019)
b822358  use exec in runxgb shell script (Sep 12, 2019)
d81965b  try fix jenkins missing xgboost exe (Sep 13, 2019)
9e94f0f  try fix path in jenkins (Sep 13, 2019)
2 changes: 1 addition & 1 deletion .travis.yml
@@ -1,7 +1,7 @@
# disable sudo for container build.
sudo: required

-# Enabling test on Linux and OS X
+# Enabling test on OS X
os:
- osx

23 changes: 16 additions & 7 deletions CMakeLists.txt
@@ -32,6 +32,7 @@ option(GOOGLE_TEST "Build google tests" OFF)
option(USE_DMLC_GTEST "Use google tests bundled with dmlc-core submodule (EXPERIMENTAL)" OFF)
option(USE_NVTX "Build with cuda profiling annotations. Developers only." OFF)
set(NVTX_HEADER_DIR "" CACHE PATH "Path to the stand-alone nvtx header")
+option(RABIT_MOCK "Build rabit with mock" OFF)
## CUDA
option(USE_CUDA "Build with GPU acceleration" OFF)
option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
@@ -88,17 +89,25 @@ list(APPEND LINKED_LIBRARIES_PRIVATE dmlc)

# rabit
# full rabit doesn't build on windows, so we can't import it as subdirectory
-if(MINGW OR R_LIB)
+if(MINGW OR R_LIB OR WIN32)
   set(RABIT_SOURCES
     rabit/src/engine_empty.cc
     rabit/src/c_api.cc)
 else ()
-  set(RABIT_SOURCES
-    rabit/src/allreduce_base.cc
-    rabit/src/allreduce_robust.cc
-    rabit/src/engine.cc
-    rabit/src/c_api.cc)
-endif (MINGW OR R_LIB)
+  if(RABIT_MOCK)
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine_mock.cc
+      rabit/src/c_api.cc)
+  else()
+    set(RABIT_SOURCES
+      rabit/src/allreduce_base.cc
+      rabit/src/allreduce_robust.cc
+      rabit/src/engine.cc
+      rabit/src/c_api.cc)
+  endif(RABIT_MOCK)
+endif (MINGW OR R_LIB OR WIN32)
add_library(rabit STATIC ${RABIT_SOURCES})
target_include_directories(rabit PRIVATE
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/dmlc-core/include>
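Usage note: the RABIT_MOCK switch added above is what the CI changes below rely on. tests/ci_build/build_mock_cmake.sh (added at the bottom of this diff) configures the build with -DRABIT_MOCK=ON, so rabit is compiled from engine_mock.cc instead of engine.cc and test runs can instrument collective calls to kill chosen workers.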
32 changes: 32 additions & 0 deletions Jenkinsfile
@@ -56,6 +56,7 @@ pipeline {
script {
parallel ([
'build-cpu': { BuildCPU() },
+'build-cpu-rabit-mock': { BuildCPUMock() },
'build-gpu-cuda9.0': { BuildCUDA(cuda_version: '9.0') },
'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') },
'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
@@ -76,6 +77,7 @@
'test-python-gpu-cuda10.0': { TestPythonGPU(cuda_version: '10.0') },
'test-python-gpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1') },
'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) },
+'test-cpp-rabit': { TestCppRabit() },
'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') },
'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) },
'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') },
@@ -185,6 +187,22 @@ def BuildCPU() {
}
}

+def BuildCPUMock() {
+  node('linux && cpu') {
+    unstash name: 'srcs'
+    echo "Build CPU with rabit mock"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/build_mock_cmake.sh
+    """
+    echo 'Stashing rabit C++ test executable (xgboost)...'
+    stash name: 'xgboost_rabit_tests', includes: 'xgboost'
+    deleteDir()
+  }
+}


def BuildCUDA(args) {
node('linux && cpu') {
unstash name: 'srcs'
@@ -279,6 +297,20 @@ def TestPythonGPU(args) {
}
}

+def TestCppRabit() {
+  node(nodeReq) {
+    unstash name: 'xgboost_rabit_tests'
+    unstash name: 'srcs'
+    echo "Test C++, rabit mock on"
+    def container_type = "cpu"
+    def docker_binary = "docker"
+    sh """
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/runxgb.sh xgboost tests/ci_build/approx.conf.in
+    """
+    deleteDir()
+  }
+}
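This stage consumes the xgboost binary stashed by BuildCPUMock and drives it through tests/ci_build/runxgb.sh with the tests/ci_build/approx.conf.in training config; both files are added at the bottom of this diff.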

def TestCppGPU(args) {
nodeReq = (args.multi_gpu) ? 'linux && mgpu' : 'linux && gpu'
node(nodeReq) {
2 changes: 2 additions & 0 deletions src/common/hist_util.cc
@@ -303,6 +303,8 @@ void DenseCuts::Init
}
CHECK_EQ(summary_array.size(), in_sketchs->size());
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
+// TODO(chenqin): rabit failure recovery assumes no bootstrap one-time call after LoadCheckPoint;
+// we need to move this allreduce before the LoadCheckPoint call in the future
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
p_cuts_->min_vals_.resize(sketchs.size());

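To make the TODO concrete, here is a minimal sketch of the ordering constraint against rabit's public checkpoint API (illustrative code, not xgboost's; the Model type, buffer sizes, and round count are invented for the example). A relaunched worker resumes from LoadCheckPoint and never re-executes startup-only code, so a one-time collective placed after LoadCheckPoint would leave the surviving workers blocked on it, unless, as this PR arranges, the bootstrap cache can replay its result to the recovering worker.

```cpp
#include <rabit/rabit.h>

// Trivial global model so the checkpoint calls have something to carry.
struct Model : public rabit::Serializable {
  float weight {0.0f};
  void Load(rabit::Stream *fi) override { fi->Read(&weight, sizeof(weight)); }
  void Save(rabit::Stream *fo) const override {
    fo->Write(&weight, sizeof(weight));
  }
};

int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  Model model;

  // Bootstrap-time collective: runs exactly once, so keep it *before*
  // LoadCheckPoint; recovery then never needs to replay it mid-training.
  float sketch[8] = {0};
  rabit::Allreduce<rabit::op::Sum>(sketch, 8);

  int version = rabit::LoadCheckPoint(&model);  // 0 on a fresh start
  for (int iter = version; iter < 10; ++iter) {
    // Per-iteration collectives are covered by rabit's normal replay path.
    rabit::Allreduce<rabit::op::Sum>(&model.weight, 1);
    rabit::CheckPoint(&model);  // advances the recoverable version number
  }
  rabit::Finalize();
  return 0;
}
```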
2 changes: 1 addition & 1 deletion src/common/random.h
@@ -127,7 +127,7 @@ class ColumnSampler {
*/
ColumnSampler() {
uint32_t seed = common::GlobalRandom()();
-rabit::Broadcast(&seed, sizeof(seed), 0);
+rabit::Broadcast(&seed, sizeof(seed), 0, "seed");
rng_.seed(seed);
}

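The trailing string argument added in this hunk is the visible surface of the bootstrap cache, and the data.cc hunk below passes the input file name the same way so each loaded matrix gets its own cache entry. A hedged sketch of the two call shapes, inferred from the changed lines in this diff (the four- and five-argument signatures are the PR's modified rabit, not upstream's documented API): the string names the call site so the result of a one-time collective can be stored and replayed to a recovering worker.

```cpp
// Sketch of the tagged collectives, matching the changed lines in this diff.
#include <cstdint>
#include <rabit/rabit.h>

void BootstrapCollectives() {
  // random.h hunk: every worker adopts rank 0's RNG seed, cached under "seed".
  uint32_t seed = 0;
  rabit::Broadcast(&seed, sizeof(seed), 0, "seed");

  // learner.cc hunk: agree on the maximum feature count across workers,
  // cached under "num_feature"; prepare_fun/prepare_arg stay nullptr.
  unsigned num_feature = 0;
  rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr,
                                   "num_feature");
}
```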
3 changes: 2 additions & 1 deletion src/data/data.cc
@@ -229,7 +229,8 @@ DMatrix* DMatrix::Load(const std::string& uri,
/* sync up number of features after matrix loaded.
* partitioned data will fail the train/val validation check
* since partitioned data not knowing the real number of features. */
-rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1);
+rabit::Allreduce<rabit::op::Max>(&dmat->Info().num_col_, 1, nullptr,
+                                 nullptr, fname.c_str());
// backward compatiblity code.
if (!load_row_split) {
MetaInfo& info = dmat->Info();
24 changes: 21 additions & 3 deletions src/learner.cc
@@ -270,6 +270,9 @@ class LearnerImpl : public Learner {
kv.second = "cpu_predictor";
LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
}
+if (saved_configs_.find(saved_param) != saved_configs_.end()) {
+  cfg_[saved_param] = kv.second;
+}
}
}
attributes_ = std::map<std::string, std::string>(attr.begin(), attr.end());
@@ -302,6 +305,10 @@
p_metric->Configure({cfg_.begin(), cfg_.end()});
}

+// copy dsplit from config since it will not run again during restore
+if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
+  tparam_.dsplit = DataSplitMode::kRow;
+}
this->configured_ = true;
}
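The pinning above matters for recovery: per the in-diff comment, Configure() does not run again when a restarted worker restores from a checkpoint, so a dsplit left at kAuto could be resolved differently by the recovered worker than by the healthy ones; fixing it to row split up front in any distributed (rabit::IsDistributed()) job keeps all workers consistent.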

@@ -332,8 +339,15 @@
}
}
{
-// Write `predictor`, `gpu_id` parameters as extra attributes
-for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
+std::vector<std::string> saved_params{"predictor", "gpu_id"};
+// check if rabit_bootstrap_cache was set to non-zero before adding to checkpoint
+if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() &&
+    (cfg_.find("rabit_bootstrap_cache"))->second != "0") {
+  std::copy(saved_configs_.begin(), saved_configs_.end(),
+            std::back_inserter(saved_params));
+}
+// Write `predictor`, `gpu_id`, and (with rabit_bootstrap_cache) the saved configs as extra attributes
+for (const auto& key : saved_params) {
auto it = cfg_.find(key);
if (it != cfg_.end()) {
mparam.contain_extra_attrs = 1;
@@ -601,7 +615,7 @@
num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
}
// run allreduce on num_feature to find the maximum value
-rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
+rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
if (num_feature > mparam_.num_feature) {
mparam_.num_feature = num_feature;
}
@@ -648,6 +662,10 @@
std::vector<std::shared_ptr<DMatrix> > cache_;

common::Monitor monitor_;

+/*! \brief saved config keys used to restore a failed worker */
+std::set<std::string> saved_configs_ = {"max_depth", "tree_method", "dsplit",
+    "seed", "silent", "num_round", "gamma", "min_child_weight"};
};

std::string const LearnerImpl::kEvalMetric {"eval_metric"}; // NOLINT
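Taken together, the learner.cc hunks implement a save/load round trip for the hyperparameters a restarted worker needs. A minimal sketch of that round trip, with plain std::map stand-ins and hypothetical function names (CollectExtraAttrs, RestoreSavedConfigs), not xgboost's actual serialization code:

```cpp
#include <map>
#include <set>
#include <string>

using Config = std::map<std::string, std::string>;

const std::set<std::string> kSavedConfigs = {
    "max_depth", "tree_method", "dsplit", "seed",
    "silent", "num_round", "gamma", "min_child_weight"};

// Mirror of the save-side gating: only persist the extra keys when
// rabit_bootstrap_cache is present and non-zero.
Config CollectExtraAttrs(const Config& cfg) {
  Config attrs;
  auto bc = cfg.find("rabit_bootstrap_cache");
  if (bc != cfg.end() && bc->second != "0") {
    for (const auto& key : kSavedConfigs) {
      auto it = cfg.find(key);
      if (it != cfg.end()) attrs[it->first] = it->second;
    }
  }
  return attrs;
}

// Mirror of the load-side hunk: restore only whitelisted keys into the config.
void RestoreSavedConfigs(const Config& attrs, Config* cfg) {
  for (const auto& kv : attrs) {
    if (kSavedConfigs.count(kv.first) != 0) (*cfg)[kv.first] = kv.second;
  }
}
```

In the diff itself the key list lives in saved_configs_ and the attributes travel inside the model checkpoint, so a relaunched worker that loads the checkpoint trains with the same max_depth, tree_method, seed, and related settings as the workers it rejoins.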
12 changes: 12 additions & 0 deletions tests/ci_build/approx.conf.in
@@ -0,0 +1,12 @@
+# Originally an example in demo/regression/
+tree_method = approx
+eta = 0.5
+gamma = 1.0
+seed = 0
+min_child_weight = 0
+max_depth = 5
+
+num_round = 12
+save_period = 100
+data = "demo/data/agaricus.txt.train"
+eval[test] = "demo/data/agaricus.txt.test"
10 changes: 10 additions & 0 deletions tests/ci_build/build_mock_cmake.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+rm -rf build
+mkdir build
+cd build
+cmake -DRABIT_MOCK=ON -DCMAKE_VERBOSE_MAKEFILE=ON ..
+make clean
+make -j$(nproc)
+cd ..
13 changes: 13 additions & 0 deletions tests/ci_build/runxgb.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Run make in rabit/test to generate librabit_mock,
+# update config.mk, and build xgboost using the mock.
+export DMLC_SUBMIT_CLUSTER=local
+
+submit="python3 dmlc-core/tracker/dmlc-submit"
+# build xgboost with the librabit mock
+# define max worker retries with dmlc-core's local num attempt
+# instrument worker failures with mock=xxxx
+# check whether hosts recovered from the expected iteration
+echo "====== 1. Fault recovery distributed test ======"
+exec $submit --cluster=local --num-workers=10 --local-num-attempt=10 $1 $2 mock=0,10,1,0 mock=1,11,1,0 mock=1,11,1,1 mock=0,11,1,0 mock=4,11,1,0 mock=9,11,1,0 mock=8,11,2,0 mock=4,11,3,0 rabit_bootstrap_cache=1 rabit_debug=1
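A note on the arguments, assuming rabit's documented mock-engine format: each mock=a,b,c,d tuple is read as rank,version,seq,ndeath, i.e. the worker with that rank kills itself when it reaches the collective call with sequence number c in checkpoint version (iteration) b for the d-th time. With num_round = 12 in approx.conf.in, the tuples above inject failures on assorted ranks around iterations 10 and 11, some repeatedly; the local tracker relaunches each failed worker up to --local-num-attempt=10 times, rabit_bootstrap_cache=1 lets the relaunched workers replay bootstrap-time collectives (the feature under test), and rabit_debug=1 presumably turns on verbose logging of the recovery path.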