From 366f3cb9d8b215c06877069e95262998ad105f44 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Tue, 9 Mar 2021 14:53:05 -0800
Subject: [PATCH] Add use_rmm flag to global configuration (#6656)

* Ensure RMM is 0.18 or later

* Add use_rmm flag to global configuration

* Modify XGBCachingDeviceAllocatorImpl to skip CUB when use_rmm=True

* Update the demo

* [CI] Pin NumPy to 1.19.4, since NumPy 1.19.5 doesn't work with latest Shap
---
 R-package/tests/testthat/test_config.R | 10 ++++++
 demo/rmm_plugin/rmm_mgpu_with_dask.py  |  5 ++-
 demo/rmm_plugin/rmm_singlegpu.py       |  2 ++
 doc/parameter.rst                      |  1 +
 include/xgboost/global_config.h        |  4 +++
 src/common/device_helpers.cuh          | 44 ++++++++++++++++++++++----
 tests/ci_build/Dockerfile.rmm          |  2 +-
 tests/ci_build/conda_env/cpu_test.yml  |  2 +-
 tests/cpp/c_api/test_c_api.cc          | 21 +++++++++++-
 tests/cpp/test_global_config.cc        | 10 ++++++
 tests/python/test_config.py            | 12 +++++++
 tests/python/test_with_dask.py         | 24 ++++++++------
 12 files changed, 117 insertions(+), 20 deletions(-)

diff --git a/R-package/tests/testthat/test_config.R b/R-package/tests/testthat/test_config.R
index dc1ea2d6a3e1..6951483011d7 100644
--- a/R-package/tests/testthat/test_config.R
+++ b/R-package/tests/testthat/test_config.R
@@ -9,3 +9,13 @@ test_that('Global configuration works with verbosity', {
   xgb.set.config(verbosity = old_verbosity)
   expect_equal(xgb.get.config()$verbosity, old_verbosity)
 })
+
+test_that('Global configuration works with use_rmm flag', {
+  old_use_rmm_flag <- xgb.get.config()$use_rmm
+  for (v in c(TRUE, FALSE)) {
+    xgb.set.config(use_rmm = v)
+    expect_equal(xgb.get.config()$use_rmm, v)
+  }
+  xgb.set.config(use_rmm = old_use_rmm_flag)
+  expect_equal(xgb.get.config()$use_rmm, old_use_rmm_flag)
+})
diff --git a/demo/rmm_plugin/rmm_mgpu_with_dask.py b/demo/rmm_plugin/rmm_mgpu_with_dask.py
index eac0c5da4822..a147e3072c12 100644
--- a/demo/rmm_plugin/rmm_mgpu_with_dask.py
+++ b/demo/rmm_plugin/rmm_mgpu_with_dask.py
@@ -5,13 +5,16 @@
 from dask_cuda import LocalCUDACluster
 
 def main(client):
+    # Inform XGBoost that RMM is used for GPU memory allocation
+    xgb.set_config(use_rmm=True)
+
     X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
     X = dask.array.from_array(X)
     y = dask.array.from_array(y)
     dtrain = xgb.dask.DaskDMatrix(client, X, label=y)
 
     params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3,
-              'tree_method': 'gpu_hist'}
+              'tree_method': 'gpu_hist', 'eval_metric': 'merror'}
     output = xgb.dask.train(client, params, dtrain, num_boost_round=100,
                             evals=[(dtrain, 'train')])
     bst = output['booster']
diff --git a/demo/rmm_plugin/rmm_singlegpu.py b/demo/rmm_plugin/rmm_singlegpu.py
index c56e0a0cef43..02caa1cc7e6d 100644
--- a/demo/rmm_plugin/rmm_singlegpu.py
+++ b/demo/rmm_plugin/rmm_singlegpu.py
@@ -4,6 +4,8 @@
 
 # Initialize RMM pool allocator
 rmm.reinitialize(pool_allocator=True)
+# Inform XGBoost that RMM is used for GPU memory allocation
+xgb.set_config(use_rmm=True)
 
 X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3)
 dtrain = xgb.DMatrix(X, label=y)
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 4f4b0560a784..05da8de30278 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -22,6 +22,7 @@ Global Configuration
 The following parameters can be set in the global scope, using ``xgb.config_context()`` (Python) or ``xgb.set.config()`` (R).
 
 * ``verbosity``: Verbosity of printing messages. Valid values of 0 (silent), 1 (warning), 2 (info), and 3 (debug).
+* ``use_rmm``: Whether to use RAPIDS Memory Manager (RMM) to allocate GPU memory. This option is only applicable when XGBoost is built (compiled) with the RMM plugin enabled. Valid values are ``true`` and ``false``. The default is ``false``.
 
 ******************
 General Parameters
diff --git a/include/xgboost/global_config.h b/include/xgboost/global_config.h
index 70eca5c23108..835d63c88c51 100644
--- a/include/xgboost/global_config.h
+++ b/include/xgboost/global_config.h
@@ -16,11 +16,15 @@ class Json;
 
 struct GlobalConfiguration : public XGBoostParameter<GlobalConfiguration> {
   int verbosity { 1 };
+  bool use_rmm { false };
   DMLC_DECLARE_PARAMETER(GlobalConfiguration) {
     DMLC_DECLARE_FIELD(verbosity)
         .set_range(0, 3)
         .set_default(1)  // shows only warning
         .describe("Flag to print out detailed breakdown of runtime.");
+    DMLC_DECLARE_FIELD(use_rmm)
+        .set_default(false)
+        .describe("Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
   }
 };
 
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 62af08b429a0..f4578c3d2971 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -32,6 +32,7 @@
 #include "xgboost/logging.h"
 #include "xgboost/host_device_vector.h"
 #include "xgboost/span.h"
+#include "xgboost/global_config.h"
 
 #include "common.h"
 
@@ -42,6 +43,14 @@
 #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
 #include "rmm/mr/device/per_device_resource.hpp"
 #include "rmm/mr/device/thrust_allocator_adaptor.hpp"
+#include "rmm/version_config.hpp"
+
+#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
+#error "Please use RMM version 0.18 or later"
+#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18
+#error "Please use RMM version 0.18 or later"
+#endif  // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR)
+
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
 
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -453,21 +462,42 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
     return *allocator;
   }
   pointer allocate(size_t n) {  // NOLINT
-    T* ptr;
-    auto errc =  GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
-                                                            n * sizeof(T));
-    if (errc != cudaSuccess) {
-      ThrowOOMError("Caching allocator", n * sizeof(T));
+    pointer thrust_ptr;
+    if (use_cub_allocator_) {
+      T* raw_ptr{nullptr};
+      auto errc =  GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
+                                                              n * sizeof(T));
+      if (errc != cudaSuccess) {
+        ThrowOOMError("Caching allocator", n * sizeof(T));
+      }
+      thrust_ptr = pointer(raw_ptr);
+    } else {
+      try {
+        thrust_ptr = SuperT::allocate(n);
+        dh::safe_cuda(cudaGetLastError());
+      } catch (const std::exception &e) {
+        ThrowOOMError(e.what(), n * sizeof(T));
+      }
     }
-    pointer thrust_ptr{ ptr };
     GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
     return thrust_ptr;
   }
   void deallocate(pointer ptr, size_t n) {  // NOLINT
     GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
-    GetGlobalCachingAllocator().DeviceFree(ptr.get());
+    if (use_cub_allocator_) {
+      GetGlobalCachingAllocator().DeviceFree(ptr.get());
+    } else {
+      SuperT::deallocate(ptr, n);
+    }
   }
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBCachingDeviceAllocatorImpl()
+    : SuperT(rmm::cuda_stream_default, rmm::mr::get_current_device_resource()),
+      use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
+#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
   XGBOOST_DEVICE void construct(T *) {}  // NOLINT
+ private:
+  bool use_cub_allocator_{true};
 };
 }  // namespace detail
 
diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm
index 042fc425f9eb..a99e7c92639e 100644
--- a/tests/ci_build/Dockerfile.rmm
+++ b/tests/ci_build/Dockerfile.rmm
@@ -28,7 +28,7 @@ ENV PATH=/opt/python/bin:$PATH
 
 # Create new Conda environment with RMM
 RUN \
-    conda create -n gpu_test -c nvidia -c rapidsai-nightly -c rapidsai -c conda-forge -c defaults \
+    conda create -n gpu_test -c nvidia -c rapidsai -c conda-forge -c defaults \
         python=3.7 rmm=0.18* cudatoolkit=$CUDA_VERSION_ARG
 
 ENV GOSU_VERSION 1.10
diff --git a/tests/ci_build/conda_env/cpu_test.yml b/tests/ci_build/conda_env/cpu_test.yml
index 691b466448dd..db09624ecd5d 100644
--- a/tests/ci_build/conda_env/cpu_test.yml
+++ b/tests/ci_build/conda_env/cpu_test.yml
@@ -8,7 +8,7 @@ dependencies:
 - pyyaml
 - cpplint
 - pylint
-- numpy
+- numpy=1.19.4
 - scipy
 - scikit-learn
 - pandas
diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc
index 83dedd2da813..25388a324a11 100644
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -220,7 +220,8 @@ TEST(CAPI, XGBGlobalConfig) {
   {
     const char *config_str = R"json(
     {
-      "verbosity": 0
+      "verbosity": 0,
+      "use_rmm": false
     }
   )json";
     ret = XGBSetGlobalConfig(config_str);
@@ -233,6 +234,24 @@ TEST(CAPI, XGBGlobalConfig) {
     auto updated_config =
         Json::Load({updated_config_str.data(), updated_config_str.size()});
     ASSERT_EQ(get<Integer>(updated_config["verbosity"]), 0);
+    ASSERT_EQ(get<Boolean>(updated_config["use_rmm"]), false);
+  }
+  {
+    const char *config_str = R"json(
+    {
+      "use_rmm": true
+    }
+  )json";
+    ret = XGBSetGlobalConfig(config_str);
+    ASSERT_EQ(ret, 0);
+    const char *updated_config_cstr;
+    ret = XGBGetGlobalConfig(&updated_config_cstr);
+    ASSERT_EQ(ret, 0);
+
+    std::string updated_config_str{updated_config_cstr};
+    auto updated_config =
+        Json::Load({updated_config_str.data(), updated_config_str.size()});
+    ASSERT_EQ(get<Boolean>(updated_config["use_rmm"]), true);
   }
   {
     const char *config_str = R"json(
diff --git a/tests/cpp/test_global_config.cc b/tests/cpp/test_global_config.cc
index 6a10aa1e1fcb..a9df0ed928ae 100644
--- a/tests/cpp/test_global_config.cc
+++ b/tests/cpp/test_global_config.cc
@@ -19,4 +19,14 @@ TEST(GlobalConfiguration, Verbosity) {
   EXPECT_EQ(get<String>(current_config["verbosity"]), "0");
 }
 
+TEST(GlobalConfiguration, UseRMM) {
+  Json config{JsonObject()};
+  config["use_rmm"] = String("true");
+  auto& global_config = *GlobalConfigThreadLocalStore::Get();
+  FromJson(config, &global_config);
+  // GetConfig() should return updated use_rmm flag
+  Json current_config { ToJson(*GlobalConfigThreadLocalStore::Get()) };
+  EXPECT_EQ(get<String>(current_config["use_rmm"]), "1");
+}
+
 }  // namespace xgboost
diff --git a/tests/python/test_config.py b/tests/python/test_config.py
index 03b52571d7f1..87a544e9c88b 100644
--- a/tests/python/test_config.py
+++ b/tests/python/test_config.py
@@ -14,3 +14,15 @@ def get_current_verbosity():
         new_verbosity = get_current_verbosity()
         assert new_verbosity == verbosity_level
     assert old_verbosity == get_current_verbosity()
+
+
+@pytest.mark.parametrize('use_rmm', [False, True])
+def test_global_config_use_rmm(use_rmm):
+    def get_current_use_rmm_flag():
+        return xgb.get_config()['use_rmm']
+
+    old_use_rmm_flag = get_current_use_rmm_flag()
+    with xgb.config_context(use_rmm=use_rmm):
+        new_use_rmm_flag = get_current_use_rmm_flag()
+        assert new_use_rmm_flag == use_rmm
+    assert old_use_rmm_flag == get_current_use_rmm_flag()
diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py
index dec7bb75c52a..6bd7c5dcf528 100644
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -834,9 +834,15 @@ def test_dask_predict_leaf(booster: str, client: "Client") -> None:
 
 
 class TestWithDask:
-    def test_global_config(self, client: "Client") -> None:
+    @pytest.mark.parametrize('config_key,config_value', [('verbosity', 0), ('use_rmm', True)])
+    def test_global_config(
+            self,
+            client: "Client",
+            config_key: str,
+            config_value: Any
+    ) -> None:
         X, y, _ = generate_array()
-        xgb.config.set_config(verbosity=0)
+        xgb.config.set_config(**{config_key: config_value})
         dtrain = DaskDMatrix(client, X, y)
         before_fname = './before_training-test_global_config'
         after_fname = './after_training-test_global_config'
@@ -844,36 +850,36 @@ def test_global_config(self, client: "Client") -> None:
         class TestCallback(xgb.callback.TrainingCallback):
             def write_file(self, fname: str) -> None:
                 with open(fname, 'w') as fd:
-                    fd.write(str(xgb.config.get_config()['verbosity']))
+                    fd.write(str(xgb.config.get_config()[config_key]))
 
             def before_training(self, model: xgb.Booster) -> xgb.Booster:
                 self.write_file(before_fname)
-                assert xgb.config.get_config()['verbosity'] == 0
+                assert xgb.config.get_config()[config_key] == config_value
                 return model
 
             def after_training(self, model: xgb.Booster) -> xgb.Booster:
-                assert xgb.config.get_config()['verbosity'] == 0
+                assert xgb.config.get_config()[config_key] == config_value
                 return model
 
             def before_iteration(
                     self, model: xgb.Booster, epoch: int, evals_log: Dict
             ) -> bool:
-                assert xgb.config.get_config()['verbosity'] == 0
+                assert xgb.config.get_config()[config_key] == config_value
                 return False
 
             def after_iteration(
                     self, model: xgb.Booster, epoch: int, evals_log: Dict
             ) -> bool:
                 self.write_file(after_fname)
-                assert xgb.config.get_config()['verbosity'] == 0
+                assert xgb.config.get_config()[config_key] == config_value
                 return False
 
         xgb.dask.train(client, {}, dtrain, num_boost_round=4, callbacks=[TestCallback()])[
             'booster']
 
         with open(before_fname, 'r') as before, open(after_fname, 'r') as after:
-            assert before.read() == '0'
-            assert after.read() == '0'
+            assert before.read() == str(config_value)
+            assert after.read() == str(config_value)
 
         os.remove(before_fname)
         os.remove(after_fname)