From e22e60b4d54e5e485b884b1a3c60242a25863879 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 18 Jul 2024 21:07:05 +0800
Subject: [PATCH] Move device histogram storage into `histogram.cuh`.

Split up the `updater_gpu_hist.cu` file to make unit testing easier. Currently, the
test file directly includes `updater_gpu_hist.cu` and accesses the internals of the
updater, which makes the updater difficult to modify. Upcoming PRs will revise some
of the tests to remove such direct accesses.
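
With this move, `Reset` and `AllocateHistograms` now take the `Context` explicitly
so that the zeroing kernels are launched on the context's CUDA stream. For
reference, usage looks roughly as below (a minimal sketch mirroring the new test in
`test_histogram.cu`; the bin count and node index are illustrative):

    auto ctx = MakeCUDACtx(0);                       // test helper: CUDA context on device 0
    DeviceHistogramStorage<> histogram;              // default kStopGrowingSize
    histogram.Init(ctx.Device(), /*n_bins=*/128);
    histogram.AllocateHistograms(&ctx, {0});         // contiguous, zero-initialised storage for node 0
    auto node_hist = histogram.GetNodeHistogram(0);  // common::Span<GradientPairInt64>, 128 bins wide
    histogram.Reset(&ctx);                           // zero the cache and drop the nidx maps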
---
 src/tree/gpu_hist/histogram.cuh           | 119 ++++++++++++++++++++--
 src/tree/updater_gpu_hist.cu              | 113 +-------------------
 tests/cpp/helpers.cc                      |   2 +
 tests/cpp/helpers.h                       |   3 +
 tests/cpp/tree/gpu_hist/test_histogram.cu |  40 ++++++++
 tests/cpp/tree/test_gpu_hist.cu           |  61 ++---------
 6 files changed, 171 insertions(+), 167 deletions(-)

diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh
index 862821b00b63..87c60a8bfdbc 100644
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -5,12 +5,14 @@
 #define HISTOGRAM_CUH_
 #include <memory>  // for unique_ptr
 
-#include "../../common/cuda_context.cuh"  // for CUDAContext
-#include "../../data/ellpack_page.cuh"    // for EllpackDeviceAccessor
-#include "feature_groups.cuh"             // for FeatureGroupsAccessor
-#include "xgboost/base.h"                 // for GradientPair, GradientPairInt64
-#include "xgboost/context.h"              // for Context
-#include "xgboost/span.h"                 // for Span
+#include "../../common/cuda_context.cuh"    // for CUDAContext
+#include "../../common/device_helpers.cuh"  // for LaunchN
+#include "../../common/device_vector.cuh"   // for device_vector
+#include "../../data/ellpack_page.cuh"      // for EllpackDeviceAccessor
+#include "feature_groups.cuh"               // for FeatureGroupsAccessor
+#include "xgboost/base.h"                   // for GradientPair, GradientPairInt64
+#include "xgboost/context.h"                // for Context
+#include "xgboost/span.h"                   // for Span
 
 namespace xgboost::tree {
 /**
@@ -60,6 +62,111 @@ class GradientQuantiser {
   }
 };
 
+/**
+ * @brief Data storage for node histograms on device. Automatically expands.
+ *
+ * @tparam kStopGrowingSize  Do not grow the cached buffer beyond this many elements; use overflow instead.
+ *
+ * @author  Rory
+ * @date    28/07/2018
+ */
+template <size_t kStopGrowingSize = 1 << 28>
+class DeviceHistogramStorage {
+ private:
+  using GradientSumT = GradientPairInt64;
+  /** @brief Map nidx to starting index of its histogram. */
+  std::map<int, size_t> nidx_map_;
+  // Large buffer of zeroed memory, caches histograms
+  dh::device_vector<typename GradientSumT::ValueT> data_;
+  // If we run out of storage, allocate the histograms in the
+  // overflow buffer. Not cached; overwritten whenever new
+  // histograms are requested.
+  dh::device_vector<typename GradientSumT::ValueT> overflow_;
+  std::map<int, size_t> overflow_nidx_map_;
+  int n_bins_;
+  DeviceOrd device_id_;
+  static constexpr size_t kNumItemsInGradientSum =
+      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
+  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
+
+ public:
+  // Start with about 32 MiB (1 << 22 eight-byte elements)
+  DeviceHistogramStorage() { data_.reserve(1 << 22); }
+  void Init(DeviceOrd device_id, int n_bins) {
+    this->n_bins_ = n_bins;
+    this->device_id_ = device_id;
+  }
+
+  void Reset(Context const* ctx) {
+    auto d_data = data_.data().get();
+    dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
+                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
+    nidx_map_.clear();
+    overflow_nidx_map_.clear();
+  }
+  [[nodiscard]] bool HistogramExists(int nidx) const {
+    return nidx_map_.find(nidx) != nidx_map_.cend() ||
+           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
+  }
+  [[nodiscard]] int Bins() const { return n_bins_; }
+  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
+  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
+
+  void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+    for (int nidx : new_nidxs) {
+      CHECK(!HistogramExists(nidx));
+    }
+    // Number of items currently used in data
+    const size_t used_size = nidx_map_.size() * HistogramSize();
+    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
+    if (used_size >= kStopGrowingSize) {
+      // Use overflow
+      // Delete previous entries
+      overflow_nidx_map_.clear();
+      overflow_.resize(HistogramSize() * new_nidxs.size());
+      // Zero memory
+      auto d_data = overflow_.data().get();
+      dh::LaunchN(overflow_.size(), ctx->CUDACtx()->Stream(),
+                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
+      }
+    } else {
+      CHECK_GE(data_.size(), used_size);
+      // Expand if necessary
+      if (data_.size() < new_used_size) {
+        data_.resize(std::max(data_.size() * 2, new_used_size));
+      }
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
+      }
+    }
+
+    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
+  }
+
+  /**
+   * @brief Return the histogram memory span for a given node.
+   * @param nidx Tree node index.
+   * @return Span over the node's histogram.
+   */
+  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
+    CHECK(this->HistogramExists(nidx));
+
+    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
+      // Fetch from normal cache
+      auto ptr = data_.data().get() + nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    } else {
+      // Fetch from overflow
+      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    }
+  }
+};
+
 class DeviceHistogramBuilderImpl;
 
 class DeviceHistogramBuilder {
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 7d566c3b40ae..83f84ec1f4a5 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -49,113 +49,6 @@ namespace xgboost::tree {
 DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
 #endif  // !defined(GTEST_TEST)
 
-/**
- * \struct  DeviceHistogramStorage
- *
- * \summary Data storage for node histograms on device. Automatically expands.
- *
- * \tparam GradientSumT      histogram entry type.
- * \tparam kStopGrowingSize  Do not grow beyond this size
- *
- * \author  Rory
- * \date    28/07/2018
- */
-template <size_t kStopGrowingSize = 1 << 28>
-class DeviceHistogramStorage {
- private:
-  using GradientSumT = GradientPairInt64;
-  /*! \brief Map nidx to starting index of its histogram. */
-  std::map<int, size_t> nidx_map_;
-  // Large buffer of zeroed memory, caches histograms
-  dh::device_vector<typename GradientSumT::ValueT> data_;
-  // If we run out of storage allocate one histogram at a time
-  // in overflow. Not cached, overwritten when a new histogram
-  // is requested
-  dh::device_vector<typename GradientSumT::ValueT> overflow_;
-  std::map<int, size_t> overflow_nidx_map_;
-  int n_bins_;
-  DeviceOrd device_id_;
-  static constexpr size_t kNumItemsInGradientSum =
-      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
-  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
-
- public:
-  // Start with about 16mb
-  DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(DeviceOrd device_id, int n_bins) {
-    this->n_bins_ = n_bins;
-    this->device_id_ = device_id;
-  }
-
-  void Reset() {
-    auto d_data = data_.data().get();
-    dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
-    nidx_map_.clear();
-    overflow_nidx_map_.clear();
-  }
-  [[nodiscard]] bool HistogramExists(int nidx) const {
-    return nidx_map_.find(nidx) != nidx_map_.cend() ||
-           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
-  }
-  [[nodiscard]] int Bins() const { return n_bins_; }
-  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
-  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
-
-  void AllocateHistograms(const std::vector<int>& new_nidxs) {
-    for (int nidx : new_nidxs) {
-      CHECK(!HistogramExists(nidx));
-    }
-    // Number of items currently used in data
-    const size_t used_size = nidx_map_.size() * HistogramSize();
-    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-    if (used_size >= kStopGrowingSize) {
-      // Use overflow
-      // Delete previous entries
-      overflow_nidx_map_.clear();
-      overflow_.resize(HistogramSize() * new_nidxs.size());
-      // Zero memory
-      auto d_data = overflow_.data().get();
-      dh::LaunchN(overflow_.size(),
-                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
-      }
-    } else {
-      CHECK_GE(data_.size(), used_size);
-      // Expand if necessary
-      if (data_.size() < new_used_size) {
-        data_.resize(std::max(data_.size() * 2, new_used_size));
-      }
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
-      }
-    }
-
-    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
-  }
-
-  /**
-   * \summary   Return pointer to histogram memory for a given node.
-   * \param nidx    Tree node index.
-   * \return    hist pointer.
-   */
-  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
-    CHECK(this->HistogramExists(nidx));
-
-    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
-      // Fetch from normal cache
-      auto ptr = data_.data().get() + nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    } else {
-      // Fetch from overflow
-      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    }
-  }
-};
-
 // Manage memory for a single GPU
 struct GPUHistMakerDevice {
  private:
@@ -258,7 +151,7 @@ struct GPUHistMakerDevice {
 
     // Init histogram
     hist.Init(ctx_->Device(), page->Cuts().TotalBins());
-    hist.Reset();
+    hist.Reset(ctx_);
 
     this->InitFeatureGroupsOnce();
 
@@ -657,7 +550,7 @@ struct GPUHistMakerDevice {
     all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
     // Allocate the histograms
     // Guaranteed contiguous memory
-    hist.AllocateHistograms(all_new);
+    hist.AllocateHistograms(ctx_, all_new);
 
     for (auto nidx : hist_nidx) {
       this->BuildHist(nidx);
@@ -748,7 +641,7 @@ struct GPUHistMakerDevice {
         ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
     collective::SafeColl(rc);
 
-    hist.AllocateHistograms({kRootNIdx});
+    hist.AllocateHistograms(ctx_, {kRootNIdx});
     this->BuildHist(kRootNIdx);
     this->AllReduceHist(kRootNIdx, 1);
 
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 9b988f9605bd..eebbaf8ef795 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -763,4 +763,6 @@ void DeleteRMMResource(RMMAllocator*) {}
 
 RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
 #endif  // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
+
+std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
 } // namespace xgboost
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index b2e9e08cd80c..2821a11380c8 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -526,6 +526,9 @@ inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
 
 inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
 
+// GPU device ordinal for distributed tests
+std::int32_t DistGpuIdx();
+
 inline auto GMockThrow(StringView msg) {
   return ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr(msg));
 }
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 860e4bfd4ea0..c9320f616983 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -14,6 +14,46 @@
 #include "../../helpers.h"
 
 namespace xgboost::tree {
+TEST(Histogram, DeviceHistogramStorage) {
+  // Ensures that nodes are allocated correctly after reaching `kStopGrowingSize`.
+  auto ctx = MakeCUDACtx(0);
+  constexpr size_t kNBins = 128;
+  constexpr int kNNodes = 4;
+  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
+  DeviceHistogramStorage<kStopGrowing> histogram;
+  histogram.Init(FstCU(), kNBins);
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  histogram.Reset(&ctx);
+  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
+
+  // Use allocated memory but do not erase nidx_map.
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Add two new nodes
+  histogram.AllocateHistograms(&ctx, {kNNodes});
+  histogram.AllocateHistograms(&ctx, {kNNodes + 1});
+
+  // Old cached nodes should still exist
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Should be deleted
+  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
+  // Most recent node should exist
+  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
+
+  // Add same node again - should fail
+  EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1}));
+}
+
 void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) {
   Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 291b46edea36..728fb62c46d4 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -6,7 +6,6 @@
 #include <thrust/host_vector.h>
 #include <xgboost/base.h>
 
-#include <random>
 #include <string>
 #include <vector>
 
@@ -23,46 +22,6 @@
 #include "xgboost/json.h"
 
 namespace xgboost::tree {
-TEST(GpuHist, DeviceHistogramStorage) {
-  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
-  dh::safe_cuda(cudaSetDevice(0));
-  constexpr size_t kNBins = 128;
-  constexpr int kNNodes = 4;
-  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
-  DeviceHistogramStorage<kStopGrowing> histogram;
-  histogram.Init(FstCU(), kNBins);
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  histogram.Reset();
-  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
-
-  // Use allocated memory but do not erase nidx_map.
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Add two new nodes
-  histogram.AllocateHistograms({kNNodes});
-  histogram.AllocateHistograms({kNNodes + 1});
-
-  // Old cached nodes should still exist
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Should be deleted
-  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
-  // Most recent node should exist
-  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
-
-  // Add same node again - should fail
-  EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1}););
-}
-
 std::vector<GradientPairPrecise> GetHostHistGpair() {
   // 24 bins, 3 bins for each feature (column).
   std::vector<GradientPairPrecise> hist_gpair = {
@@ -108,7 +67,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   maker.row_partitioner = std::make_unique<RowPartitioner>(&ctx, kNRows, 0);
 
   maker.hist.Init(ctx.Device(), page->Cuts().TotalBins());
-  maker.hist.AllocateHistograms({0});
+  maker.hist.AllocateHistograms(&ctx, {0});
 
   maker.gpair = gpair.DeviceSpan();
   maker.quantiser = std::make_unique<GradientQuantiser>(&ctx, maker.gpair, MetaInfo());
@@ -425,8 +384,8 @@ TEST(GpuHist, MaxDepth) {
 namespace {
 RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUHistMaker hist_maker{ctx, &task};
-  hist_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> hist_maker{TreeUpdater::Create("grow_gpu_hist", ctx, &task)};
+  hist_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -436,8 +395,8 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  hist_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                    {&tree});
+  hist_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                     {&tree});
   return tree;
 }
 
@@ -476,8 +435,8 @@ TEST_F(MGPUHistTest, HistColumnSplit) {
 namespace {
 RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUGlobalApproxMaker approx_maker{ctx, &task};
-  approx_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)};
+  approx_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -487,13 +446,13 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  approx_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                      {&tree});
+  approx_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                       {&tree});
   return tree;
 }
 
 void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) {
-  Context ctx(MakeCUDACtx(GPUIDX));
+  auto ctx = MakeCUDACtx(DistGpuIdx());
 
   auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
   auto const world_size = collective::GetWorldSize();