From e22e60b4d54e5e485b884b1a3c60242a25863879 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Thu, 18 Jul 2024 21:07:05 +0800
Subject: [PATCH] Move device histogram storage into `histogram.cuh`.

Split up the `updater_gpu_hist.cu` file for easier unit testing. Currently,
the test file directly includes `updater_gpu_hist.cu` and accesses the
internals of the updater, which makes it difficult to make modifications.
Upcoming PRs will revise some of the tests to avoid such direct access.
---
 src/tree/gpu_hist/histogram.cuh           | 119 ++++++++++++++++++++--
 src/tree/updater_gpu_hist.cu              | 113 +-------------------
 tests/cpp/helpers.cc                      |   2 +
 tests/cpp/helpers.h                       |   3 +
 tests/cpp/tree/gpu_hist/test_histogram.cu |  40 ++++++++
 tests/cpp/tree/test_gpu_hist.cu           |  61 ++---------
 6 files changed, 171 insertions(+), 167 deletions(-)

diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh
index 862821b00b63..87c60a8bfdbc 100644
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -5,12 +5,14 @@
 #define HISTOGRAM_CUH_
 #include <memory>  // for unique_ptr
 
-#include "../../common/cuda_context.cuh"  // for CUDAContext
-#include "../../data/ellpack_page.cuh"    // for EllpackDeviceAccessor
-#include "feature_groups.cuh"             // for FeatureGroupsAccessor
-#include "xgboost/base.h"                 // for GradientPair, GradientPairInt64
-#include "xgboost/context.h"              // for Context
-#include "xgboost/span.h"                 // for Span
+#include "../../common/cuda_context.cuh"    // for CUDAContext
+#include "../../common/device_helpers.cuh"  // for LaunchN
+#include "../../common/device_vector.cuh"   // for device_vector
+#include "../../data/ellpack_page.cuh"      // for EllpackDeviceAccessor
+#include "feature_groups.cuh"               // for FeatureGroupsAccessor
+#include "xgboost/base.h"                   // for GradientPair, GradientPairInt64
+#include "xgboost/context.h"                // for Context
+#include "xgboost/span.h"                   // for Span
 
 namespace xgboost::tree {
 /**
@@ -60,6 +62,111 @@ class GradientQuantiser {
   }
 };
 
+/**
+ * @brief Data storage for node histograms on device. Automatically expands.
+ *
+ * @tparam kStopGrowingSize Do not grow beyond this size
+ *
+ * @author Rory
+ * @date 28/07/2018
+ */
+template <size_t kStopGrowingSize = 1 << 28>
+class DeviceHistogramStorage {
+ private:
+  using GradientSumT = GradientPairInt64;
+  /** @brief Map nidx to starting index of its histogram. */
+  std::map<int, size_t> nidx_map_;
+  // Large buffer of zeroed memory, caches histograms
+  dh::device_vector<typename GradientSumT::ValueT> data_;
+  // If we run out of storage allocate one histogram at a time
+  // in overflow. Not cached, overwritten when a new histogram
+  // is requested
+  dh::device_vector<typename GradientSumT::ValueT> overflow_;
+  std::map<int, size_t> overflow_nidx_map_;
+  int n_bins_;
+  DeviceOrd device_id_;
+  static constexpr size_t kNumItemsInGradientSum =
+      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
+  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
+
+ public:
+  // Start with about 16mb
+  DeviceHistogramStorage() { data_.reserve(1 << 22); }
+  void Init(DeviceOrd device_id, int n_bins) {
+    this->n_bins_ = n_bins;
+    this->device_id_ = device_id;
+  }
+
+  void Reset(Context const* ctx) {
+    auto d_data = data_.data().get();
+    dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
+                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
+    nidx_map_.clear();
+    overflow_nidx_map_.clear();
+  }
+  [[nodiscard]] bool HistogramExists(int nidx) const {
+    return nidx_map_.find(nidx) != nidx_map_.cend() ||
+           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
+  }
+  [[nodiscard]] int Bins() const { return n_bins_; }
+  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
+  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
+
+  void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+    for (int nidx : new_nidxs) {
+      CHECK(!HistogramExists(nidx));
+    }
+    // Number of items currently used in data
+    const size_t used_size = nidx_map_.size() * HistogramSize();
+    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
+    if (used_size >= kStopGrowingSize) {
+      // Use overflow
+      // Delete previous entries
+      overflow_nidx_map_.clear();
+      overflow_.resize(HistogramSize() * new_nidxs.size());
+      // Zero memory
+      auto d_data = overflow_.data().get();
+      dh::LaunchN(overflow_.size(), ctx->CUDACtx()->Stream(),
+                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
+      }
+    } else {
+      CHECK_GE(data_.size(), used_size);
+      // Expand if necessary
+      if (data_.size() < new_used_size) {
+        data_.resize(std::max(data_.size() * 2, new_used_size));
+      }
+      // Append new histograms
+      for (int nidx : new_nidxs) {
+        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
+      }
+    }
+
+    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
+  }
+
+  /**
+   * \summary Return pointer to histogram memory for a given node.
+   * \param nidx Tree node index.
+   * \return hist pointer.
+   */
+  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
+    CHECK(this->HistogramExists(nidx));
+
+    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
+      // Fetch from normal cache
+      auto ptr = data_.data().get() + nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    } else {
+      // Fetch from overflow
+      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    }
+  }
+};
+
 class DeviceHistogramBuilderImpl;
 
 class DeviceHistogramBuilder {
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 7d566c3b40ae..83f84ec1f4a5 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -49,113 +49,6 @@ namespace xgboost::tree {
 DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
 #endif  // !defined(GTEST_TEST)
 
-/**
- * \struct DeviceHistogramStorage
- *
- * \summary Data storage for node histograms on device. Automatically expands.
- *
- * \tparam GradientSumT histogram entry type.
- * \tparam kStopGrowingSize Do not grow beyond this size
- *
- * \author Rory
- * \date 28/07/2018
- */
-template <size_t kStopGrowingSize = 1 << 28>
-class DeviceHistogramStorage {
- private:
-  using GradientSumT = GradientPairInt64;
-  /*! \brief Map nidx to starting index of its histogram. */
-  std::map<int, size_t> nidx_map_;
-  // Large buffer of zeroed memory, caches histograms
-  dh::device_vector<typename GradientSumT::ValueT> data_;
-  // If we run out of storage allocate one histogram at a time
-  // in overflow. Not cached, overwritten when a new histogram
-  // is requested
-  dh::device_vector<typename GradientSumT::ValueT> overflow_;
-  std::map<int, size_t> overflow_nidx_map_;
-  int n_bins_;
-  DeviceOrd device_id_;
-  static constexpr size_t kNumItemsInGradientSum =
-      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
-  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
-
- public:
-  // Start with about 16mb
-  DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(DeviceOrd device_id, int n_bins) {
-    this->n_bins_ = n_bins;
-    this->device_id_ = device_id;
-  }
-
-  void Reset() {
-    auto d_data = data_.data().get();
-    dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
-    nidx_map_.clear();
-    overflow_nidx_map_.clear();
-  }
-  [[nodiscard]] bool HistogramExists(int nidx) const {
-    return nidx_map_.find(nidx) != nidx_map_.cend() ||
-           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
-  }
-  [[nodiscard]] int Bins() const { return n_bins_; }
-  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
-  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
-
-  void AllocateHistograms(const std::vector<int>& new_nidxs) {
-    for (int nidx : new_nidxs) {
-      CHECK(!HistogramExists(nidx));
-    }
-    // Number of items currently used in data
-    const size_t used_size = nidx_map_.size() * HistogramSize();
-    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-    if (used_size >= kStopGrowingSize) {
-      // Use overflow
-      // Delete previous entries
-      overflow_nidx_map_.clear();
-      overflow_.resize(HistogramSize() * new_nidxs.size());
-      // Zero memory
-      auto d_data = overflow_.data().get();
-      dh::LaunchN(overflow_.size(),
-                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
-      }
-    } else {
-      CHECK_GE(data_.size(), used_size);
-      // Expand if necessary
-      if (data_.size() < new_used_size) {
-        data_.resize(std::max(data_.size() * 2, new_used_size));
-      }
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
-      }
-    }
-
-    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
-  }
-
-  /**
-   * \summary Return pointer to histogram memory for a given node.
-   * \param nidx Tree node index.
-   * \return hist pointer.
-   */
-  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
-    CHECK(this->HistogramExists(nidx));
-
-    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
-      // Fetch from normal cache
-      auto ptr = data_.data().get() + nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    } else {
-      // Fetch from overflow
-      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    }
-  }
-};
-
 // Manage memory for a single GPU
 struct GPUHistMakerDevice {
  private:
@@ -258,7 +151,7 @@ struct GPUHistMakerDevice {
 
     // Init histogram
     hist.Init(ctx_->Device(), page->Cuts().TotalBins());
-    hist.Reset();
+    hist.Reset(ctx_);
 
     this->InitFeatureGroupsOnce();
 
@@ -657,7 +550,7 @@ struct GPUHistMakerDevice {
     all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
     // Allocate the histograms
     // Guaranteed contiguous memory
-    hist.AllocateHistograms(all_new);
+    hist.AllocateHistograms(ctx_, all_new);
 
     for (auto nidx : hist_nidx) {
       this->BuildHist(nidx);
@@ -748,7 +641,7 @@ struct GPUHistMakerDevice {
         ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
     collective::SafeColl(rc);
 
-    hist.AllocateHistograms({kRootNIdx});
+    hist.AllocateHistograms(ctx_, {kRootNIdx});
     this->BuildHist(kRootNIdx);
     this->AllReduceHist(kRootNIdx, 1);
 
diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 9b988f9605bd..eebbaf8ef795 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -763,4 +763,6 @@ void DeleteRMMResource(RMMAllocator*) {}
 
 RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
 #endif  // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1
+
+std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
 }  // namespace xgboost
diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h
index b2e9e08cd80c..2821a11380c8 100644
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -526,6 +526,9 @@ inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
 
 inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
 
+// GPU device ordinal for distributed tests
+std::int32_t DistGpuIdx();
+
 inline auto GMockThrow(StringView msg) {
   return ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr(msg));
 }
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 860e4bfd4ea0..c9320f616983 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -14,6 +14,46 @@
 #include "../../helpers.h"
 
 namespace xgboost::tree {
+TEST(Histogram, DeviceHistogramStorage) {
+  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
+  auto ctx = MakeCUDACtx(0);
+  constexpr size_t kNBins = 128;
+  constexpr int kNNodes = 4;
+  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
+  DeviceHistogramStorage<kStopGrowing> histogram;
+  histogram.Init(FstCU(), kNBins);
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  histogram.Reset(&ctx);
+  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
+
+  // Use allocated memory but do not erase nidx_map.
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Add two new nodes
+  histogram.AllocateHistograms(&ctx, {kNNodes});
+  histogram.AllocateHistograms(&ctx, {kNNodes + 1});
+
+  // Old cached nodes should still exist
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Should be deleted
+  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
+  // Most recent node should exist
+  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
+
+  // Add same node again - should fail
+  EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1}););
+}
+
 void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) {
   Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 291b46edea36..728fb62c46d4 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -6,7 +6,6 @@
 #include <thrust/host_vector.h>
 #include <xgboost/base.h>
 
-#include <random>
 #include <string>
 #include <vector>
 
@@ -23,46 +22,6 @@
 #include "xgboost/json.h"
 
 namespace xgboost::tree {
-TEST(GpuHist, DeviceHistogramStorage) {
-  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
-  dh::safe_cuda(cudaSetDevice(0));
-  constexpr size_t kNBins = 128;
-  constexpr int kNNodes = 4;
-  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
-  DeviceHistogramStorage<kStopGrowing> histogram;
-  histogram.Init(FstCU(), kNBins);
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  histogram.Reset();
-  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
-
-  // Use allocated memory but do not erase nidx_map.
-  for (int i = 0; i < kNNodes; ++i) {
-    histogram.AllocateHistograms({i});
-  }
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Add two new nodes
-  histogram.AllocateHistograms({kNNodes});
-  histogram.AllocateHistograms({kNNodes + 1});
-
-  // Old cached nodes should still exist
-  for (int i = 0; i < kNNodes; ++i) {
-    ASSERT_TRUE(histogram.HistogramExists(i));
-  }
-
-  // Should be deleted
-  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
-  // Most recent node should exist
-  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
-
-  // Add same node again - should fail
-  EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1}););
-}
-
 std::vector<GradientPairPrecise> GetHostHistGpair() {
   // 24 bins, 3 bins for each feature (column).
   std::vector<GradientPairPrecise> hist_gpair = {
@@ -108,7 +67,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   maker.row_partitioner = std::make_unique<RowPartitioner>(&ctx, kNRows, 0);
 
   maker.hist.Init(ctx.Device(), page->Cuts().TotalBins());
-  maker.hist.AllocateHistograms({0});
+  maker.hist.AllocateHistograms(&ctx, {0});
 
   maker.gpair = gpair.DeviceSpan();
   maker.quantiser = std::make_unique<GradientQuantiser>(&ctx, maker.gpair, MetaInfo());
@@ -425,8 +384,8 @@ TEST(GpuHist, MaxDepth) {
 namespace {
 RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUHistMaker hist_maker{ctx, &task};
-  hist_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> hist_maker {TreeUpdater::Create("grow_gpu_hist", ctx, &task)};
+  hist_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -436,8 +395,8 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  hist_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                    {&tree});
+  hist_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                     {&tree});
   return tree;
 }
 
@@ -476,8 +435,8 @@ TEST_F(MGPUHistTest, HistColumnSplit) {
 namespace {
 RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
-  GPUGlobalApproxMaker approx_maker{ctx, &task};
-  approx_maker.Configure(Args{});
+  std::unique_ptr<TreeUpdater> approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)};
+  approx_maker->Configure(Args{});
 
   TrainParam param;
   param.UpdateAllowUnknown(Args{});
@@ -487,13 +446,13 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
 
   std::vector<HostDeviceVector<bst_node_t>> position(1);
   RegTree tree;
-  approx_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
-                      {&tree});
+  approx_maker->Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                       {&tree});
   return tree;
 }
 
 void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) {
-  Context ctx(MakeCUDACtx(GPUIDX));
+  auto ctx = MakeCUDACtx(DistGpuIdx());
 
   auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
   auto const world_size = collective::GetWorldSize();