Move device histogram storage into histogram.cuh.
Split up the `updater_gpu_hist.cu` file for easier unit testing. Currently, the test file
directly includes `updater_gpu_hist.cu` and accesses the internals of the updater,
which makes modification difficult. Upcoming PRs will revise some of the tests to
avoid such direct access.
trivialfis committed Jul 20, 2024
1 parent 0846ad8 · commit e22e60b
Showing 6 changed files with 171 additions and 167 deletions.
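To make the intent of the split concrete: after this change, a test can include `histogram.cuh` directly instead of pulling in the whole `updater_gpu_hist.cu` translation unit. A minimal sketch, assuming the `MakeCUDACtx` and `FstCU` helpers from tests/cpp/helpers.h; the test name itself is hypothetical and not part of this commit:

#include "../../../../src/tree/gpu_hist/histogram.cuh"  // for DeviceHistogramStorage

// Hypothetical test, for illustration only.
TEST(Histogram, HeaderOnlyAccess) {
  auto ctx = MakeCUDACtx(0);
  DeviceHistogramStorage<> storage;       // default kStopGrowingSize
  storage.Init(FstCU(), /*n_bins=*/32);
  storage.AllocateHistograms(&ctx, {0});  // allocate the root node's histogram
  ASSERT_TRUE(storage.HistogramExists(0));
}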
119 changes: 113 additions & 6 deletions src/tree/gpu_hist/histogram.cuh
@@ -5,12 +5,14 @@
#define HISTOGRAM_CUH_
#include <memory> // for unique_ptr

#include "../../common/cuda_context.cuh" // for CUDAContext
#include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor
#include "feature_groups.cuh" // for FeatureGroupsAccessor
#include "xgboost/base.h" // for GradientPair, GradientPairInt64
#include "xgboost/context.h" // for Context
#include "xgboost/span.h" // for Span
#include "../../common/cuda_context.cuh" // for CUDAContext
#include "../../common/device_helpers.cuh" // for LaunchN
#include "../../common/device_vector.cuh" // for device_vector
#include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor
#include "feature_groups.cuh" // for FeatureGroupsAccessor
#include "xgboost/base.h" // for GradientPair, GradientPairInt64
#include "xgboost/context.h" // for Context
#include "xgboost/span.h" // for Span

namespace xgboost::tree {
/**
@@ -60,6 +62,111 @@ class GradientQuantiser {
}
};

+/**
+ * @brief Data storage for node histograms on device. Automatically expands.
+ *
+ * @tparam kStopGrowingSize Do not grow beyond this size.
+ *
+ * @author Rory
+ * @date 28/07/2018
+ */
+template <size_t kStopGrowingSize = 1 << 28>
+class DeviceHistogramStorage {
+ private:
+  using GradientSumT = GradientPairInt64;
+  /** @brief Map nidx to the starting index of its histogram. */
+  std::map<int, size_t> nidx_map_;
+  // Large buffer of zeroed memory, caches histograms.
+  dh::device_vector<typename GradientSumT::ValueT> data_;
+  // If we run out of storage, allocate one histogram at a time in overflow.
+  // Not cached; overwritten when a new histogram is requested.
+  dh::device_vector<typename GradientSumT::ValueT> overflow_;
+  std::map<int, size_t> overflow_nidx_map_;
+  int n_bins_;
+  DeviceOrd device_id_;
+  static constexpr size_t kNumItemsInGradientSum =
+      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
+  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
+
+ public:
+  // Reserve about 32 MiB up front (1 << 22 int64 entries).
+  DeviceHistogramStorage() { data_.reserve(1 << 22); }
+  void Init(DeviceOrd device_id, int n_bins) {
+    this->n_bins_ = n_bins;
+    this->device_id_ = device_id;
+  }
+
+  void Reset(Context const* ctx) {
+    auto d_data = data_.data().get();
+    dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
+                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
+    nidx_map_.clear();
+    overflow_nidx_map_.clear();
+  }
+  [[nodiscard]] bool HistogramExists(int nidx) const {
+    return nidx_map_.find(nidx) != nidx_map_.cend() ||
+           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
+  }
+  [[nodiscard]] int Bins() const { return n_bins_; }
+  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
+  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
+
+  void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+    for (int nidx : new_nidxs) {
+      CHECK(!HistogramExists(nidx));
+    }
+    // Number of items currently used in data.
+    const size_t used_size = nidx_map_.size() * HistogramSize();
+    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
+    if (used_size >= kStopGrowingSize) {
+      // Use overflow; delete previous entries.
+      overflow_nidx_map_.clear();
+      overflow_.resize(HistogramSize() * new_nidxs.size());
+      // Zero memory.
+      auto d_data = overflow_.data().get();
+      dh::LaunchN(overflow_.size(), ctx->CUDACtx()->Stream(),
+                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
+      // Append new histograms.
+      for (int nidx : new_nidxs) {
+        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
+      }
+    } else {
+      CHECK_GE(data_.size(), used_size);
+      // Expand if necessary.
+      if (data_.size() < new_used_size) {
+        data_.resize(std::max(data_.size() * 2, new_used_size));
+      }
+      // Append new histograms.
+      for (int nidx : new_nidxs) {
+        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
+      }
+    }
+
+    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
+  }
+
+  /**
+   * @brief Return a span over the histogram memory for a given node.
+   * @param nidx Tree node index.
+   * @return Span over the node's histogram.
+   */
+  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
+    CHECK(this->HistogramExists(nidx));
+
+    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
+      // Fetch from the normal cache.
+      auto ptr = data_.data().get() + nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    } else {
+      // Fetch from overflow.
+      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
+      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
+    }
+  }
+};

class DeviceHistogramBuilderImpl;

class DeviceHistogramBuilder {
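For orientation, the storage class above is meant to be used roughly as follows: `Init` once per device, `AllocateHistograms` for each batch of new nodes, `GetNodeHistogram` to read or write bins, and `Reset` between trees. A minimal usage sketch; the node indices and bin count are illustrative, not taken from this commit:

Context ctx = MakeCUDACtx(0);
DeviceHistogramStorage<> hist;
hist.Init(DeviceOrd::CUDA(0), /*n_bins=*/256);
hist.AllocateHistograms(&ctx, {1, 2});      // contiguous storage for both siblings
auto node_hist = hist.GetNodeHistogram(1);  // Span of n_bins GradientPairInt64 entries
hist.Reset(&ctx);                           // zero the cached buffer before the next tree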
113 changes: 3 additions & 110 deletions src/tree/updater_gpu_hist.cu
@@ -49,113 +49,6 @@ namespace xgboost::tree {
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
#endif // !defined(GTEST_TEST)

-/**
- * \struct  DeviceHistogramStorage
- *
- * \summary Data storage for node histograms on device. Automatically expands.
- *
- * \tparam GradientSumT      histogram entry type.
- * \tparam kStopGrowingSize  Do not grow beyond this size
- *
- * \author  Rory
- * \date    28/07/2018
- */
-template <size_t kStopGrowingSize = 1 << 28>
-class DeviceHistogramStorage {
- private:
-  using GradientSumT = GradientPairInt64;
-  /*! \brief Map nidx to starting index of its histogram. */
-  std::map<int, size_t> nidx_map_;
-  // Large buffer of zeroed memory, caches histograms
-  dh::device_vector<typename GradientSumT::ValueT> data_;
-  // If we run out of storage allocate one histogram at a time
-  // in overflow. Not cached, overwritten when a new histogram
-  // is requested
-  dh::device_vector<typename GradientSumT::ValueT> overflow_;
-  std::map<int, size_t> overflow_nidx_map_;
-  int n_bins_;
-  DeviceOrd device_id_;
-  static constexpr size_t kNumItemsInGradientSum =
-      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
-  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
-
- public:
-  // Start with about 16mb
-  DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(DeviceOrd device_id, int n_bins) {
-    this->n_bins_ = n_bins;
-    this->device_id_ = device_id;
-  }
-
-  void Reset() {
-    auto d_data = data_.data().get();
-    dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
-    nidx_map_.clear();
-    overflow_nidx_map_.clear();
-  }
-  [[nodiscard]] bool HistogramExists(int nidx) const {
-    return nidx_map_.find(nidx) != nidx_map_.cend() ||
-           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
-  }
-  [[nodiscard]] int Bins() const { return n_bins_; }
-  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
-  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
-
-  void AllocateHistograms(const std::vector<int>& new_nidxs) {
-    for (int nidx : new_nidxs) {
-      CHECK(!HistogramExists(nidx));
-    }
-    // Number of items currently used in data
-    const size_t used_size = nidx_map_.size() * HistogramSize();
-    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-    if (used_size >= kStopGrowingSize) {
-      // Use overflow
-      // Delete previous entries
-      overflow_nidx_map_.clear();
-      overflow_.resize(HistogramSize() * new_nidxs.size());
-      // Zero memory
-      auto d_data = overflow_.data().get();
-      dh::LaunchN(overflow_.size(),
-                  [=] __device__(size_t idx) { d_data[idx] = 0.0; });
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize();
-      }
-    } else {
-      CHECK_GE(data_.size(), used_size);
-      // Expand if necessary
-      if (data_.size() < new_used_size) {
-        data_.resize(std::max(data_.size() * 2, new_used_size));
-      }
-      // Append new histograms
-      for (int nidx : new_nidxs) {
-        nidx_map_[nidx] = nidx_map_.size() * HistogramSize();
-      }
-    }
-
-    CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize());
-  }
-
-  /**
-   * \summary Return pointer to histogram memory for a given node.
-   * \param nidx    Tree node index.
-   * \return  hist pointer.
-   */
-  common::Span<GradientSumT> GetNodeHistogram(int nidx) {
-    CHECK(this->HistogramExists(nidx));
-
-    if (nidx_map_.find(nidx) != nidx_map_.cend()) {
-      // Fetch from normal cache
-      auto ptr = data_.data().get() + nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    } else {
-      // Fetch from overflow
-      auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx);
-      return {reinterpret_cast<GradientSumT*>(ptr), static_cast<std::size_t>(n_bins_)};
-    }
-  }
-};

// Manage memory for a single GPU
struct GPUHistMakerDevice {
private:
@@ -258,7 +151,7 @@ struct GPUHistMakerDevice {

// Init histogram
hist.Init(ctx_->Device(), page->Cuts().TotalBins());
-    hist.Reset();
+    hist.Reset(ctx_);

this->InitFeatureGroupsOnce();

@@ -657,7 +550,7 @@ struct GPUHistMakerDevice {
all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end());
// Allocate the histograms
// Guaranteed contiguous memory
-    hist.AllocateHistograms(all_new);
+    hist.AllocateHistograms(ctx_, all_new);

for (auto nidx : hist_nidx) {
this->BuildHist(nidx);
@@ -748,7 +641,7 @@ struct GPUHistMakerDevice {
ctx_, info_, linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
collective::SafeColl(rc);

-    hist.AllocateHistograms({kRootNIdx});
+    hist.AllocateHistograms(ctx_, {kRootNIdx});
this->BuildHist(kRootNIdx);
this->AllReduceHist(kRootNIdx, 1);
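The call sites above allocate `hist_nidx` and `subtraction_nidx` together so that each sibling's histogram can be derived by subtraction (parent minus the child that was built) rather than rebuilt from the data. A sketch of that idea in plain host code; the helper name and loop below are hypothetical, not the updater's actual device implementation:

// Hypothetical illustration of the subtraction trick: sibling = parent - built child.
void SubtractionSketch(common::Span<GradientPairInt64 const> parent,
                       common::Span<GradientPairInt64 const> built,
                       common::Span<GradientPairInt64> sibling) {
  for (std::size_t i = 0; i < parent.size(); ++i) {
    sibling[i] = parent[i] - built[i];  // per-bin gradient/hessian pair subtraction
  }
}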

2 changes: 2 additions & 0 deletions tests/cpp/helpers.cc
@@ -763,4 +763,6 @@ void DeleteRMMResource(RMMAllocator*) {}

RMMAllocatorPtr SetUpRMMResourceForCppTests(int, char**) { return {nullptr, DeleteRMMResource}; }
#endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1

+std::int32_t DistGpuIdx() { return common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank(); }
} // namespace xgboost
3 changes: 3 additions & 0 deletions tests/cpp/helpers.h
@@ -526,6 +526,9 @@ inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }

inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }

+// GPU device ordinal for distributed tests
+std::int32_t DistGpuIdx();

inline auto GMockThrow(StringView msg) {
return ::testing::ThrowsMessage<dmlc::Error>(::testing::HasSubstr(msg));
}
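A likely call pattern for the new helper in a distributed GPU test, shown for illustration only (this commit adds the declaration, not this call site):

auto ctx = MakeCUDACtx(DistGpuIdx());  // GPU 0 on a single-GPU machine, otherwise one GPU per worker rank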
40 changes: 40 additions & 0 deletions tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -14,6 +14,46 @@
#include "../../helpers.h"

namespace xgboost::tree {
+TEST(Histogram, DeviceHistogramStorage) {
+  // Ensures that nodes are allocated correctly after reaching `kStopGrowingSize`.
+  auto ctx = MakeCUDACtx(0);
+  constexpr size_t kNBins = 128;
+  constexpr int kNNodes = 4;
+  constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
+  DeviceHistogramStorage<kStopGrowing> histogram;
+  histogram.Init(FstCU(), kNBins);
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  histogram.Reset(&ctx);
+  ASSERT_EQ(histogram.Data().size(), kStopGrowing);
+
+  // Use the allocated memory, but do not erase nidx_map.
+  for (int i = 0; i < kNNodes; ++i) {
+    histogram.AllocateHistograms(&ctx, {i});
+  }
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // Add two new nodes.
+  histogram.AllocateHistograms(&ctx, {kNNodes});
+  histogram.AllocateHistograms(&ctx, {kNNodes + 1});
+
+  // Old cached nodes should still exist.
+  for (int i = 0; i < kNNodes; ++i) {
+    ASSERT_TRUE(histogram.HistogramExists(i));
+  }
+
+  // The first overflow node should have been evicted.
+  ASSERT_FALSE(histogram.HistogramExists(kNNodes));
+  // The most recent overflow node should exist.
+  ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
+
+  // Adding the same node again should fail.
+  EXPECT_ANY_THROW(histogram.AllocateHistograms(&ctx, {kNNodes + 1}));
+}

void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) {
Context ctx = MakeCUDACtx(0);
size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;