From 5c2575535f313f358c663f3ac710713e73093d28 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 10 Apr 2019 19:21:26 +0800
Subject: [PATCH] Fix Histogram allocation. (#4347)

* Fix Histogram allocation.

`nidx_map_` is cleared by `Reset`, but the histogram data size is not
changed, hence histogram recycling is used in later iterations.  After a
reset (building a new tree), newly allocated nodes start from index 0,
while recycling always chooses the node with the smallest index, which
happens to be our newly allocated node 0.
---
 src/tree/updater_gpu_hist.cu    | 52 +++++++++++++++++++++------------
 tests/cpp/tree/test_gpu_hist.cu | 36 +++++++++++++++++++++++
 2 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 75402ffabd0b..e93ca9a3524c 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -365,18 +365,24 @@ __global__ void EvaluateSplitKernel(
  *
  * \summary Data storage for node histograms on device. Automatically expands.
  *
+ * \tparam GradientSumT      histogram entry type.
+ * \tparam kStopGrowingSize  Do not grow beyond this size
+ *
  * \author  Rory
  * \date    28/07/2018
  */
-template <typename GradientSumT>
+template <typename GradientSumT, size_t kStopGrowingSize = 1 << 26>
 class DeviceHistogram {
  private:
   /*! \brief Map nidx to starting index of its histogram. */
   std::map<int, size_t> nidx_map_;
   thrust::device_vector<typename GradientSumT::ValueT> data_;
-  static constexpr size_t kStopGrowingSize = 1 << 26;  // Do not grow beyond this size
   int n_bins_;
   int device_id_;
+  static constexpr size_t kNumItemsInGradientSum =
+      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
+  static_assert(kNumItemsInGradientSum == 2,
+                "Number of items in gradient type should be 2.");
 
  public:
   void Init(int device_id, int n_bins) {
@@ -390,34 +396,44 @@ class DeviceHistogram {
         data_.size() * sizeof(typename decltype(data_)::value_type)));
     nidx_map_.clear();
   }
-  bool HistogramExists(int nidx) {
-    return nidx_map_.find(nidx) != nidx_map_.end();
+  bool HistogramExists(int nidx) const {
+    return nidx_map_.find(nidx) != nidx_map_.cend();
+  }
+  size_t HistogramSize() const {
+    return n_bins_ * kNumItemsInGradientSum;
   }
-  thrust::device_vector<typename GradientSumT::ValueT> &Data() {
+  thrust::device_vector<typename GradientSumT::ValueT>& Data() {
     return data_;
   }
   void AllocateHistogram(int nidx) {
     if (HistogramExists(nidx)) return;
-    size_t current_size = nidx_map_.size() * n_bins_ *
-                          2;  // Number of items currently used in data
+    // Number of items currently used in data
+    const size_t used_size = nidx_map_.size() * HistogramSize();
+    const size_t new_used_size = used_size + HistogramSize();
     dh::safe_cuda(cudaSetDevice(device_id_));
     if (data_.size() >= kStopGrowingSize) {
       // Recycle histogram memory
-      std::pair<int, size_t> old_entry = *nidx_map_.begin();
-      nidx_map_.erase(old_entry.first);
-      dh::safe_cuda(cudaMemsetAsync(data_.data().get() + old_entry.second, 0,
-                                    n_bins_ * sizeof(GradientSumT)));
-      nidx_map_[nidx] = old_entry.second;
+      if (new_used_size <= data_.size()) {
+        // No need to remove an old node, just insert the new one.
+        nidx_map_[nidx] = used_size;
+        // memset histogram size in bytes
+        dh::safe_cuda(cudaMemsetAsync(data_.data().get() + used_size, 0,
+                                      n_bins_ * sizeof(GradientSumT)));
+      } else {
+        std::pair<int, size_t> old_entry = *nidx_map_.begin();
+        nidx_map_.erase(old_entry.first);
+        dh::safe_cuda(cudaMemsetAsync(data_.data().get() + old_entry.second, 0,
+                                      n_bins_ * sizeof(GradientSumT)));
+        nidx_map_[nidx] = old_entry.second;
+      }
     } else {
       // Append new node histogram
-      nidx_map_[nidx] = current_size;
-      if (data_.size() < current_size + n_bins_ * 2) {
-        size_t new_size = current_size * 2;  // Double in size
-        new_size = std::max(static_cast<size_t>(n_bins_ * 2),
-                            new_size);  // Have at least one histogram
-        data_.resize(new_size);
+      nidx_map_[nidx] = used_size;
+      size_t new_required_memory = std::max(data_.size() * 2, HistogramSize());
+      if (data_.size() < new_required_memory) {
+        data_.resize(new_required_memory);
       }
     }
   }
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index d7e4b2654e79..37cce8b577df 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -20,6 +20,42 @@
 namespace xgboost {
 namespace tree {
 
+TEST(GpuHist, DeviceHistogram) {
+  // Ensures that node allocates correctly after reaching `kStopGrowingSize`.
+  dh::SaveCudaContext{
+    [&]() {
+      dh::safe_cuda(cudaSetDevice(0));
+      constexpr size_t kNBins = 128;
+      constexpr size_t kNNodes = 4;
+      constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
+      DeviceHistogram<GradientPairPrecise, kStopGrowing> histogram;
+      histogram.Init(0, kNBins);
+      for (size_t i = 0; i < kNNodes; ++i) {
+        histogram.AllocateHistogram(i);
+      }
+      histogram.Reset();
+      ASSERT_EQ(histogram.Data().size(), kStopGrowing);
+
+      // Use allocated memory but do not erase nidx_map.
+      for (size_t i = 0; i < kNNodes; ++i) {
+        histogram.AllocateHistogram(i);
+      }
+      for (size_t i = 0; i < kNNodes; ++i) {
+        ASSERT_TRUE(histogram.HistogramExists(i));
+      }
+
+      // Erase existing nidx_map.
+      for (size_t i = kNNodes; i < kNNodes * 2; ++i) {
+        histogram.AllocateHistogram(i);
+      }
+      for (size_t i = 0; i < kNNodes; ++i) {
+        ASSERT_FALSE(histogram.HistogramExists(i));
+      }
+    }
+  };
+}
+
 template <typename GradientSumT>
 void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,
                bst_float sparsity=0) {
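
To make the failure mode concrete, here is a minimal host-side sketch of the
bookkeeping before and after this patch. Everything in it is hypothetical:
MockHistogram, its members, the `fixed` flag, and the tiny sizes stand in for
DeviceHistogram; std::vector replaces thrust::device_vector and the
cudaMemsetAsync zeroing is dropped, so only the offset/eviction logic is
modeled, not the real CUDA code.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

constexpr std::size_t kStopGrowingSize = 8;  // tiny cap so recycling starts early
constexpr std::size_t kHistogramSize = 4;    // items per node histogram (n_bins * 2)

// Hypothetical stand-in for DeviceHistogram.
struct MockHistogram {
  std::map<int, std::size_t> nidx_map;  // node index -> offset into data
  std::vector<double> data;

  // Called between trees: clears the map but keeps the buffer allocated.
  void Reset() { nidx_map.clear(); }

  void Allocate(int nidx, bool fixed) {
    if (nidx_map.count(nidx)) { return; }
    std::size_t used_size = nidx_map.size() * kHistogramSize;
    if (data.size() >= kStopGrowingSize) {  // recycling regime
      bool has_free_slot = used_size + kHistogramSize <= data.size();
      if (nidx_map.empty() || (fixed && has_free_slot)) {
        // Patched behaviour: reuse the space freed by Reset() before evicting
        // anyone.  (The empty-map guard keeps this sketch well defined; the
        // pre-fix code took the eviction path unconditionally.)
        nidx_map[nidx] = used_size;
      } else {
        // Old behaviour: always evict the node with the smallest index,
        // which right after Reset() is the node we just allocated.
        auto victim = *nidx_map.begin();
        nidx_map.erase(victim.first);
        nidx_map[nidx] = victim.second;  // clobbers a live histogram
        std::cout << "  node " << nidx << " evicted node " << victim.first << "\n";
      }
    } else {
      // Growing regime: append, doubling the buffer as needed.
      nidx_map[nidx] = used_size;
      if (data.size() < used_size + kHistogramSize) {
        data.resize(std::max(data.size() * 2, kHistogramSize));
      }
    }
  }
};

int main() {
  for (bool fixed : {false, true}) {
    MockHistogram h;
    h.Allocate(0, fixed);
    h.Allocate(1, fixed);  // buffer reaches kStopGrowingSize here
    h.Reset();             // new tree: map cleared, buffer keeps its size
    std::cout << (fixed ? "patched" : "old") << " logic after Reset():\n";
    h.Allocate(0, fixed);  // root of the new tree
    h.Allocate(1, fixed);  // old logic evicts node 0 despite free space
    std::cout << "  node 0 still mapped: " << h.nidx_map.count(0) << "\n";
  }
  return 0;
}

With the old logic the run reports that node 1 evicts the freshly allocated
node 0 right after Reset(), even though the buffer still has a free slot; the
patched check (new_used_size <= data_.size() in the real code) appends into
that free slot instead and only evicts once the buffer is genuinely full,
which is exactly what the new DeviceHistogram test asserts.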