From 675765433758ad1399aba97d61a0ccc8a207cb7f Mon Sep 17 00:00:00 2001
From: sriramch <33358417+sriramch@users.noreply.github.com>
Date: Tue, 18 Jun 2019 15:50:25 -0700
Subject: [PATCH] Optimizations for quantisation on device (#4572)

* - do not create device vectors for the entire sparse page while computing histograms...
  - while creating the compressed histogram indices, the row vector is created for the entire sparse page batch. this is needless, as we only process chunks at a time based on a slice of the total GPU memory
  - this PR will allocate only as much as required to store the appropriate row indices and the entries

* - do not dereference row_ptrs once the device_vector has been created, to elide host copies of those counts
  - instead, grab the entry counts directly from the SparsePage
---
 src/tree/updater_gpu_hist.cu    | 43 ++++++++++++++-------------------
 tests/cpp/tree/test_gpu_hist.cu | 40 +++++++++++++++++++++---------
 2 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 42587b624d89..f34463d43b6c 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -483,8 +483,6 @@ __global__ void CompressBinEllpackKernel(
     const uint32_t* __restrict__ cut_rows,  // HistCutMatrix::row_ptrs
     size_t base_row,                        // batch_row_begin
     size_t n_rows,
-    // row_ptr_begin: row_offset[base_row], the start position of base_row
-    size_t row_ptr_begin,
     size_t row_stride,
     unsigned int null_gidx_value) {
   size_t irow = threadIdx.x + blockIdx.x * blockDim.x;
@@ -495,7 +493,7 @@ __global__ void CompressBinEllpackKernel(
   int row_length = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
   unsigned int bin = null_gidx_value;
   if (ifeature < row_length) {
-    Entry entry = entries[row_ptrs[irow] - row_ptr_begin + ifeature];
+    Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature];
     int feature = entry.index;
     float fvalue = entry.fvalue;
     // {feature_cuts, ncuts} forms the array of cuts of `feature'.
@@ -697,8 +695,6 @@ struct DeviceShard {
   /*! \brief Sum gradient for each node. */
   std::vector<GradientPair> node_sum_gradients;
   common::Span<GradientPair> node_sum_gradients_d;
-  /*! \brief On-device feature set, only actually used on one of the devices */
-  dh::device_vector<int> feature_set_d;
   dh::device_vector<int64_t>
       left_counts;  // Useful to keep a bunch of zeroed memory for sort position
   /*! The row offset for this shard. */
@@ -1311,14 +1307,6 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
   size_t row_stride = this->ellpack_matrix.row_stride;
 
   const auto &offset_vec = row_batch.offset.ConstHostVector();
-  /*! \brief row offset in SparsePage (the input data). */
-  CHECK_LE(device_row_state.rows_to_process_from_batch, offset_vec.size());
-  dh::device_vector<size_t> row_ptrs(device_row_state.rows_to_process_from_batch+1);
-  thrust::copy(
-    offset_vec.data() + device_row_state.row_offset_in_current_batch,
-    offset_vec.data() + device_row_state.row_offset_in_current_batch +
-      device_row_state.rows_to_process_from_batch + 1,
-    row_ptrs.begin());
 
   int num_symbols = n_bins + 1;
   // bin and compress entries in batches of rows
@@ -1327,7 +1315,6 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
     static_cast<size_t>(device_row_state.rows_to_process_from_batch));
 
   const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
-  dh::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
 
   size_t gpu_nbatches = dh::DivRoundUp(device_row_state.rows_to_process_from_batch,
                                        gpu_batch_nrows);
@@ -1339,35 +1326,41 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
     }
     size_t batch_nrows = batch_row_end - batch_row_begin;
 
+    const auto ent_cnt_begin =
+      offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
+    const auto ent_cnt_end =
+      offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
+
+    /*! \brief row offset in SparsePage (the input data). */
+    dh::device_vector<size_t> row_ptrs(batch_nrows+1);
+    thrust::copy(
+      offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
+      offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
+      row_ptrs.begin());
+
     // number of entries in this batch.
-    size_t n_entries = row_ptrs[batch_row_end] - row_ptrs[batch_row_begin];
+    size_t n_entries = ent_cnt_end - ent_cnt_begin;
+    dh::device_vector<Entry> entries_d(n_entries);
     // copy data entries to device.
     dh::safe_cuda
         (cudaMemcpy
-         (entries_d.data().get(), data_vec.data() + row_ptrs[batch_row_begin],
+         (entries_d.data().get(), data_vec.data() + ent_cnt_begin,
           n_entries * sizeof(Entry), cudaMemcpyDefault));
     const dim3 block3(32, 8, 1);  // 256 threads
-    const dim3 grid3(dh::DivRoundUp(device_row_state.rows_to_process_from_batch, block3.x),
+    const dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x),
                      dh::DivRoundUp(row_stride, block3.y), 1);
     CompressBinEllpackKernel<<<grid3, block3>>>
         (common::CompressedBufferWriter(num_symbols),
         gidx_buffer.data(),
-        row_ptrs.data().get() + batch_row_begin,
+        row_ptrs.data().get(),
         entries_d.data().get(),
         gidx_fvalue_map.data(),
        feature_segments.data(),
        device_row_state.total_rows_processed + batch_row_begin,
        batch_nrows,
-       row_ptrs[batch_row_begin],
        row_stride,
        null_gidx_value);
   }
-
-  // free the memory that is no longer needed
-  row_ptrs.resize(0);
-  row_ptrs.shrink_to_fit();
-  entries_d.resize(0);
-  entries_d.shrink_to_fit();
 }
 
 // An instance of this type is created which keeps track of total number of rows to process,
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index e7c48eaf214a..614f2ff9469b 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -477,7 +477,7 @@ TEST(GpuHist, SortPosition) {
   TestSortPosition({1, 2, 1, 2, 3}, 1, 2);
 }
 
-TEST(GpuHist, TestHistogramIndex) {
+void TestHistogramIndexImpl(int n_gpus) {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory
 
@@ -491,31 +491,47 @@ TEST(GpuHist, TestHistogramIndex) {
       CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true));
 
   std::vector<std::pair<std::string, std::string>> training_params = {
-    {"max_depth", "1"},
+    {"max_depth", "10"},
     {"max_leaves", "0"}
   };
 
-  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, 1));
+  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, n_gpus));
   hist_maker.Init(training_params, &learner_param);
   hist_maker.InitDataOnce(hist_maker_dmat.get());
   hist_maker_ext.Init(training_params, &learner_param);
   hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());
 
+  ASSERT_EQ(hist_maker.shards_.size(), hist_maker_ext.shards_.size());
+
   // Extract the device shards from the histogram makers and from that its compressed
   // histogram index
-  const auto &dev_shard = hist_maker.shards_[0];
-  std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);
+  for (size_t i = 0; i < hist_maker.shards_.size(); ++i) {
+    const auto &dev_shard = hist_maker.shards_[i];
+    std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
+    dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);
+
+    const auto &dev_shard_ext = hist_maker_ext.shards_[i];
+    std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
+    dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);
+
+    ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins);
+    ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size());
 
-  const auto &dev_shard_ext = hist_maker_ext.shards_[0];
-  std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
-  dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);
+    ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
+  }
+}
 
-  ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins);
-  ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size());
+TEST(GpuHist, TestHistogramIndex) {
+  TestHistogramIndexImpl(1);
+}
 
-  ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
+#if defined(XGBOOST_USE_NCCL)
+TEST(GpuHist, MGPU_TestHistogramIndex) {
+  auto devices = GPUSet::AllVisible();
+  CHECK_GT(devices.Size(), 1);
+  TestHistogramIndexImpl(-1);
 }
+#endif
 
 }  // namespace tree
 }  // namespace xgboost
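
The core of the change above is sizing arithmetic: for each GPU batch, the row count and the entry count are read straight from the SparsePage offset vector, and the scratch buffers (row_ptrs, entries_d) are allocated for that batch only rather than for the whole page. Below is a minimal, host-only sketch of that per-batch sizing pattern. It is an illustration under stated assumptions, not the XGBoost implementation: std::vector stands in for dh::device_vector, and Entry, BatchPlan, and PlanBatches are hypothetical names introduced here only for the example.

// Illustrative sketch only: std::vector stands in for dh::device_vector, and
// BatchPlan/PlanBatches are hypothetical helpers, not XGBoost APIs.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct Entry {        // one (feature index, feature value) pair of a CSR page
  int index;
  float fvalue;
};

struct BatchPlan {
  size_t row_begin, row_end;  // rows [row_begin, row_end) in this batch
  size_t ent_begin, ent_end;  // entry range taken from the CSR offsets
};

// Split n_rows rows into batches of at most batch_nrows rows, recording per
// batch how many entries actually need to be staged on the device.
std::vector<BatchPlan> PlanBatches(const std::vector<size_t>& offsets,
                                   size_t n_rows, size_t batch_nrows) {
  std::vector<BatchPlan> plans;
  for (size_t begin = 0; begin < n_rows; begin += batch_nrows) {
    size_t end = std::min(begin + batch_nrows, n_rows);
    plans.push_back({begin, end, offsets[begin], offsets[end]});
  }
  return plans;
}

int main() {
  // CSR offsets for 5 rows: row i owns entries [offsets[i], offsets[i + 1]).
  std::vector<size_t> offsets = {0, 3, 5, 9, 10, 14};
  std::vector<Entry> data(offsets.back(), Entry{0, 0.0f});

  for (const BatchPlan& p : PlanBatches(offsets, /*n_rows=*/5, /*batch_nrows=*/2)) {
    // Per-batch scratch sized to this batch only (the point of the patch),
    // instead of one buffer sized for the entire sparse page.
    std::vector<size_t> row_ptrs(offsets.begin() + p.row_begin,
                                 offsets.begin() + p.row_end + 1);
    std::vector<Entry> entries(data.begin() + p.ent_begin,
                               data.begin() + p.ent_end);
    std::printf("rows [%zu, %zu): staging %zu entries, %zu row offsets\n",
                p.row_begin, p.row_end, entries.size(), row_ptrs.size());
  }
  return 0;
}

With the toy page above, the three batches stage 5, 5, and 4 entries, so each iteration allocates exactly that many Entry slots up front rather than a buffer sized for the worst case across the whole page, which is the effect the patch has on CreateHistIndices.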