From 1b4c5fb8d06cb3446113fddc017c18ac09c7c889 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 4 Oct 2024 00:00:19 +0800 Subject: [PATCH] [EM] Treat ellpack as dense matrix when there's no compression. (#10870) This enables XGBoost to use shared memory to build histograms. In addition, it lowers memory usage and speeds up external memory for data that cannot be compressed using the ellpack format. - From now on, only datasets with at least one missing value for every sample are considered sparse. - We need to distinguish fully dense, mostly dense, and sparse. This is similar to what the CPU implementation currently does. - Some cleanups --- src/common/compressed_iterator.h | 29 +- src/common/hist_util.cuh | 23 - src/common/hist_util.h | 10 +- src/data/ellpack_page.cc | 6 - src/data/ellpack_page.cu | 452 +++++++++++------- src/data/ellpack_page.cuh | 137 +++--- src/data/ellpack_page.h | 3 +- src/data/ellpack_page_raw_format.cu | 4 +- src/data/ellpack_page_source.cu | 4 +- src/data/ellpack_page_source.h | 6 + src/data/extmem_quantile_dmatrix.cu | 2 +- src/data/gradient_index.cu | 65 ++- src/data/iterative_dmatrix.cc | 3 + src/tree/gpu_hist/feature_groups.cu | 35 +- src/tree/gpu_hist/feature_groups.cuh | 123 ++--- src/tree/gpu_hist/gradient_based_sampler.cu | 8 +- src/tree/gpu_hist/histogram.cu | 73 +-- src/tree/updater_gpu_hist.cu | 48 +- tests/ci_build/lint_python.py | 1 + tests/cpp/common/test_compressed_iterator.cc | 24 +- tests/cpp/data/test_ellpack_page.cu | 172 ++++++- .../cpp/data/test_ellpack_page_raw_format.cu | 5 +- .../cpp/data/test_extmem_quantile_dmatrix.cc | 3 +- .../cpp/data/test_extmem_quantile_dmatrix.cu | 7 +- tests/cpp/data/test_extmem_quantile_dmatrix.h | 7 +- tests/cpp/data/test_iterative_dmatrix.cu | 78 ++- tests/cpp/data/test_sparse_page_dmatrix.cu | 10 +- tests/cpp/test_context.cu | 4 +- tests/cpp/tree/gpu_hist/test_histogram.cu | 64 +-- tests/cpp/tree/test_gpu_approx.cu | 65 +++ .../test_gpu_external_memory.py | 31 ++ 31 files changed, 923 insertions(+), 579 deletions(-) create mode 100644 tests/cpp/tree/test_gpu_approx.cu create mode 100644 tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py diff --git a/src/common/compressed_iterator.h b/src/common/compressed_iterator.h index 71d2d520264e..ab5815557f66 100644 --- a/src/common/compressed_iterator.h +++ b/src/common/compressed_iterator.h @@ -1,12 +1,11 @@ /** - * Copyright 2017-2023 by XGBoost Contributors + * Copyright 2017-2024, XGBoost Contributors * \file compressed_iterator.h */ #pragma once #include -#include -#include +#include // for ceil, log2 #include // for size_t #include "common.h" @@ -15,9 +14,7 @@ #include "device_helpers.cuh" #endif // __CUDACC__ -namespace xgboost { -namespace common { - +namespace xgboost::common { using CompressedByteT = unsigned char; namespace detail { @@ -87,13 +84,12 @@ class CompressedBufferWriter { template void WriteSymbol(CompressedByteT *buffer, T symbol, size_t offset) { - const int bits_per_byte = 8; + constexpr std::int32_t kBitsPerByte = 8; for (size_t i = 0; i < symbol_bits_; i++) { - size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / bits_per_byte; + size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / kBitsPerByte; byte_idx += detail::kPadding; - size_t bit_idx = - ((bits_per_byte + i) - ((offset + 1) * symbol_bits_)) % bits_per_byte; + size_t bit_idx = ((kBitsPerByte + i) - ((offset + 1) * symbol_bits_)) % kBitsPerByte; if (detail::CheckBit(symbol, i)) { detail::SetBit(&buffer[byte_idx], bit_idx); @@ -181,16 +177,14 @@ class 
CompressedIterator { typedef value_type reference; // NOLINT private: - const CompressedByteT *buffer_ {nullptr}; - size_t symbol_bits_ {0}; + CompressedByteT const *buffer_{nullptr}; + bst_idx_t const symbol_bits_{0}; size_t offset_ {0}; public: CompressedIterator() = default; - CompressedIterator(const CompressedByteT *buffer, size_t num_symbols) - : buffer_(buffer) { - symbol_bits_ = detail::SymbolBits(num_symbols); - } + CompressedIterator(CompressedByteT const *buffer, bst_idx_t num_symbols) + : buffer_{buffer}, symbol_bits_{detail::SymbolBits(num_symbols)} {} XGBOOST_DEVICE reference operator*() const { const int bits_per_byte = 8; @@ -218,5 +212,4 @@ class CompressedIterator { return *offset; } }; -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 66463ef2fe86..ffdafa29205c 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -24,29 +24,6 @@ #include "xgboost/span.h" // for IterSpan namespace xgboost::common { -namespace cuda { -/** - * copy and paste of the host version, we can't make it a __host__ __device__ function as - * the fn might be a host only or device only callable object, which is not allowed by nvcc. - */ -template -auto __device__ DispatchBinType(BinTypeSize type, Fn&& fn) { - switch (type) { - case kUint8BinsTypeSize: { - return fn(uint8_t{}); - } - case kUint16BinsTypeSize: { - return fn(uint16_t{}); - } - case kUint32BinsTypeSize: { - return fn(uint32_t{}); - } - } - SPAN_CHECK(false); - return fn(uint32_t{}); -} -} // namespace cuda - namespace detail { struct EntryCompareOp { __device__ bool operator()(const Entry& a, const Entry& b) { diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 559093bb5a3f..ab2a64786827 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -84,6 +84,10 @@ class HistogramCuts { [[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const { return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature]; } + [[nodiscard]] bst_feature_t NumFeatures() const { + CHECK_EQ(this->min_vals_.Size(), this->cut_ptrs_.Size() - 1); + return this->min_vals_.Size(); + } std::vector const& Ptrs() const { return cut_ptrs_.ConstHostVector(); } std::vector const& Values() const { return cut_values_.ConstHostVector(); } @@ -101,8 +105,10 @@ class HistogramCuts { has_categorical_ = has_cat; max_cat_ = max_cat; } - - [[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); } + /** + * @brief The total number of histogram bins (excluding min values.) 
+ */ + [[nodiscard]] bst_bin_t TotalBins() const { return this->cut_values_.Size(); } // Return the index of a cut point that is strictly greater than the input // value, or the last available index if none exists diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc index 4d918358d34c..1d99380c4e7d 100644 --- a/src/data/ellpack_page.cc +++ b/src/data/ellpack_page.cc @@ -53,12 +53,6 @@ bst_idx_t EllpackPage::Size() const { "EllpackPage is required"; return 0; } - -[[nodiscard]] bool EllpackPage::IsDense() const { - LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " - "EllpackPage is required"; - return false; -} } // namespace xgboost #endif // XGBOOST_USE_CUDA diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index f0c155701ead..ef7f7975baf0 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -1,30 +1,32 @@ /** * Copyright 2019-2024, XGBoost contributors */ -#include // for proclaim_return_type -#include -#include - -#include // for copy -#include // for move -#include // for vector - -#include "../common/algorithm.cuh" // for InclusiveScan -#include "../common/categorical.h" -#include "../common/cuda_context.cuh" +#include // for lower_bound, upper_bound +#include // for max_element +#include // for make_counting_iterator +#include // for transform_output_iterator + +#include // for copy +#include // for numeric_limits +#include // for move +#include // for vector + +#include "../common/algorithm.cuh" // for InclusiveScan +#include "../common/categorical.h" // for IsCat +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/cuda_rt_utils.h" // for SetDevice #include "../common/hist_util.cuh" // for HistogramCuts #include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc #include "../common/transform_iterator.h" // for MakeIndexTransformIter #include "device_adapter.cuh" // for NoInfInData -#include "ellpack_page.cuh" -#include "ellpack_page.h" -#include "gradient_index.h" -#include "xgboost/data.h" +#include "ellpack_page.cuh" // for EllpackPageImpl +#include "ellpack_page.h" // for EllpackPage +#include "gradient_index.h" // for GHistIndexMatrix +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for DMatrix namespace xgboost { - -EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {} +EllpackPage::EllpackPage() : impl_{new EllpackPageImpl{}} {} EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param) : impl_{new EllpackPageImpl{ctx, dmat, param}} {} @@ -43,10 +45,9 @@ void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id) } [[nodiscard]] bst_idx_t EllpackPage::BaseRowId() const { return this->Impl()->base_rowid; } -[[nodiscard]] bool EllpackPage::IsDense() const { return this->Impl()->IsDense(); } // Bin each input data entry, store the bin indices in compressed form. 
-template +template __global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, // gidx_buffer @@ -57,60 +58,98 @@ __global__ void CompressBinEllpackKernel( common::Span feature_types, size_t base_row, // batch_row_begin size_t n_rows, size_t row_stride, std::uint32_t null_gidx_value) { - size_t irow = threadIdx.x + blockIdx.x * blockDim.x; - int ifeature = threadIdx.y + blockIdx.y * blockDim.y; - if (irow >= n_rows || ifeature >= row_stride) { + auto irow = threadIdx.x + blockIdx.x * blockDim.x; + auto cpr_fidx = threadIdx.y + blockIdx.y * blockDim.y; // compressed fidx + if (irow >= n_rows || cpr_fidx >= row_stride) { return; } - int row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); + auto row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); std::uint32_t bin = null_gidx_value; - if (ifeature < row_length) { - Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature]; - int feature = entry.index; + + // When treating a sparse matrix as dense, we need to write null values in between valid + // values. But we don't know where to write if the feature index is not recorded for a + // missing value. Here we use binary search to ensure `cpr_fidx` is the same as `fidx`. + if (kDenseCompressed && !HasNoMissing) { + auto row_beg = entries + row_ptrs[irow] - row_ptrs[0]; + auto row_end = entries + row_ptrs[irow + 1] - row_ptrs[0]; + auto it = thrust::make_transform_iterator(thrust::make_counting_iterator(0ul), + [=](std::size_t i) { return row_beg[i].index; }); + auto it_end = it + thrust::distance(row_beg, row_end); + auto res_it = thrust::lower_bound(thrust::seq, it, it_end, cpr_fidx); + if (res_it == it_end || cpr_fidx != *res_it) { + wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + cpr_fidx); + return; + } + cpr_fidx = thrust::distance(it, res_it); + SPAN_CHECK(cpr_fidx < row_length); + } + + if (cpr_fidx < row_length) { + // We are using sub-batch of a SparsePage, need to account for the first offset within + // the sub-batch. + // + // The block.y idx is calculated using row_stride, which is the longest row. We can + // use `compressed_fidx` to fully index the sparse page row. + Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + cpr_fidx]; + + bst_feature_t fidx = entry.index; float fvalue = entry.fvalue; - // {feature_cuts, ncuts} forms the array of cuts of `feature'. - const float* feature_cuts = &cuts[cut_ptrs[feature]]; - int ncuts = cut_ptrs[feature + 1] - cut_ptrs[feature]; - bool is_cat = common::IsCat(feature_types, ifeature); + // {feature_cuts, n_cuts} forms the array of cuts of the current `feature'. + float const* feature_cuts = &cuts[cut_ptrs[fidx]]; + auto n_cuts = cut_ptrs[fidx + 1] - cut_ptrs[fidx]; + + bool is_cat = common::IsCat(feature_types, fidx); // Assigning the bin in current entry. // S.t.: fvalue < feature_cuts[bin] + bin = std::numeric_limits::max(); if (is_cat) { auto it = dh::MakeTransformIterator(feature_cuts, [](float v) { return common::AsCat(v); }); - bin = thrust::lower_bound(thrust::seq, it, it + ncuts, common::AsCat(fvalue)) - it; + bin = thrust::lower_bound(thrust::seq, it, it + n_cuts, common::AsCat(fvalue)) - it; } else { - bin = thrust::upper_bound(thrust::seq, feature_cuts, feature_cuts + ncuts, fvalue) - + bin = thrust::upper_bound(thrust::seq, feature_cuts, feature_cuts + n_cuts, fvalue) - feature_cuts; } - if (bin >= ncuts) { - bin = ncuts - 1; + if (bin >= n_cuts) { + bin = n_cuts - 1; } - // Add the number of bins in previous features. 
-    if (!kIsDense) {
-      bin += cut_ptrs[feature];
+    if (!kDenseCompressed) {
+      // Sparse data, use the compressed fidx. Add the number of bins in previous
+      // features since we can't compress it based on the feature-local index.
+      bin += cut_ptrs[fidx];
+    } else {
+      // Write to the actual fidx for dense data.
+      cpr_fidx = fidx;
     }
   }
-  // Write to gidx buffer.
-  wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + ifeature);
+  // Write to the gidx buffer for non-missing values.
+  wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + cpr_fidx);
 }

 namespace {
 // Calculate the number of symbols for the compressed ellpack. Similar to what the CPU
 // implementation does, we compress the dense data by subtracting the bin values with the
-// starting bin of its feature.
-[[nodiscard]] std::size_t CalcNumSymbols(Context const* ctx, bool is_dense,
-                                         std::shared_ptr<common::HistogramCuts> cuts) {
-  // Cut values can be empty when the input data is empty.
-  if (!is_dense || cuts->cut_values_.Empty()) {
-    // Return the total number of symbols (total number of bins plus 1 for not found)
-    return cuts->cut_values_.Size() + 1;
+// starting bin of its feature if it's dense. In addition, we treat the data as dense if
+// there's no compression to be made by using ellpack.
+[[nodiscard]] EllpackPageImpl::Info CalcNumSymbols(
+    Context const* ctx, bst_idx_t row_stride, bool is_dense,
+    std::shared_ptr<common::HistogramCuts> cuts) {
+  // Return the total number of symbols (total number of bins plus 1 for missing).
+  // The null value equals the total number of bins.
+  bst_idx_t n_symbols = cuts->TotalBins() + 1;
+  if (n_symbols == 1) {  // Empty DMatrix
+    return {static_cast<bst_idx_t>(0), n_symbols};
   }
+  bst_idx_t n_features = cuts->NumFeatures();
   cuts->cut_ptrs_.SetDevice(ctx->Device());
   common::Span<std::uint32_t const> dptrs = cuts->cut_ptrs_.ConstDeviceSpan();
-  auto cuctx = ctx->CUDACtx();
   using PtrT = typename decltype(dptrs)::value_type;
+
+  // Calculate the number of required symbols if we treat the data as dense.
+  PtrT n_symbols_dense{0};
+  CUDAContext const* cuctx = ctx->CUDACtx();
   auto it = dh::MakeTransformIterator<PtrT>(
       thrust::make_counting_iterator(1ul),
       [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; });
@@ -119,13 +158,23 @@ namespace {
   dh::CachingDeviceUVector<PtrT> max_element(1);
   auto d_me = max_element.data();
   dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; });
-  PtrT h_me{0};
-  dh::safe_cuda(
-      cudaMemcpyAsync(&h_me, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, cuctx->Stream()));
+  dh::safe_cuda(cudaMemcpyAsync(&n_symbols_dense, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost,
+                                cuctx->Stream()));
   cuctx->Stream().Sync();
-  // No missing, hence no null value, hence no + 1 symbol.
-  // FIXME(jiamingy): When we extend this to use a sparsity threshold, +1 is needed back.
-  return h_me;
+  // Decide the type of the data.
+  CHECK_LE(row_stride, n_features);
+  if (is_dense) {
+    // No missing, hence no null value, hence no + 1 symbol.
+    LOG(INFO) << "Ellpack is dense.";
+    return {n_features, n_symbols_dense};
+  } else if (n_features == row_stride) {
+    // Treat the ellpack as dense if we can save memory.
+ LOG(INFO) << "Ellpack is relatively dense."; + return {n_features, n_symbols_dense + 1}; // +1 for missing value (null in ellpack) + } else { + LOG(INFO) << "Ellpack is sparse."; + return {row_stride, n_symbols}; + } } } // namespace @@ -134,10 +183,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows) : is_dense{is_dense}, - cuts_{std::move(cuts)}, - row_stride{row_stride}, n_rows{n_rows}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + cuts_{std::move(cuts)}, + info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -148,11 +196,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types) - : cuts_{std::move(cuts)}, - is_dense{is_dense}, + : is_dense{is_dense}, n_rows{page.Size()}, - row_stride{row_stride}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + cuts_{std::move(cuts)}, + info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -164,14 +211,13 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const BatchParam& param) : is_dense{p_fmat->IsDense()}, n_rows{p_fmat->Info().num_row_}, - row_stride{GetRowStride(p_fmat)}, // Create the quantile sketches for the dmatrix and initialize HistogramCuts. cuts_{param.hess.empty() ? std::make_shared( common::DeviceSketch(ctx, p_fmat, param.max_bin)) : std::make_shared( common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + info{CalcNumSymbols(ctx, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -179,15 +225,13 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc p_fmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = p_fmat->Info().feature_types.ConstDeviceSpan(); - monitor_.Start("BinningCompression"); CHECK(p_fmat->SingleColBlock()); for (auto const& page : p_fmat->GetBatches()) { this->CreateHistIndices(ctx, page, ft); } - monitor_.Stop("BinningCompression"); } -template +template struct WriteCompressedEllpackFunctor { WriteCompressedEllpackFunctor(common::CompressedByteT* buffer, const common::CompressedBufferWriter& writer, AdapterBatchT batch, @@ -208,22 +252,40 @@ struct WriteCompressedEllpackFunctor { common::Span feature_types; data::IsValidFunctor is_valid; - using Tuple = thrust::tuple; - __device__ size_t operator()(Tuple out) { + // Tuple[0] = The row index of the input, used as a key to define segments + // Tuple[1] = Scanned flags of valid elements for each row + // Tuple[2] = The index in the input data + using Tuple = thrust::tuple; + + template + __device__ void Write(data::COOTuple const& e, bst_idx_t out_position) { + bst_bin_t bin_idx = 0; + if (common::IsCat(feature_types, e.column_idx)) { + bin_idx = accessor.SearchBin(e.value, e.column_idx); + } else { + bin_idx = accessor.SearchBin(e.value, e.column_idx); + } + if constexpr (kIsDenseCompressed) { + bin_idx -= accessor.feature_segments[e.column_idx]; + } + writer.AtomicWriteSymbol(d_buffer, bin_idx, out_position); + } + // Used for dense or as dense data. 
+ __device__ void operator()(bst_idx_t i) { + auto e = batch.GetElement(i); + if (is_valid(e)) { + this->Write(e, i); + } else { + writer.AtomicWriteSymbol(d_buffer, accessor.NullValue(), i); + } + } + // Used for sparse data. + __device__ size_t operator()(Tuple const& out) { auto e = batch.GetElement(thrust::get<2>(out)); if (is_valid(e)) { // -1 because the scan is inclusive size_t output_position = accessor.row_stride * e.row_idx + thrust::get<1>(out) - 1; - uint32_t bin_idx = 0; - if (common::IsCat(feature_types, e.column_idx)) { - bin_idx = accessor.SearchBin(e.value, e.column_idx); - } else { - bin_idx = accessor.SearchBin(e.value, e.column_idx); - } - if (kIsDense) { - bin_idx -= accessor.feature_segments[e.column_idx]; - } - writer.AtomicWriteSymbol(d_buffer, bin_idx, output_position); + this->Write(e, output_position); } return 0; } @@ -244,90 +306,83 @@ struct TupleScanOp { // Here the data is already correctly ordered and simply needs to be compacted // to remove missing data -template +template void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, common::Span feature_types, EllpackPageImpl* dst, float missing) { - // Some witchcraft happens here - // The goal is to copy valid elements out of the input to an ELLPACK matrix - // with a given row stride, using no extra working memory Standard stream - // compaction needs to be modified to do this, so we manually define a - // segmented stream compaction via operators on an inclusive scan. The output - // of this inclusive scan is fed to a custom function which works out the - // correct output position - auto counting = thrust::make_counting_iterator(0llu); data::IsValidFunctor is_valid(missing); bool valid = data::NoInfInData(batch, is_valid); CHECK(valid) << error::InfInData(); - auto key_iter = dh::MakeTransformIterator( - counting, - [=] __device__(size_t idx) { - return batch.GetElement(idx).row_idx; - }); - auto value_iter = dh::MakeTransformIterator( - counting, - [=] __device__(size_t idx) -> size_t { - return is_valid(batch.GetElement(idx)); - }); - - auto key_value_index_iter = - thrust::make_zip_iterator(thrust::make_tuple(key_iter, value_iter, counting)); - - // Tuple[0] = The row index of the input, used as a key to define segments - // Tuple[1] = Scanned flags of valid elements for each row - // Tuple[2] = The index in the input data - using Tuple = thrust::tuple; - - auto device_accessor = dst->GetDeviceAccessor(ctx); + auto cnt = thrust::make_counting_iterator(0llu); auto n_symbols = dst->NumSymbols(); - common::CompressedBufferWriter writer{n_symbols}; auto d_compressed_buffer = dst->gidx_buffer.data(); // We redirect the scan output into this functor to do the actual writing + using Tuple = typename WriteCompressedEllpackFunctor::Tuple; dh::TypedDiscard discard; - WriteCompressedEllpackFunctor functor{ + auto device_accessor = dst->GetDeviceAccessor(ctx); + WriteCompressedEllpackFunctor functor{ d_compressed_buffer, writer, batch, device_accessor, feature_types, is_valid}; - thrust::transform_output_iterator out(discard, functor); + // For dense compressed data, we can simply copy the data with the input position. + if (kIsDenseCompressed) { + CHECK(batch.NumRows() == 0 || batch.NumCols() == dst->info.row_stride); + thrust::for_each_n(ctx->CUDACtx()->CTP(), cnt, dst->Size() * dst->info.row_stride, functor); + return; + } + + // Some witchcraft happens here. 
+ // + // The goal is to copy valid elements out of the input to an ELLPACK matrix with a given + // row stride, using no extra working memory Standard stream compaction needs to be + // modified to do this, so we manually define a segmented stream compaction via + // operators on an inclusive scan. The output of this inclusive scan is fed to a custom + // function which works out the correct output position + auto key_iter = dh::MakeTransformIterator( + cnt, [=] __device__(size_t idx) { return batch.GetElement(idx).row_idx; }); + auto value_iter = dh::MakeTransformIterator( + cnt, [=] __device__(size_t idx) -> size_t { return is_valid(batch.GetElement(idx)); }); + + auto key_value_index_iter = + thrust::make_zip_iterator(thrust::make_tuple(key_iter, value_iter, cnt)); + thrust::transform_output_iterator out(discard, functor); common::InclusiveScan(ctx, key_value_index_iter, out, TupleScanOp{}, batch.Size()); } void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span row_counts) { // Write the null values - auto device_accessor = dst->GetDeviceAccessor(ctx); + auto null = dst->GetDeviceAccessor(ctx).NullValue(); common::CompressedBufferWriter writer(dst->NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.data(); - auto row_stride = dst->row_stride; - dh::LaunchN(row_stride * dst->n_rows, ctx->CUDACtx()->Stream(), [=] __device__(bst_idx_t idx) { - // For some reason this variable got captured as const - auto writer_non_const = writer; + auto row_stride = dst->info.row_stride; + auto n = row_stride * dst->n_rows; + dh::LaunchN(n, ctx->CUDACtx()->Stream(), [=] __device__(bst_idx_t idx) mutable { size_t row_idx = idx / row_stride; size_t row_offset = idx % row_stride; if (row_offset >= row_counts[row_idx]) { - writer_non_const.AtomicWriteSymbol(d_compressed_buffer, device_accessor.NullValue(), idx); + writer.AtomicWriteSymbol(d_compressed_buffer, null, idx); } }); } template EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, - bool is_dense, common::Span row_counts_span, + bool is_dense, common::Span row_counts, common::Span feature_types, size_t row_stride, bst_idx_t n_rows, std::shared_ptr cuts) : EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} { curt::SetDevice(ctx->Ordinal()); - if (this->IsDense()) { + if (this->IsDenseCompressed()) { CopyDataToEllpack(ctx, batch, feature_types, this, missing); } else { CopyDataToEllpack(ctx, batch, feature_types, this, missing); + WriteNullValues(ctx, this, row_counts); } - - WriteNullValues(ctx, this, row_counts_span); } #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ @@ -339,10 +394,14 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch) ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch) +#undef ELLPACK_BATCH_SPECIALIZE + namespace { +template void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, common::Span d_row_ptr, bst_idx_t row_stride, bst_bin_t null, bst_idx_t n_symbols, + common::Span d_cut_ptrs, common::CompressedByteT* d_compressed_buffer) { dh::device_vector data(page.index.begin(), page.index.end()); auto d_data = dh::ToSpan(data); @@ -350,43 +409,59 @@ void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, // GPU employs the same dense compression as CPU, no need to handle page.index.Offset() auto bin_type = page.index.GetBinTypeSize(); common::CompressedBufferWriter writer{n_symbols}; - auto cuctx = ctx->CUDACtx(); - dh::LaunchN(row_stride * page.Size(), 
cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable { - auto ridx = idx / row_stride; - auto ifeature = idx % row_stride; + bool dense_compress = row_stride == page.Features() && !page.IsDense(); + auto n_samples = page.Size(); + auto cnt = thrust::make_counting_iterator(0ul); + auto ptr = reinterpret_cast(d_data.data()); + auto fn = [=] __device__(std::size_t i) mutable { + auto [ridx, fidx] = linalg::UnravelIndex(i, n_samples, row_stride); auto r_begin = d_row_ptr[ridx]; auto r_end = d_row_ptr[ridx + 1]; auto r_size = r_end - r_begin; - if (ifeature >= r_size) { - writer.AtomicWriteSymbol(d_compressed_buffer, null, idx); - return; + bst_bin_t bin_idx; + if (dense_compress) { + auto f_begin = d_cut_ptrs[fidx]; + auto f_end = d_cut_ptrs[fidx + 1]; + // CPU gidx is not compressed, can be used for binary search. + bin_idx = common::BinarySearchBin(r_begin, r_end, ptr, f_begin, f_end); + if (bin_idx == -1) { + bin_idx = null; + } else { + bin_idx -= d_cut_ptrs[fidx]; + } + } else if (fidx >= r_size) { + bin_idx = null; + } else { + bin_idx = ptr[r_begin + fidx]; } - common::cuda::DispatchBinType(bin_type, [&](auto t) { - using T = decltype(t); - auto ptr = reinterpret_cast(d_data.data()); - auto bin_idx = ptr[r_begin + ifeature]; - writer.AtomicWriteSymbol(d_compressed_buffer, bin_idx, idx); - }); - }); + writer.AtomicWriteSymbol(d_compressed_buffer, bin_idx, i); + }; + thrust::for_each_n(cuctx->CTP(), cnt, row_stride * page.Size(), fn); } } // anonymous namespace EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page, common::Span ft) : is_dense{page.IsDense()}, - row_stride{[&] { - auto it = common::MakeIndexTransformIter( - [&](bst_idx_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); - return *std::max_element(it, it + page.Size()); - }()}, base_rowid{page.base_rowid}, n_rows{page.Size()}, - cuts_{std::make_shared(page.cut)}, - n_symbols_{CalcNumSymbols(ctx, page.IsDense(), cuts_)} { + cuts_{[&] { + auto cuts = std::make_shared(page.cut); + cuts->SetDevice(ctx->Device()); + return cuts; + }()}, + info{CalcNumSymbols( + ctx, + [&] { + auto it = common::MakeIndexTransformIter( + [&](bst_idx_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); + return *std::max_element(it, it + page.Size()); + }(), + page.IsDense(), cuts_)} { this->monitor_.Init("ellpack_page"); CHECK(ctx->IsCUDA()); this->InitCompressedData(ctx); @@ -400,8 +475,12 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag auto accessor = this->GetDeviceAccessor(ctx, ft); this->monitor_.Start("CopyGHistToEllpack"); - CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, accessor.NullValue(), this->NumSymbols(), - d_compressed_buffer); + common::DispatchBinType(page.index.GetBinTypeSize(), [&](auto t) { + using T = decltype(t); + CopyGHistToEllpack(ctx, page, d_row_ptr, this->info.row_stride, accessor.NullValue(), + this->NumSymbols(), this->cuts_->cut_ptrs_.ConstDeviceSpan(), + d_compressed_buffer); + }); this->monitor_.Stop("CopyGHistToEllpack"); } @@ -432,13 +511,13 @@ struct CopyPage { // Copy the data from the given EllpackPage to the current page. 
bst_idx_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) { monitor_.Start(__func__); - bst_idx_t num_elements = page->n_rows * page->row_stride; - CHECK_EQ(this->row_stride, page->row_stride); + bst_idx_t num_elements = page->n_rows * page->info.row_stride; + CHECK_EQ(this->info.row_stride, page->info.row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); - CHECK_GE(this->n_rows * this->row_stride, offset + num_elements); + CHECK_GE(this->n_rows * this->info.row_stride, offset + num_elements); if (page == this) { LOG(FATAL) << "Concatenating the same Ellpack."; - return this->n_rows * this->row_stride; + return this->n_rows * this->info.row_stride; } dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset}); monitor_.Stop(__func__); @@ -471,7 +550,7 @@ struct CompactPage { src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()}, row_indexes(row_indexes), base_rowid{src->base_rowid}, - row_stride{src->row_stride} {} + row_stride{src->info.row_stride} {} __device__ void operator()(bst_idx_t row_id) { size_t src_row = base_rowid + row_id; @@ -491,8 +570,8 @@ struct CompactPage { void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page, common::Span row_indexes) { monitor_.Start(__func__); - CHECK_EQ(row_stride, page->row_stride); - CHECK_EQ(NumSymbols(), page->NumSymbols()); + CHECK_EQ(this->info.row_stride, page->info.row_stride); + CHECK_EQ(this->NumSymbols(), page->NumSymbols()); CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); auto cuctx = ctx->CUDACtx(); dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes}); @@ -508,35 +587,37 @@ void EllpackPageImpl::InitCompressedData(Context const* ctx) { monitor_.Start(__func__); auto num_symbols = this->NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. - std::size_t compressed_size_bytes = - common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); + std::size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize( + this->info.row_stride * this->n_rows, num_symbols); auto init = static_cast(0); gidx_buffer = common::MakeFixedVecWithCudaMalloc(ctx, compressed_size_bytes, init); monitor_.Stop(__func__); } // Compress a CSR page into ELLPACK. 
-void EllpackPageImpl::CreateHistIndices(Context const* ctx, - const SparsePage& row_batch, +void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& row_batch, common::Span feature_types) { if (row_batch.Size() == 0) { return; } + + this->monitor_.Start(__func__); auto null_gidx_value = this->GetDeviceAccessor(ctx, feature_types).NullValue(); auto const& offset_vec = row_batch.offset.ConstHostVector(); // bin and compress entries in batches of rows size_t gpu_batch_nrows = - std::min(dh::TotalMemory(ctx->Ordinal()) / (16 * row_stride * sizeof(Entry)), + std::min(dh::TotalMemory(ctx->Ordinal()) / (16 * this->info.row_stride * sizeof(Entry)), static_cast(row_batch.Size())); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); + auto writer = common::CompressedBufferWriter{this->NumSymbols()}; + auto gidx_buffer_data = gidx_buffer.data(); for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) { size_t batch_row_begin = gpu_batch * gpu_batch_nrows; - size_t batch_row_end = - std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size()); + size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size()); size_t batch_nrows = batch_row_end - batch_row_begin; const auto ent_cnt_begin = offset_vec[batch_row_begin]; @@ -569,40 +650,45 @@ void EllpackPageImpl::CreateHistIndices(Context const* ctx, const dim3 block3(32, 8, 1); // 256 threads const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x), - common::DivRoundUp(row_stride, block3.y), 1); + common::DivRoundUp(this->info.row_stride, block3.y), 1); auto device_accessor = this->GetDeviceAccessor(ctx); auto launcher = [&](auto kernel) { dh::LaunchKernel{grid3, block3, 0, ctx->CUDACtx()->Stream()}( // NOLINT - kernel, common::CompressedBufferWriter(this->NumSymbols()), gidx_buffer.data(), - row_ptrs.data(), entries_d.data(), device_accessor.gidx_fvalue_map.data(), - device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows, - row_stride, null_gidx_value); + kernel, writer, gidx_buffer_data, row_ptrs.data(), entries_d.data(), + device_accessor.gidx_fvalue_map.data(), device_accessor.feature_segments, feature_types, + batch_row_begin, batch_nrows, this->info.row_stride, null_gidx_value); }; if (this->IsDense()) { - launcher(CompressBinEllpackKernel); + launcher(CompressBinEllpackKernel); } else { - launcher(CompressBinEllpackKernel); + if (this->IsDenseCompressed()) { + launcher(CompressBinEllpackKernel); + } else { + launcher(CompressBinEllpackKernel); + } } } + this->monitor_.Stop(__func__); } // Return the number of rows contained in this page. [[nodiscard]] bst_idx_t EllpackPageImpl::Size() const { return n_rows; } std::size_t EllpackPageImpl::MemCostBytes() const { - return this->gidx_buffer.size_bytes() + sizeof(this->n_rows) + sizeof(this->is_dense) + - sizeof(this->row_stride) + sizeof(this->base_rowid) + sizeof(this->n_symbols_); + return this->gidx_buffer.size_bytes() + sizeof(this->is_dense) + sizeof(this->n_rows) + + sizeof(this->base_rowid) + sizeof(this->info); } EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( Context const* ctx, common::Span feature_types) const { + auto null = this->IsDense() ? 
this->NumSymbols() : this->NumSymbols() - 1; return {ctx, - cuts_, - is_dense, - row_stride, - base_rowid, - n_rows, - common::CompressedIterator(gidx_buffer.data(), this->NumSymbols()), + this->cuts_, + this->info.row_stride, + this->base_rowid, + this->n_rows, + common::CompressedIterator{gidx_buffer.data(), this->NumSymbols()}, + null, feature_types}; } @@ -615,26 +701,28 @@ EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(), cudaMemcpyDefault, ctx->CUDACtx()->Stream())); Context cpu_ctx; + auto null = this->IsDense() ? this->NumSymbols() : this->NumSymbols() - 1; return {ctx->IsCPU() ? ctx : &cpu_ctx, - cuts_, - is_dense, - row_stride, - base_rowid, - n_rows, - common::CompressedIterator(h_gidx_buffer->data(), this->NumSymbols()), + this->cuts_, + this->info.row_stride, + this->base_rowid, + this->n_rows, + common::CompressedIterator{h_gidx_buffer->data(), this->NumSymbols()}, + null, feature_types}; } [[nodiscard]] bst_idx_t EllpackPageImpl::NumNonMissing( Context const* ctx, common::Span feature_types) const { + if (this->IsDense()) { + return this->n_rows * this->info.row_stride; + } auto d_acc = this->GetDeviceAccessor(ctx, feature_types); using T = typename decltype(d_acc.gidx_iter)::value_type; auto it = thrust::make_transform_iterator( thrust::make_counting_iterator(0ull), - cuda::proclaim_return_type([=] __device__(std::size_t i) { return d_acc.gidx_iter[i]; })); - auto nnz = thrust::count_if(ctx->CUDACtx()->CTP(), it, it + d_acc.row_stride * d_acc.n_rows, - cuda::proclaim_return_type( - [=] __device__(T gidx) { return gidx != d_acc.NullValue(); })); - return nnz; + [=] XGBOOST_DEVICE(std::size_t i) { return d_acc.gidx_iter[i]; }); + return thrust::count_if(ctx->CUDACtx()->CTP(), it, it + d_acc.row_stride * d_acc.n_rows, + [=] XGBOOST_DEVICE(T gidx) -> bool { return gidx != d_acc.NullValue(); }); } } // namespace xgboost diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 78641c5ac9c7..85e412f86398 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -24,7 +24,7 @@ namespace xgboost { */ struct EllpackDeviceAccessor { /** @brief Whether or not if the matrix is dense. */ - bool is_dense; + bst_idx_t null_value; /** @brief Row length for ELLPACK, equal to number of features when the data is dense. */ bst_idx_t row_stride; /** @brief Starting index of the rows. Used for external memory. */ @@ -36,7 +36,7 @@ struct EllpackDeviceAccessor { /** @brief Minimum value for each feature. Size equals to number of features. */ common::Span min_fvalue; /** @brief Histogram cut pointers. Size equals to (number of features + 1). */ - common::Span feature_segments; + std::uint32_t const* feature_segments; /** @brief Histogram cut values. Size equals to (bins per feature * number of features). */ common::Span gidx_fvalue_map; /** @brief Type of each feature, categorical or numerical. 
*/ @@ -44,10 +44,10 @@ struct EllpackDeviceAccessor { EllpackDeviceAccessor() = delete; EllpackDeviceAccessor(Context const* ctx, std::shared_ptr cuts, - bool is_dense, bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows, - common::CompressedIterator gidx_iter, + bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows, + common::CompressedIterator gidx_iter, bst_idx_t null_value, common::Span feature_types) - : is_dense{is_dense}, + : null_value{null_value}, row_stride{row_stride}, base_rowid{base_rowid}, n_rows{n_rows}, @@ -58,62 +58,72 @@ struct EllpackDeviceAccessor { cuts->cut_ptrs_.SetDevice(ctx->Device()); cuts->min_vals_.SetDevice(ctx->Device()); gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan(); - feature_segments = cuts->cut_ptrs_.ConstDeviceSpan(); + feature_segments = cuts->cut_ptrs_.ConstDevicePointer(); min_fvalue = cuts->min_vals_.ConstDeviceSpan(); } else { gidx_fvalue_map = cuts->cut_values_.ConstHostSpan(); - feature_segments = cuts->cut_ptrs_.ConstHostSpan(); + feature_segments = cuts->cut_ptrs_.ConstHostPointer(); min_fvalue = cuts->min_vals_.ConstHostSpan(); } } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE bool IsDenseCompressed() const { + return this->row_stride == this->NumFeatures(); + } /** - * @brief Given a row index and a feature index, returns the corresponding cut value. + * @brief Given a row index and a feature index, returns the corresponding bin index. * - * Uses binary search for look up. Returns NaN if missing. + * Uses binary search for look up. * * @tparam global_ridx Whether the row index is global to all ellpack batches or it's * local to the current batch. + * + * @return -1 if it's a missing value. */ template - [[nodiscard]] __device__ bst_bin_t GetBinIndex(bst_idx_t ridx, size_t fidx) const { + [[nodiscard]] __device__ bst_bin_t GetBinIndex(bst_idx_t ridx, std::size_t fidx) const { if (global_ridx) { ridx -= base_rowid; } auto row_begin = row_stride * ridx; - auto row_end = row_begin + row_stride; - bst_bin_t gidx = -1; - if (is_dense) { - gidx = gidx_iter[row_begin + fidx]; - gidx += this->feature_segments[fidx]; - } else { - gidx = common::BinarySearchBin(row_begin, row_end, gidx_iter, feature_segments[fidx], - feature_segments[fidx + 1]); + if (!this->IsDenseCompressed()) { + // binary search returns -1 if it's missing + auto row_end = row_begin + row_stride; + bst_bin_t gidx = common::BinarySearchBin(row_begin, row_end, gidx_iter, + feature_segments[fidx], feature_segments[fidx + 1]); + return gidx; + } + bst_bin_t gidx = gidx_iter[row_begin + fidx]; + if (gidx == this->NullValue()) { + // Missing value in a dense ellpack + return -1; } + // Dense ellpack + gidx += this->feature_segments[fidx]; return gidx; } - + /** + * @brief Find a bin to place the value in. Used during construction of the Ellpack. 
+   */
   template <bool is_cat>
-  [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
-    auto beg = feature_segments[column_id];
-    auto end = feature_segments[column_id + 1];
-    uint32_t idx = 0;
+  [[nodiscard]] __device__ bst_bin_t SearchBin(float value, std::size_t fidx) const {
+    auto beg = feature_segments[fidx];
+    auto end = feature_segments[fidx + 1];
+    bst_bin_t gidx = 0;
     if (is_cat) {
-      auto it = dh::MakeTransformIterator<bst_cat_t>(
-          gidx_fvalue_map.cbegin(), [](float v) { return common::AsCat(v); });
-      idx = thrust::lower_bound(thrust::seq, it + beg, it + end,
-                                common::AsCat(value)) -
-            it;
+      auto it = dh::MakeTransformIterator<bst_cat_t>(gidx_fvalue_map.cbegin(),
+                                                     [](float v) { return common::AsCat(v); });
+      gidx = thrust::lower_bound(thrust::seq, it + beg, it + end, common::AsCat(value)) - it;
     } else {
       auto it = thrust::upper_bound(thrust::seq, gidx_fvalue_map.cbegin() + beg,
                                     gidx_fvalue_map.cbegin() + end, value);
-      idx = it - gidx_fvalue_map.cbegin();
+      gidx = it - gidx_fvalue_map.cbegin();
     }
-    if (idx == end) {
-      idx -= 1;
+    if (gidx == end) {
+      gidx -= 1;
     }
-    return idx;
+    return gidx;
   }

   [[nodiscard]] __device__ float GetFvalue(bst_idx_t ridx, size_t fidx) const {
@@ -123,22 +133,21 @@ struct EllpackDeviceAccessor {
     }
     return gidx_fvalue_map[gidx];
   }
-
-  // Check if the row id is withing range of the current batch.
-  [[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
-    return row_id >= base_rowid && row_id < base_rowid + n_rows;
-  }
-
-  [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return this->NumBins(); }
-
-  [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
-
-  [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
+  [[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NullValue() const { return this->null_value; }
+  [[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NumBins() const { return gidx_fvalue_map.size(); }
+  [[nodiscard]] XGBOOST_HOST_DEV_INLINE size_t NumFeatures() const { return min_fvalue.size(); }
 };

 class GHistIndexMatrix;

+/**
+ * @brief This is either an Ellpack format matrix or a dense matrix.
+ *
+ * When no compression can be made by using ellpack, we use this structure as a simple
+ * dense matrix. For a dense matrix, we can provide extra compression by counting
+ * histogram bins per feature instead of over the entire dataset.
+ */
 class EllpackPageImpl {
  public:
   /**
@@ -150,7 +159,7 @@ class EllpackPageImpl {
   EllpackPageImpl() = default;

   /**
-   * @brief Constructor from an existing EllpackInfo.
+   * @brief Constructor from existing ellpack matrices.
    *
    * This is used in the sampling case. The ELLPACK page is constructed from an existing
    * Ellpack page and the given number of rows.
@@ -223,8 +232,17 @@ class EllpackPageImpl {
   [[nodiscard]] common::HistogramCuts const& Cuts() const { return *cuts_; }
   [[nodiscard]] std::shared_ptr<common::HistogramCuts> CutsShared() const { return cuts_; }
   void SetCuts(std::shared_ptr<common::HistogramCuts> cuts);
+  /**
+   * @brief Fully dense, there's not a single missing value.
+   */
+  [[nodiscard]] bool IsDense() const { return this->is_dense; }
+  /**
+   * @brief Stored as a dense matrix, but there might be missing values.
+   */
+  [[nodiscard]] bool IsDenseCompressed() const {
+    return this->cuts_->NumFeatures() == this->info.row_stride;
+  }

-  [[nodiscard]] bool IsDense() const { return is_dense; }
   /** @return Estimation of memory cost of this page.
   */
   std::size_t MemCostBytes() const;
@@ -232,9 +250,8 @@ class EllpackPageImpl {
    * @brief Return the total number of symbols (total number of bins plus 1 for not
    * found).
    */
-  [[nodiscard]] std::size_t NumSymbols() const { return this->n_symbols_; }
-  void SetNumSymbols(bst_idx_t n_symbols) { this->n_symbols_ = n_symbols; }
-
+  [[nodiscard]] auto NumSymbols() const { return this->info.n_symbols; }
+  void SetNumSymbols(bst_idx_t n_symbols) { this->info.n_symbols = n_symbols; }
   /**
    * @brief Get an accessor that can be passed into CUDA kernels.
    */
@@ -265,11 +282,11 @@ class EllpackPageImpl {
    */
   void InitCompressedData(Context const* ctx);

+  std::shared_ptr<common::HistogramCuts> cuts_;
+
  public:
-  /** @brief Whether or not if the matrix is dense. */
-  bool is_dense;
-  /** @brief Row length for ELLPACK. */
-  bst_idx_t row_stride;
+  bool is_dense{false};
+
   bst_idx_t base_rowid{0};
   bst_idx_t n_rows{0};
   /**
@@ -278,22 +295,28 @@ class EllpackPageImpl {
    * This can be backed by various storage types.
    */
   common::RefResourceView<common::CompressedByteT> gidx_buffer;
+  /**
+   * @brief Compression information.
+   */
+  struct Info {
+    /** @brief Row length for ELLPACK. */
+    bst_idx_t row_stride{0};
+    /** @brief The number of unique bins including missing. */
+    bst_idx_t n_symbols{0};
+  } info;

  private:
-  std::shared_ptr<common::HistogramCuts> cuts_;
-  bst_idx_t n_symbols_{0};
   common::Monitor monitor_;
 };

-inline size_t GetRowStride(DMatrix* dmat) {
+[[nodiscard]] inline bst_idx_t GetRowStride(DMatrix* dmat) {
   if (dmat->IsDense()) return dmat->Info().num_col_;

   size_t row_stride = 0;
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
     const auto& row_offset = batch.offset.ConstHostVector();
     for (auto i = 1ull; i < row_offset.size(); i++) {
-      row_stride = std::max(
-          row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
+      row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
     }
   }
   return row_stride;
diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h
index fa312f6e7887..8e54855049a2 100644
--- a/src/data/ellpack_page.h
+++ b/src/data/ellpack_page.h
@@ -17,6 +17,8 @@ class EllpackPageImpl;
  *
  * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
  * including CUDA-specific implementation details in the header.
+ *
+ * See @ref EllpackPageImpl .
  */
 class EllpackPage {
  public:
@@ -42,7 +44,6 @@ class EllpackPage {
   /*! \return Number of instances in the page. */
   [[nodiscard]] bst_idx_t Size() const;

-  [[nodiscard]] bool IsDense() const;
   /*! \brief Set the base row id for this page.
*/ void SetBaseRowId(std::size_t row_id); diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 839966b08151..262b9c8d3796 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -57,7 +57,7 @@ template RET_IF_NOT(fi->Read(&impl->n_rows)); RET_IF_NOT(fi->Read(&impl->is_dense)); - RET_IF_NOT(fi->Read(&impl->row_stride)); + RET_IF_NOT(fi->Read(&impl->info.row_stride)); if (this->param_.prefetch_copy || !has_hmm_ats_) { RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer)); @@ -83,7 +83,7 @@ template auto* impl = page.Impl(); bytes += fo->Write(impl->n_rows); bytes += fo->Write(impl->is_dense); - bytes += fo->Write(impl->row_stride); + bytes += fo->Write(impl->info.row_stride); std::vector h_gidx_buffer; Context ctx = Context{}.MakeCUDA(curt::CurrentDevice()); [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer); diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 588ddccec32b..09fc0da68847 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -80,7 +80,7 @@ class EllpackHostCacheStreamImpl { common::MakeFixedVecWithPinnedMalloc(impl->gidx_buffer.size()); new_impl->n_rows = impl->Size(); new_impl->is_dense = impl->IsDense(); - new_impl->row_stride = impl->row_stride; + new_impl->info.row_stride = impl->info.row_stride; new_impl->base_rowid = impl->base_rowid; new_impl->SetNumSymbols(impl->NumSymbols()); @@ -108,7 +108,7 @@ class EllpackHostCacheStreamImpl { impl->n_rows = page->Size(); impl->is_dense = page->IsDense(); - impl->row_stride = page->row_stride; + impl->info.row_stride = page->info.row_stride; impl->base_rowid = page->base_rowid; impl->SetNumSymbols(page->NumSymbols()); } diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 40f29b6b93b2..2d5d5b8f1ce8 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -80,6 +80,12 @@ class EllpackFormatPolicy { if (!GlobalConfigThreadLocalStore::Get()->use_rmm) { LOG(WARNING) << "`use_rmm` is set to false." << msg; } + std::int32_t major{0}, minor{0}; + curt::DrVersion(&major, &minor); + if (!(major >= 12 && minor >= 7) && curt::SupportsAts()) { + // Use ATS, but with an old kernel driver. + LOG(WARNING) << "Using an old kernel driver with supported CTK<12.7." << msg; + } } // For testing with the HMM flag. 
explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {} diff --git a/src/data/extmem_quantile_dmatrix.cu b/src/data/extmem_quantile_dmatrix.cu index 3fb1557e9993..119f9298c501 100644 --- a/src/data/extmem_quantile_dmatrix.cu +++ b/src/data/extmem_quantile_dmatrix.cu @@ -56,7 +56,7 @@ void ExtMemQuantileDMatrix::InitFromCUDA( for (auto const &page : this->GetEllpackPageImpl()) { n_total_samples += page.Size(); CHECK_EQ(page.Impl()->base_rowid, ext_info.base_rows[k]); - CHECK_EQ(page.Impl()->row_stride, ext_info.row_stride); + CHECK_EQ(page.Impl()->info.row_stride, ext_info.row_stride); ++k, ++batch_cnt; } CHECK_EQ(batch_cnt, ext_info.n_batches); diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu index ebdc99051924..5e15ff5f0fa2 100644 --- a/src/data/gradient_index.cu +++ b/src/data/gradient_index.cu @@ -1,7 +1,9 @@ /** - * Copyright 2022-2023, XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ -#include // std::unique_ptr +#include // for size_t +#include // for unique_ptr +#include // for vector #include "../common/column_matrix.h" #include "../common/hist_util.h" // Index @@ -20,23 +22,34 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page, auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); - common::Span index_data_span = {out->index.data(), out->index.Size()}; + auto index_data_span = common::Span{out->index.data(), out->index.Size()}; auto n_bins_total = page->Cuts().TotalBins(); auto& hit_count_tloc = *p_hit_count_tloc; hit_count_tloc.clear(); hit_count_tloc.resize(ctx->Threads() * n_bins_total, 0); - - common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { + bool dense_compressed = page->IsDenseCompressed() && !page->IsDense(); + common::ParallelFor(page->Size(), ctx->Threads(), [&](auto ridx) { auto tid = omp_get_thread_num(); - size_t in_rbegin = page->row_stride * i; - size_t out_rbegin = out->row_ptr[i]; - auto r_size = out->row_ptr[i + 1] - out->row_ptr[i]; - for (size_t j = 0; j < r_size; ++j) { - auto bin_idx = accessor.gidx_iter[in_rbegin + j]; - assert(bin_idx != kNull); - index_data_span[out_rbegin + j] = bin_idx; - ++hit_count_tloc[tid * n_bins_total + get_offset(bin_idx, j)]; + size_t in_rbegin = page->info.row_stride * ridx; + size_t out_rbegin = out->row_ptr[ridx]; + if (dense_compressed) { + for (std::size_t j = 0, k = 0; j < page->info.row_stride; ++j) { + bst_bin_t bin_idx = accessor.gidx_iter[in_rbegin + j]; + if (XGBOOST_EXPECT((bin_idx != kNull), true)) { // relatively dense + bin_idx = get_offset(bin_idx, j); + index_data_span[out_rbegin + k++] = bin_idx; + ++hit_count_tloc[tid * n_bins_total + bin_idx]; + } + } + } else { + auto r_size = out->row_ptr[ridx + 1] - out->row_ptr[ridx]; + for (size_t j = 0; j < r_size; ++j) { + bst_bin_t bin_idx = accessor.gidx_iter[in_rbegin + j]; + assert(bin_idx != kNull); + index_data_span[out_rbegin + j] = bin_idx; + ++hit_count_tloc[tid * n_bins_total + get_offset(bin_idx, j)]; + } } }); } @@ -45,16 +58,16 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, common::RefResourceView* p_out) { auto& row_ptr = *p_out; row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0}); - if (page->is_dense) { - std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride); + if (page->IsDense()) { + std::fill(row_ptr.begin() + 1, row_ptr.end(), page->info.row_stride); } else { std::vector h_gidx_buffer; auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); 
auto const kNull = static_cast(accessor.NullValue()); common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { - size_t ibegin = page->row_stride * i; - for (size_t j = 0; j < page->row_stride; ++j) { + size_t ibegin = page->info.row_stride * i; + for (size_t j = 0; j < page->info.row_stride; ++j) { bst_bin_t bin_idx = accessor.gidx_iter[ibegin + j]; if (bin_idx != kNull) { row_ptr[i + 1]++; @@ -67,27 +80,27 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, EllpackPage const& in_page, BatchParam const& p) - : max_numeric_bins_per_feat{p.max_bin} { + : cut{in_page.Cuts()}, + max_numeric_bins_per_feat{p.max_bin}, + isDense_{in_page.Impl()->IsDense()}, + base_rowid{in_page.BaseRowId()} { auto page = in_page.Impl(); - isDense_ = page->is_dense; - CHECK_EQ(info.num_row_, in_page.Size()); - this->cut = page->Cuts(); // pull to host early, prevent race condition this->cut.Ptrs(); this->cut.Values(); this->cut.MinValues(); - this->ResizeIndex(info.num_nonzero_, page->is_dense); - if (page->is_dense) { + this->ResizeIndex(info.num_nonzero_, page->IsDense()); + if (page->IsDense()) { this->index.SetBinOffset(page->Cuts().Ptrs()); } + auto offset = page->Cuts().cut_ptrs_.ConstHostSpan(); auto n_bins_total = page->Cuts().TotalBins(); GetRowPtrFromEllpack(ctx, page, &this->row_ptr); - if (page->IsDense()) { - auto offset = index.Offset(); + if (page->IsDenseCompressed()) { common::DispatchBinType(this->index.GetBinTypeSize(), [&](auto dtype) { using T = decltype(dtype); ::xgboost::SetIndexData( @@ -104,7 +117,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, this->GatherHitCount(ctx->Threads(), n_bins_total); // sanity checks - CHECK_EQ(this->Features(), info.num_col_); + CHECK_EQ(this->Features(), in_page.Cuts().NumFeatures()); CHECK_EQ(this->Size(), info.num_row_); CHECK(this->cut.cut_ptrs_.HostCanRead()); CHECK(this->cut.cut_values_.HostCanRead()); diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 29d38976361b..c9830fa093e8 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -49,6 +49,9 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro this->fmat_ctx_ = ctx; this->batch_ = p; + + LOG(INFO) << "Finished constructing the `IterativeDMatrix`: (" << this->Info().num_row_ << ", " + << this->Info().num_col_ << ", " << this->Info().num_nonzero_ << ")."; } void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, diff --git a/src/tree/gpu_hist/feature_groups.cu b/src/tree/gpu_hist/feature_groups.cu index c6c6619852ca..0a1272a10aff 100644 --- a/src/tree/gpu_hist/feature_groups.cu +++ b/src/tree/gpu_hist/feature_groups.cu @@ -2,17 +2,17 @@ * Copyright 2020-2024, XGBoost Contributors */ -#include -#include -#include +#include // for max +#include // for size_t +#include // for uint32_t +#include // for vector +#include "../../common/hist_util.h" // for HistogramCuts #include "feature_groups.cuh" -#include "../../common/hist_util.h" - namespace xgboost::tree { -FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, - size_t shm_size, size_t bin_size) { +FeatureGroups::FeatureGroups(common::HistogramCuts const& cuts, bool is_dense, size_t shm_size) + : max_group_bins{0} { // Only use a single feature group for sparse matrices. 
bool single_group = !is_dense; if (single_group) { @@ -20,14 +20,14 @@ FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, return; } - std::vector& feature_segments_h = feature_segments.HostVector(); - std::vector& bin_segments_h = bin_segments.HostVector(); + auto& feature_segments_h = feature_segments.HostVector(); + auto& bin_segments_h = bin_segments.HostVector(); feature_segments_h.push_back(0); bin_segments_h.push_back(0); - const std::vector& cut_ptrs = cuts.Ptrs(); - size_t max_shmem_bins = shm_size / bin_size; - max_group_bins = 0; + std::vector const& cut_ptrs = cuts.Ptrs(); + // Maximum number of bins that can be placed into shared memory. + std::size_t max_shmem_bins = shm_size / sizeof(GradientPairInt64); for (size_t i = 2; i < cut_ptrs.size(); ++i) { int last_start = bin_segments_h.back(); @@ -41,17 +41,16 @@ FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, } feature_segments_h.push_back(cut_ptrs.size() - 1); bin_segments_h.push_back(cut_ptrs.back()); - max_group_bins = std::max(max_group_bins, - bin_segments_h.back() - - bin_segments_h[bin_segments_h.size() - 2]); + max_group_bins = + std::max(max_group_bins, bin_segments_h.back() - bin_segments_h[bin_segments_h.size() - 2]); } -void FeatureGroups::InitSingle(const common::HistogramCuts& cuts) { - std::vector& feature_segments_h = feature_segments.HostVector(); +void FeatureGroups::InitSingle(common::HistogramCuts const& cuts) { + auto& feature_segments_h = feature_segments.HostVector(); feature_segments_h.push_back(0); feature_segments_h.push_back(cuts.Ptrs().size() - 1); - std::vector& bin_segments_h = bin_segments.HostVector(); + auto& bin_segments_h = bin_segments.HostVector(); bin_segments_h.push_back(0); bin_segments_h.push_back(cuts.TotalBins()); diff --git a/src/tree/gpu_hist/feature_groups.cuh b/src/tree/gpu_hist/feature_groups.cuh index 82df69796ebd..37d87a9f577a 100644 --- a/src/tree/gpu_hist/feature_groups.cuh +++ b/src/tree/gpu_hist/feature_groups.cuh @@ -16,92 +16,97 @@ class HistogramCuts; namespace tree { -/** \brief FeatureGroup is a feature group. It is defined by a range of - consecutive feature indices, and also contains a range of all bin indices - associated with those features. */ +/** + * @brief FeatureGroup is a single group of features. + * + * It is defined by a range of consecutive feature indices, and also contains a range of + * all bin indices associated with those features. + */ struct FeatureGroup { - __host__ __device__ FeatureGroup(int start_feature_, int num_features_, - int start_bin_, int num_bins_) : - start_feature(start_feature_), num_features(num_features_), - start_bin(start_bin_), num_bins(num_bins_) {} + XGBOOST_DEVICE FeatureGroup(bst_feature_t start_feature, bst_feature_t n_features, + bst_bin_t start_bin, bst_bin_t num_bins) + : start_feature{start_feature}, + num_features{n_features}, + start_bin{start_bin}, + num_bins{num_bins} {} /** The first feature of the group. */ - int start_feature; + bst_feature_t start_feature; /** The number of features in the group. */ - int num_features; + bst_feature_t num_features; /** The first bin in the group. */ bst_bin_t start_bin; /** The number of bins in the group. */ bst_bin_t num_bins; }; -/** \brief FeatureGroupsAccessor is a non-owning accessor for FeatureGroups. */ +/** @brief FeatureGroupsAccessor is a non-owning accessor for FeatureGroups. 
*/ struct FeatureGroupsAccessor { - FeatureGroupsAccessor(common::Span feature_segments_, - common::Span bin_segments_, int max_group_bins_) - : feature_segments(feature_segments_), - bin_segments(bin_segments_), - max_group_bins(max_group_bins_) {} + FeatureGroupsAccessor(common::Span feature_segments, + common::Span bin_segments, bst_bin_t max_group_bins) + : feature_segments{feature_segments}, + bin_segments{bin_segments}, + max_group_bins{max_group_bins} {} - common::Span feature_segments; + common::Span feature_segments; common::Span bin_segments; - int max_group_bins; + bst_bin_t max_group_bins; - /** \brief Gets the number of feature groups. */ - __host__ __device__ int NumGroups() const { - return feature_segments.size() - 1; - } + /** @brief Gets the number of feature groups. */ + XGBOOST_DEVICE int NumGroups() const { return feature_segments.size() - 1; } - /** \brief Gets the information about a feature group with index i. */ - __host__ __device__ FeatureGroup operator[](int i) const { - return {feature_segments[i], feature_segments[i + 1] - feature_segments[i], - bin_segments[i], bin_segments[i + 1] - bin_segments[i]}; + /** @brief Gets the information about a feature group with index i. */ + XGBOOST_DEVICE FeatureGroup operator[](bst_feature_t i) const { + return {feature_segments[i], feature_segments[i + 1] - feature_segments[i], bin_segments[i], + bin_segments[i + 1] - bin_segments[i]}; } }; -/** \brief FeatureGroups contains information that defines a split of features - into groups. Bins of a single feature group typically fit into shared - memory, so the histogram for the features of a single group can be computed - faster. - - \notes Known limitations: - - - splitting features into groups currently works only for dense matrices, - where it is easy to get a feature value in a row by its index; for sparse - matrices, the structure contains only a single group containing all - features; - - - if a single feature requires more bins than fit into shared memory, the - histogram is computed in global memory even if there are multiple feature - groups; note that this is unlikely to occur in practice, as the default - number of bins per feature is 256, whereas a thread block with 48 KiB - shared memory can contain 3072 bins if each gradient sum component is a - 64-bit floating-point value (double) +/** + * @brief FeatureGroups contains information that defines a split of features + * into groups. Bins of a single feature group typically fit into shared + * memory, so the histogram for the features of a single group can be computed + * faster. + * + * @note Known limitations: + * + * - splitting features into groups currently works only for dense matrices, + * where it is easy to get a feature value in a row by its index; for sparse + * matrices, the structure contains only a single group containing all + * features; + * + * - if a single feature requires more bins than fit into shared memory, the + * histogram is computed in global memory even if there are multiple feature + * groups; note that this is unlikely to occur in practice, as the default + * number of bins per feature is 256, whereas a thread block with 48 KiB + * shared memory can contain 3072 bins if each gradient sum component is a + * 64-bit floating-point value (double) */ struct FeatureGroups { /** Group cuts for features. Size equals to (number of groups + 1). */ - HostDeviceVector feature_segments; + HostDeviceVector feature_segments; /** Group cuts for bins. 
Size equals to (number of groups + 1) */ HostDeviceVector bin_segments; /** Maximum number of bins in a group. Useful to compute the amount of dynamic shared memory when launching a kernel. */ int max_group_bins; - /** Creates feature groups by splitting features into groups. - \param cuts Histogram cuts that given the number of bins per feature. - \param is_dense Whether the data matrix is dense. - \param shm_size Available size of shared memory per thread block (in - bytes) used to compute feature groups. - \param bin_size Size of a single bin of the histogram. */ - FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, - size_t shm_size, size_t bin_size); - - /** Creates a single feature group containing all features and bins. - \notes This is used as a fallback for sparse matrices, and is also useful - for testing. + /** + * @brief Creates feature groups by splitting features into groups. + * + * @param cuts Histogram cuts that given the number of bins per feature. + * @param is_dense Whether the data matrix is dense. + * @param shm_size Available size of shared memory per thread block (in bytes) used to + * compute feature groups. */ - explicit FeatureGroups(const common::HistogramCuts& cuts) { - InitSingle(cuts); - } + FeatureGroups(common::HistogramCuts const& cuts, bool is_dense, size_t shm_size); + + /** + * @brief Creates a single feature group containing all features and bins. + * + * @notes This is used as a fallback for sparse matrices, and is also useful for + * testing. + */ + explicit FeatureGroups(const common::HistogramCuts& cuts) { this->InitSingle(cuts); } [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const { feature_segments.SetDevice(device); diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 46a52a8ea5d7..c9048e68ac77 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -205,8 +205,8 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto batch_iterator = dmat->GetBatches(ctx, batch_param_); auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. - *page = EllpackPageImpl{ctx, first_page->CutsShared(), first_page->is_dense, - first_page->row_stride, sample_rows}; + *page = EllpackPageImpl{ctx, first_page->CutsShared(), first_page->IsDense(), + first_page->info.row_stride, sample_rows}; // Compact the ELLPACK pages into the single sample page. thrust::fill(cuctx->CTP(), page->gidx_buffer.begin(), page->gidx_buffer.end(), 0); @@ -290,8 +290,8 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. - *page = EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), first_page->row_stride, - sample_rows}; + *page = EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), + first_page->info.row_stride, sample_rows}; // Compact the ELLPACK pages into the single sample page. 
thrust::fill(cuctx->CTP(), page->gidx_buffer.begin(), page->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index d50f7284e9ad..102b1113be6b 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -138,19 +138,20 @@ XGBOOST_DEV_INLINE void AtomicAddGpairGlobal(xgboost::GradientPairInt64* dest, *reinterpret_cast(&h)); } -template +template class HistogramAgent { + int constexpr static kItemsPerTile = kBlockThreads * kItemsPerThread; + GradientPairInt64* smem_arr_; GradientPairInt64* d_node_hist_; - using Idx = RowPartitioner::RowIndexT; + using Idx = cuda_impl::RowIndexT; dh::LDGIterator d_ridx_; const GradientPair* d_gpair_; const FeatureGroup group_; const EllpackDeviceAccessor& matrix_; const int feature_stride_; - const std::size_t n_elements_; + const bst_idx_t n_elements_; const GradientQuantiser& rounding_; public: @@ -158,34 +159,33 @@ class HistogramAgent { GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group, const EllpackDeviceAccessor& matrix, common::Span d_ridx, const GradientQuantiser& rounding, const GradientPair* d_gpair) - : smem_arr_(smem_arr), - d_node_hist_(d_node_hist), + : smem_arr_{smem_arr}, + d_node_hist_{d_node_hist}, d_ridx_(d_ridx.data()), - group_(group), + group_{group}, matrix_(matrix), - feature_stride_(kIsDense ? group.num_features : matrix.row_stride), - n_elements_(feature_stride_ * d_ridx.size()), - rounding_(rounding), - d_gpair_(d_gpair) {} + feature_stride_(kCompressed ? group.num_features : matrix.row_stride), + n_elements_{feature_stride_ * d_ridx.size()}, + rounding_{rounding}, + d_gpair_{d_gpair} {} __device__ void ProcessPartialTileShared(std::size_t offset) { - for (std::size_t idx = offset + threadIdx.x; - idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); - idx += kBlockThreads) { + for (std::size_t idx = offset + threadIdx.x, + n = std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); + idx < n; idx += kBlockThreads) { Idx ridx = d_ridx_[idx / feature_stride_]; auto fidx = FeatIdx(group_, idx, feature_stride_); bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)]; - if (kIsDense || compressed_bin != matrix_.NullValue()) { + if (compressed_bin != matrix_.NullValue()) { + // The matrix is compressed with feature-local bins. + if (kCompressed) { + compressed_bin += this->matrix_.feature_segments[fidx]; + } + // Avoid atomic add if it's a null value. auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); // Subtract start_bin to write to group-local histogram. If this is not a dense // matrix, then start_bin is 0 since featuregrouping doesn't support sparse data. 
- if (kIsDense) { - AtomicAddGpairShared( - smem_arr_ + compressed_bin + this->matrix_.feature_segments[fidx] - group_.start_bin, - adjusted); - } else { - AtomicAddGpairShared(smem_arr_ + compressed_bin - group_.start_bin, adjusted); - } + AtomicAddGpairShared(smem_arr_ + compressed_bin - group_.start_bin, adjusted); } } } @@ -210,16 +210,19 @@ class HistogramAgent { for (int i = 0; i < kItemsPerThread; i++) { gpair[i] = d_gpair_[ridx[i]]; auto fidx = FeatIdx(group_, idx[i], feature_stride_); - if (kIsDense) { - gidx[i] = - matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)] + matrix_.feature_segments[fidx]; + gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)]; + if (gidx[i] != matrix_.NullValue()) { + if (kCompressed) { + gidx[i] += matrix_.feature_segments[fidx]; + } } else { - gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)]; + gidx[i] = -1; // missing } } #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - if ((kIsDense || gidx[i] != matrix_.NullValue())) { + // Avoid atomic add if it's a null value. + if (gidx[i] != -1) { auto adjusted = rounding_.ToFixedPoint(gpair[i]); AtomicAddGpairShared(smem_arr_ + gidx[i] - group_.start_bin, adjusted); } @@ -248,14 +251,12 @@ class HistogramAgent { Idx ridx = d_ridx_[idx / feature_stride_]; auto fidx = FeatIdx(group_, idx, feature_stride_); bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)]; - if (kIsDense || compressed_bin != matrix_.NullValue()) { - auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); - if (kIsDense) { - auto start_bin = this->matrix_.feature_segments[fidx]; - AtomicAddGpairGlobal(d_node_hist_ + compressed_bin + start_bin, adjusted); - } else { - AtomicAddGpairGlobal(d_node_hist_ + compressed_bin, adjusted); + if (compressed_bin != matrix_.NullValue()) { + if (kCompressed) { + compressed_bin += this->matrix_.feature_segments[fidx]; } + auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); + AtomicAddGpairGlobal(d_node_hist_ + compressed_bin, adjusted); } } } @@ -382,14 +383,14 @@ class DeviceHistogramBuilderImpl { if (!this->kernel_->shared) { CHECK_EQ(this->kernel_->smem_size, 0); - if (matrix.is_dense) { + if (matrix.IsDenseCompressed()) { launcher(this->kernel_->global_dense_kernel); } else { launcher(this->kernel_->global_kernel); } } else { CHECK_NE(this->kernel_->smem_size, 0); - if (matrix.is_dense) { + if (matrix.IsDenseCompressed()) { launcher(this->kernel_->shared_dense_kernel); } else { launcher(this->kernel_->shared_kernel); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 31b8d34964b2..002f3e9a6ff1 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -104,6 +104,7 @@ struct GPUHistMakerDevice { dh::device_vector positions_; HistMakerTrainParam const* hist_param_; std::shared_ptr const cuts_; + std::unique_ptr feature_groups_; auto CreatePartitionNodes(RegTree const* p_tree, std::vector const& candidates) { std::vector nidx(candidates.size()); @@ -143,19 +144,20 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; - std::unique_ptr feature_groups; common::Monitor monitor; GPUHistMakerDevice(Context const* ctx, TrainParam _param, HistMakerTrainParam const* hist_param, std::shared_ptr column_sampler, BatchParam batch_param, MetaInfo const& info, std::vector batch_ptr, - std::shared_ptr cuts) + std::shared_ptr cuts, bool dense_compressed) : evaluator_{_param, static_cast(info.num_col_), ctx->Device()}, ctx_{ctx}, column_sampler_{std::move(column_sampler)}, batch_ptr_{std::move(batch_ptr)}, 
hist_param_{hist_param}, cuts_{std::move(cuts)}, + feature_groups_{std::make_unique(*cuts_, dense_compressed, + dh::MaxSharedMemoryOptin(ctx_->Ordinal()))}, param{std::move(_param)}, interaction_constraints(param, static_cast(info.num_col_)), sampler{std::make_unique( @@ -172,15 +174,6 @@ struct GPUHistMakerDevice { ~GPUHistMakerDevice() = default; - void InitFeatureGroupsOnce(MetaInfo const& info) { - if (!feature_groups) { - CHECK(cuts_); - feature_groups = std::make_unique(*cuts_, info.IsDense(), - dh::MaxSharedMemoryOptin(ctx_->Ordinal()), - sizeof(GradientPairInt64)); - } - } - // Reset values for each update iteration [[nodiscard]] DMatrix* Reset(HostDeviceVector const* dh_gpair, DMatrix* p_fmat) { this->monitor.Start(__func__); @@ -240,10 +233,8 @@ struct GPUHistMakerDevice { */ this->quantiser = std::make_unique(ctx_, this->gpair, p_fmat->Info()); - this->InitFeatureGroupsOnce(info); - this->histogram_.Reset(ctx_, this->hist_param_->MaxCachedHistNodes(ctx_->Device()), - feature_groups->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(), + feature_groups_->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(), false); this->monitor.Stop(__func__); return p_fmat; @@ -334,7 +325,7 @@ struct GPUHistMakerDevice { auto d_ridx = partitioners_.at(k)->GetRows(nidx); this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc, - feature_groups->DeviceAccessor(ctx_->Device()), this->gpair, + feature_groups_->DeviceAccessor(ctx_->Device()), this->gpair, d_ridx, d_node_hist, *quantiser); monitor.Stop(__func__); } @@ -762,22 +753,27 @@ struct GPUHistMakerDevice { } }; -std::shared_ptr InitBatchCuts(Context const* ctx, DMatrix* p_fmat, - BatchParam batch, - std::vector* p_batch_ptr) { +std::pair, bool> InitBatchCuts( + Context const* ctx, DMatrix* p_fmat, BatchParam const& batch, + std::vector* p_batch_ptr) { std::vector& batch_ptr = *p_batch_ptr; batch_ptr = {0}; std::shared_ptr cuts; + std::int32_t dense_compressed = -1; for (auto const& page : p_fmat->GetBatches(ctx, batch)) { batch_ptr.push_back(page.Size()); cuts = page.Impl()->CutsShared(); CHECK(cuts->cut_values_.DeviceCanRead()); + if (dense_compressed != -1) { + CHECK_EQ(page.Impl()->IsDenseCompressed(), static_cast(dense_compressed)); + } + dense_compressed = page.Impl()->IsDenseCompressed(); } CHECK(cuts); CHECK_EQ(p_fmat->NumBatches(), batch_ptr.size() - 1); std::partial_sum(batch_ptr.cbegin(), batch_ptr.cend(), batch_ptr.begin()); - return cuts; + return {cuts, static_cast(dense_compressed)}; } class GPUHistMaker : public TreeUpdater { @@ -840,10 +836,11 @@ class GPUHistMaker : public TreeUpdater { std::vector batch_ptr; auto batch = HistBatch(*param); - auto cuts = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); + auto [cuts, dense_compressed] = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); - this->maker = std::make_unique( - ctx_, *param, &hist_maker_param_, column_sampler_, batch, p_fmat->Info(), batch_ptr, cuts); + this->maker = std::make_unique(ctx_, *param, &hist_maker_param_, + column_sampler_, batch, p_fmat->Info(), + batch_ptr, cuts, dense_compressed); p_last_fmat_ = p_fmat; initialised_ = true; @@ -947,11 +944,12 @@ class GPUGlobalApproxMaker : public TreeUpdater { std::vector batch_ptr; auto batch = ApproxBatch(*param, hess, *task_); - auto cuts = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); + auto [cuts, dense_compressed] = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); batch.regen = false; // Regen only at the beginning of the iteration. 
- this->maker_ = std::make_unique( - ctx_, *param, &hist_maker_param_, column_sampler_, batch, p_fmat->Info(), batch_ptr, cuts); + this->maker_ = std::make_unique(ctx_, *param, &hist_maker_param_, + column_sampler_, batch, p_fmat->Info(), + batch_ptr, cuts, dense_compressed); std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 91302c1ed563..c8d0f47709ab 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -105,6 +105,7 @@ class LintersPaths: "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", + "tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py", # demo "demo/dask/", "demo/json-model/json_parser.py", diff --git a/tests/cpp/common/test_compressed_iterator.cc b/tests/cpp/common/test_compressed_iterator.cc index 93243c0b336e..41280dde7fa7 100644 --- a/tests/cpp/common/test_compressed_iterator.cc +++ b/tests/cpp/common/test_compressed_iterator.cc @@ -1,9 +1,25 @@ +/** + * Copyright 2017-2024, XGBoost contributors + */ #include "../../../src/common/compressed_iterator.h" #include "gtest/gtest.h" #include -namespace xgboost { -namespace common { +namespace xgboost::common { +TEST(CompressedIterator, Size) { + bst_idx_t n = 2048; + { + bst_idx_t n_symbols = 256; + auto n_bytes = CompressedBufferWriter::CalculateBufferSize(n, n_symbols); + ASSERT_EQ(n_bytes, 2052); + } + { + bst_idx_t n_symbols = 64; + auto n_bytes = CompressedBufferWriter::CalculateBufferSize(n, n_symbols); + ASSERT_EQ(n_bytes, 1540); + } +} + TEST(CompressedIterator, Test) { ASSERT_TRUE(detail::SymbolBits(256) == 8); ASSERT_TRUE(detail::SymbolBits(150) == 8); @@ -50,6 +66,4 @@ TEST(CompressedIterator, Test) { } } } - -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 55375a5a7ffa..0ccb79def2ec 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -7,9 +7,11 @@ #include "../../../src/common/categorical.h" #include "../../../src/common/hist_util.h" +#include "../../../src/data/device_adapter.cuh" // for CupyAdapter #include "../../../src/data/ellpack_page.cuh" #include "../../../src/data/ellpack_page.h" -#include "../../../src/tree/param.h" // TrainParam +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix +#include "../../../src/tree/param.h" // for TrainParam #include "../helpers.h" #include "../histogram_helpers.h" #include "gtest/gtest.h" @@ -24,7 +26,7 @@ TEST(EllpackPage, EmptyDMatrix) { &ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = page.Impl(); - ASSERT_EQ(impl->row_stride, 0); + ASSERT_EQ(impl->info.row_stride, 0); ASSERT_EQ(impl->Cuts().TotalBins(), 0); ASSERT_EQ(impl->gidx_buffer.size(), 4); } @@ -36,7 +38,7 @@ TEST(EllpackPage, BuildGidxDense) { std::vector h_gidx_buffer; auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); - ASSERT_EQ(page->row_stride, n_features); + ASSERT_EQ(page->info.row_stride, n_features); std::vector solution = { 0, 3, 8, 9, 14, 17, 20, 21, @@ -60,6 +62,9 @@ TEST(EllpackPage, BuildGidxDense) { auto fidx = i % n_features; ASSERT_EQ(solution[i], h_accessor.gidx_iter[i] + h_accessor.feature_segments[fidx]); } + ASSERT_EQ(page->NumSymbols(), 3); + ASSERT_EQ(page->NumNonMissing(&ctx, {}), n_samples * 
n_features); + ASSERT_EQ(page->NumSymbols(), h_accessor.NullValue()); } TEST(EllpackPage, BuildGidxSparse) { @@ -68,9 +73,9 @@ TEST(EllpackPage, BuildGidxSparse) { auto page = BuildEllpackPage(&ctx, kNRows, kNCols, 0.9f); std::vector h_gidx_buffer; - auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); + auto h_acc = page->GetHostAccessor(&ctx, &h_gidx_buffer); - ASSERT_LE(page->row_stride, 3); + ASSERT_EQ(page->info.row_stride, 3); // row_stride = 3, 16 rows, 48 entries for ELLPack std::vector solution = { @@ -78,8 +83,8 @@ TEST(EllpackPage, BuildGidxSparse) { 24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24, 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 }; - for (size_t i = 0; i < kNRows * page->row_stride; ++i) { - ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); + for (size_t i = 0; i < kNRows * page->info.row_stride; ++i) { + ASSERT_EQ(solution[i], h_acc.gidx_iter[i]); } } @@ -103,8 +108,9 @@ TEST(EllpackPage, FromCategoricalBasic) { auto n_uniques = std::unique(x_copy.begin(), x_copy.end()) - x_copy.begin(); ASSERT_EQ(n_uniques, kCats); - std::vector h_cuts_ptr(accessor.feature_segments.size()); - dh::CopyDeviceSpanToVector(&h_cuts_ptr, accessor.feature_segments); + std::vector h_cuts_ptr(accessor.NumFeatures() + 1); + dh::safe_cuda(cudaMemcpyAsync(h_cuts_ptr.data(), accessor.feature_segments, + sizeof(bst_feature_t) * h_cuts_ptr.size(), cudaMemcpyDefault)); std::vector h_cuts_values(accessor.gidx_fvalue_map.size()); dh::CopyDeviceSpanToVector(&h_cuts_values, accessor.gidx_fvalue_map); @@ -149,7 +155,7 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kRows); + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->info.row_stride, kRows); // Copy batch pages into the result page. size_t offset = 0; @@ -195,7 +201,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->info.row_stride, kCompactedRows); // Compact batch pages into the result page. @@ -240,7 +246,141 @@ TEST(EllpackPage, Compact) { } namespace { -class EllpackPageTest : public testing::TestWithParam { +// Test for treating sparse ellpack as a dense +class CompressedDense : public ::testing::TestWithParam { + auto InitSparsePage(std::size_t null_column) const { + bst_idx_t n_samples = 16, n_features = 8; + std::vector data(n_samples * n_features); + + std::iota(data.begin(), data.end(), 0.0f); + for (std::size_t i = 0; i < data.size(); i += n_features) { + data[i + null_column] = std::numeric_limits::quiet_NaN(); + } + data[null_column] = null_column; // keep the first sample full. 
+ auto p_fmat = GetDMatrixFromData(data, n_samples, n_features); + return p_fmat; + } + + void CheckBasic(Context const* ctx, BatchParam batch, std::size_t null_column, + EllpackPageImpl const& impl) { + ASSERT_FALSE(impl.IsDense()); + ASSERT_TRUE(impl.IsDenseCompressed()); + ASSERT_EQ(impl.NumSymbols(), batch.max_bin + 1); + + std::vector h_gidx; + auto h_acc = impl.GetHostAccessor(ctx, &h_gidx); + ASSERT_EQ(h_acc.row_stride, h_acc.NumFeatures()); + ASSERT_EQ(h_acc.NullValue(), batch.max_bin); + for (std::size_t i = 0; i < h_acc.row_stride * h_acc.n_rows; ++i) { + auto [m, n] = linalg::UnravelIndex(i, h_acc.n_rows, h_acc.row_stride); + if (n == null_column && m != 0) { + ASSERT_EQ(static_cast(h_acc.gidx_iter[i]), h_acc.NullValue()); + } else { + ASSERT_EQ(static_cast(h_acc.gidx_iter[i]), m); + } + } + } + + public: + void CheckFromSparsePage(std::size_t null_column) { + auto p_fmat = this->InitSparsePage(null_column); + auto ctx = MakeCUDACtx(0); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), + std::numeric_limits::quiet_NaN()}; + + for (auto const& ellpack : p_fmat->GetBatches(&ctx, batch)) { + auto impl = ellpack.Impl(); + this->CheckBasic(&ctx, batch, null_column, *impl); + } + } + + void CheckFromAdapter(std::size_t null_column) { + bst_idx_t n_samples = 16, n_features = 8; + + auto ctx = MakeCUDACtx(0); + HostDeviceVector data(n_samples * n_features, 0.0f, ctx.Device()); + auto& h_data = data.HostVector(); + std::iota(h_data.begin(), h_data.end(), 0.0f); + for (std::size_t i = 0; i < h_data.size(); i += n_features) { + h_data[i + null_column] = std::numeric_limits::quiet_NaN(); + } + h_data[null_column] = null_column; // Keep the first sample full. + auto p_fmat = GetDMatrixFromData(h_data, n_samples, n_features); + + data.ConstDeviceSpan(); // Pull to device + auto arri = GetArrayInterface(&data, n_samples, n_features); + auto sarri = Json::Dump(arri); + data::CupyAdapter adapter{StringView{sarri}}; + + Context cpu_ctx; + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + + std::shared_ptr cuts; + for (auto const& page : p_fmat->GetBatches(&cpu_ctx, batch)) { + cuts = std::make_shared(page.Cuts()); + } + dh::device_vector row_counts(n_samples, n_features - 1); + row_counts[0] = n_features; + auto d_row_counts = dh::ToSpan(row_counts); + ASSERT_EQ(adapter.NumColumns(), n_features); + auto impl = + EllpackPageImpl{&ctx, adapter.Value(), std::numeric_limits::quiet_NaN(), + false, d_row_counts, {}, + n_features, n_samples, cuts}; + this->CheckBasic(&ctx, batch, null_column, impl); + dh::DefaultStream().Sync(); + } + + void CheckFromToGHist(std::size_t null_column) { + Context cpu_ctx; + auto ctx = MakeCUDACtx(0); + std::vector orig; + { + // Test from GHist + auto p_fmat = this->InitSparsePage(null_column); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + for (auto const& page : p_fmat->GetBatches(&cpu_ctx, batch)) { + orig = {page.data.cbegin(), page.data.cend()}; + auto impl = EllpackPageImpl{&ctx, page, {}}; + this->CheckBasic(&ctx, batch, null_column, impl); + } + } + + { + // Test to GHist + auto p_fmat = this->InitSparsePage(null_column); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + for (auto const& page : p_fmat->GetBatches(&ctx, batch)) { + auto gidx = GHistIndexMatrix{&ctx, p_fmat->Info(), page, batch}; + ASSERT_EQ(gidx.Size(), p_fmat->Info().num_row_); + for (std::size_t ridx = 0; ridx < gidx.Size(); ++ridx) { + auto rbegin = gidx.row_ptr[ridx]; + auto rend = gidx.row_ptr[ridx + 1]; + if 
(ridx == 0) { + ASSERT_EQ(rend - rbegin, p_fmat->Info().num_col_); + } else { + ASSERT_EQ(rend - rbegin, p_fmat->Info().num_col_ - 1); + } + } + // GHist can't compress a dataset with missing values + ASSERT_FALSE(gidx.index.Offset()); + ASSERT_TRUE(std::equal(gidx.data.cbegin(), gidx.data.cend(), orig.cbegin())); + } + } + } +}; + +TEST_P(CompressedDense, FromSparsePage) { this->CheckFromSparsePage(this->GetParam()); } + +TEST_P(CompressedDense, FromAdapter) { this->CheckFromAdapter(this->GetParam()); } + +TEST_P(CompressedDense, FromToGHist) { this->CheckFromToGHist(this->GetParam()); } +} // anonymous namespace + +INSTANTIATE_TEST_SUITE_P(EllpackPage, CompressedDense, testing::Values(0ul, 1ul, 7ul)); + +namespace { +class SparseEllpack : public testing::TestWithParam { protected: void TestFromGHistIndex(float sparsity) const { // Only testing with small sample size as the cuts might be different between host and @@ -268,7 +408,7 @@ class EllpackPageTest : public testing::TestWithParam { std::vector h_gidx_from_sparse, h_gidx_from_ghist; auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist); auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse); - for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) { + for (size_t i = 0; i < from_ghist->n_rows * from_ghist->info.row_stride; ++i) { ASSERT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]); } } @@ -289,9 +429,9 @@ class EllpackPageTest : public testing::TestWithParam { }; } // namespace -TEST_P(EllpackPageTest, FromGHistIndex) { this->TestFromGHistIndex(GetParam()); } +TEST_P(SparseEllpack, FromGHistIndex) { this->TestFromGHistIndex(GetParam()); } -TEST_P(EllpackPageTest, NumNonMissing) { this->TestNumNonMissing(this->GetParam()); } +TEST_P(SparseEllpack, NumNonMissing) { this->TestNumNonMissing(this->GetParam()); } -INSTANTIATE_TEST_SUITE_P(EllpackPage, EllpackPageTest, testing::Values(.0f, .2f, .4f, .8f)); +INSTANTIATE_TEST_SUITE_P(EllpackPage, SparseEllpack, testing::Values(.0f, .2f, .4f, .8f)); } // namespace xgboost diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index a26aaedb5e07..87fd6db5fa05 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -56,7 +56,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam { ASSERT_EQ(loaded->Cuts().MinValues(), orig->Cuts().MinValues()); ASSERT_EQ(loaded->Cuts().Values(), orig->Cuts().Values()); ASSERT_EQ(loaded->base_rowid, orig->base_rowid); - ASSERT_EQ(loaded->row_stride, orig->row_stride); + ASSERT_EQ(loaded->info.row_stride, orig->info.row_stride); std::vector h_loaded, h_orig; [[maybe_unused]] auto h_loaded_acc = loaded->GetHostAccessor(&ctx, &h_loaded); [[maybe_unused]] auto h_orig_acc = orig->GetHostAccessor(&ctx, &h_orig); @@ -125,7 +125,8 @@ TEST_P(TestEllpackPageRawFormat, HostIO) { ASSERT_EQ(h_acc_orig.row_stride, h_acc.row_stride); ASSERT_EQ(h_acc_orig.n_rows, h_acc.n_rows); ASSERT_EQ(h_acc_orig.base_rowid, h_acc.base_rowid); - ASSERT_EQ(h_acc_orig.is_dense, h_acc.is_dense); + ASSERT_EQ(h_acc_orig.IsDenseCompressed(), h_acc.IsDenseCompressed()); + ASSERT_EQ(h_acc_orig.NullValue(), h_acc.NullValue()); } } } diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.cc b/tests/cpp/data/test_extmem_quantile_dmatrix.cc index 623637ea4ec2..691bcf3369f8 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.cc +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.cc 
@@ -45,7 +45,8 @@ class ExtMemQuantileDMatrixCpu : public ::testing::TestWithParam { }; Context ctx; - TestExtMemQdmBasic(&ctx, false, sparsity, equal); + TestExtMemQdmBasic( + &ctx, false, sparsity, equal, [](GHistIndexMatrix const& page) { return page.IsDense(); }); } }; } // anonymous namespace diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.cu b/tests/cpp/data/test_extmem_quantile_dmatrix.cu index 3b65dffa19a1..0c0be6be8c40 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.cu +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.cu @@ -29,10 +29,13 @@ class ExtMemQuantileDMatrixGpu : public ::testing::TestWithParam { auto equal = std::equal(h_orig.cbegin(), h_orig.cend(), h_sparse.cbegin()); ASSERT_TRUE(equal); }; + auto no_missing = [](EllpackPage const& page) { + return page.Impl()->IsDense(); + }; auto ctx = MakeCUDACtx(0); - TestExtMemQdmBasic(&ctx, true, sparsity, equal); - TestExtMemQdmBasic(&ctx, false, sparsity, equal); + TestExtMemQdmBasic(&ctx, true, sparsity, equal, no_missing); + TestExtMemQdmBasic(&ctx, false, sparsity, equal, no_missing); } }; diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.h b/tests/cpp/data/test_extmem_quantile_dmatrix.h index 25f2e06542c8..2d4958010b03 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.h +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.h @@ -8,8 +8,9 @@ #include "../helpers.h" // for RandomDataGenerator namespace xgboost::data { -template -void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal&& check_equal) { +template +void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal&& check_equal, + NoMissing&& no_missing) { bst_idx_t n_samples = 256, n_features = 16, n_batches = 4; bst_bin_t max_bin = 64; bst_target_t n_targets = 3; @@ -31,7 +32,7 @@ void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal& ++batch_cnt; base_cnt += n_samples / n_batches; row_cnt += page.Size(); - ASSERT_EQ((sparsity == 0.0f), page.IsDense()); + ASSERT_EQ((sparsity == 0.0f), no_missing(page)); } ASSERT_EQ(n_batches, batch_cnt); ASSERT_EQ(p_fmat->Info().num_row_, n_samples); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 8d2e837ff38c..eebc38fbeee2 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -20,14 +20,15 @@ void TestEquivalent(float sparsity) { std::numeric_limits::quiet_NaN(), 0, 256); std::size_t offset = 0; auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); - std::unique_ptr page_concatenated{new EllpackPageImpl( - &ctx, first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; + std::unique_ptr page_concatenated{new EllpackPageImpl{ + &ctx, first->CutsShared(), first->is_dense, first->info.row_stride, 1000 * 100}}; for (auto& batch : m.GetBatches(&ctx, {})) { auto page = batch.Impl(); size_t num_elements = page_concatenated->Copy(&ctx, page, offset); offset += num_elements; } - auto from_iter = page_concatenated->GetDeviceAccessor(&ctx); + std::vector h_iter_buffer; + auto from_iter = page_concatenated->GetHostAccessor(&ctx, &h_iter_buffer); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); @@ -37,33 +38,20 @@ void TestEquivalent(float sparsity) { DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 0)}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; for (auto& ellpack : dm->GetBatches(&ctx, bp)) { - auto from_data = 
ellpack.Impl()->GetDeviceAccessor(&ctx); + std::vector h_data_buffer; + auto from_data = ellpack.Impl()->GetHostAccessor(&ctx, &h_data_buffer); - std::vector cuts_from_iter(from_iter.gidx_fvalue_map.size()); - std::vector min_fvalues_iter(from_iter.min_fvalue.size()); - std::vector cut_ptrs_iter(from_iter.feature_segments.size()); - dh::CopyDeviceSpanToVector(&cuts_from_iter, from_iter.gidx_fvalue_map); - dh::CopyDeviceSpanToVector(&min_fvalues_iter, from_iter.min_fvalue); - dh::CopyDeviceSpanToVector(&cut_ptrs_iter, from_iter.feature_segments); - - std::vector cuts_from_data(from_data.gidx_fvalue_map.size()); - std::vector min_fvalues_data(from_data.min_fvalue.size()); - std::vector cut_ptrs_data(from_data.feature_segments.size()); - dh::CopyDeviceSpanToVector(&cuts_from_data, from_data.gidx_fvalue_map); - dh::CopyDeviceSpanToVector(&min_fvalues_data, from_data.min_fvalue); - dh::CopyDeviceSpanToVector(&cut_ptrs_data, from_data.feature_segments); - - ASSERT_EQ(cuts_from_iter.size(), cuts_from_data.size()); - for (size_t i = 0; i < cuts_from_iter.size(); ++i) { - EXPECT_NEAR(cuts_from_iter[i], cuts_from_data[i], kRtEps); + ASSERT_EQ(from_iter.gidx_fvalue_map.size(), from_data.gidx_fvalue_map.size()); + for (size_t i = 0; i < from_iter.gidx_fvalue_map.size(); ++i) { + EXPECT_NEAR(from_iter.gidx_fvalue_map[i], from_data.gidx_fvalue_map[i], kRtEps); } - ASSERT_EQ(min_fvalues_iter.size(), min_fvalues_data.size()); - for (size_t i = 0; i < min_fvalues_iter.size(); ++i) { - ASSERT_NEAR(min_fvalues_iter[i], min_fvalues_data[i], kRtEps); + ASSERT_EQ(from_iter.min_fvalue.size(), from_data.min_fvalue.size()); + for (size_t i = 0; i < from_iter.min_fvalue.size(); ++i) { + ASSERT_NEAR(from_iter.min_fvalue[i], from_data.min_fvalue[i], kRtEps); } - ASSERT_EQ(cut_ptrs_iter.size(), cut_ptrs_data.size()); - for (size_t i = 0; i < cut_ptrs_iter.size(); ++i) { - ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]); + ASSERT_EQ(from_iter.NumFeatures(), from_data.NumFeatures()); + for (size_t i = 0; i < from_iter.NumFeatures() + 1; ++i) { + ASSERT_EQ(from_iter.feature_segments[i], from_data.feature_segments[i]); } std::vector buffer_from_iter, buffer_from_data; @@ -122,39 +110,43 @@ TEST(IterativeDeviceDMatrix, RowMajor) { TEST(IterativeDeviceDMatrix, RowMajorMissing) { const float kMissing = std::numeric_limits::quiet_NaN(); - size_t rows = 10; - size_t cols = 2; - CudaArrayIterForTest iter(0.0f, rows, cols, 2); + bst_idx_t rows = 4; + size_t cols = 3; + CudaArrayIterForTest iter{0.0f, rows, cols, 2}; std::string interface_str = iter.AsArray(); - auto j_interface = - Json::Load({interface_str.c_str(), interface_str.size()}); - ArrayInterface<2> loaded {get(j_interface)}; + auto j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); + ArrayInterface<2> loaded{get(j_interface)}; std::vector h_data(cols * rows); common::Span s_data{static_cast(loaded.data), cols * rows}; dh::CopyDeviceSpanToVector(&h_data, s_data); h_data[1] = kMissing; h_data[5] = kMissing; h_data[6] = kMissing; - auto ptr = thrust::device_ptr( - reinterpret_cast(get(j_interface["data"][0]))); + h_data[9] = kMissing; // idx = (2, 0) + h_data[10] = kMissing; // idx = (2, 1) + auto ptr = + thrust::device_ptr(reinterpret_cast(get(j_interface["data"][0]))); thrust::copy(h_data.cbegin(), h_data.cend(), ptr); - - IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next, - std::numeric_limits::quiet_NaN(), 0, 256); + IterativeDMatrix m{ + &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits::quiet_NaN(), 0, 256}; auto ctx = 
MakeCUDACtx(0); auto& ellpack = *m.GetBatches(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = ellpack.Impl(); std::vector h_gidx; - auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); - EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(&ctx).NullValue()); - EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(&ctx).NullValue()); + auto h_acc = impl->GetHostAccessor(&ctx, &h_gidx); // null values get placed after valid values in a row - EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(&ctx).NullValue()); + ASSERT_FALSE(h_acc.IsDenseCompressed()); + ASSERT_EQ(h_acc.row_stride, cols - 1); + ASSERT_EQ(h_acc.gidx_iter[7], impl->GetDeviceAccessor(&ctx).NullValue()); + for (std::size_t i = 0; i < 7; ++i) { + ASSERT_NE(h_acc.gidx_iter[i], impl->GetDeviceAccessor(&ctx).NullValue()); + } + EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); - EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); + EXPECT_EQ(m.Info().num_nonzero_, rows * cols - 5); } TEST(IterativeDeviceDMatrix, IsDense) { diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index ff65b6ae59b3..dd26eca609fc 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -203,8 +203,8 @@ class TestEllpackPageExt : public ::testing::TestWithParamGetBatches(&ctx, param).begin()).Impl(); ASSERT_EQ(impl->base_rowid, 0); ASSERT_EQ(impl->n_rows, kRows); - ASSERT_EQ(impl->is_dense, is_dense); - ASSERT_EQ(impl->row_stride, 2); + ASSERT_EQ(impl->IsDense(), is_dense); + ASSERT_EQ(impl->info.row_stride, 2); ASSERT_EQ(impl->Cuts().TotalBins(), 4); std::unique_ptr impl_ext; @@ -213,15 +213,15 @@ class TestEllpackPageExt : public ::testing::TestWithParam(&ctx, batch.Impl()->CutsShared(), batch.Impl()->is_dense, - batch.Impl()->row_stride, kRows); + batch.Impl()->info.row_stride, kRows); } auto n_elems = impl_ext->Copy(&ctx, batch.Impl(), offset); offset += n_elems; } ASSERT_EQ(impl_ext->base_rowid, 0); ASSERT_EQ(impl_ext->n_rows, kRows); - ASSERT_EQ(impl_ext->is_dense, is_dense); - ASSERT_EQ(impl_ext->row_stride, 2); + ASSERT_EQ(impl_ext->IsDense(), is_dense); + ASSERT_EQ(impl_ext->info.row_stride, 2); ASSERT_EQ(impl_ext->Cuts().TotalBins(), 4); std::vector buffer; diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu index a2322d23b1f4..5d8a67c22b05 100644 --- a/tests/cpp/test_context.cu +++ b/tests/cpp/test_context.cu @@ -28,7 +28,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) { } } // namespace -TEST(Context, DeviceOrdinal) { +TEST(Context, MGPUDeviceOrdinal) { Context ctx; auto n_vis = curt::AllVisibleGPUs(); auto ord = n_vis - 1; @@ -77,7 +77,7 @@ TEST(Context, DeviceOrdinal) { TestCUDA(ctx, 0); } -TEST(Context, GPUId) { +TEST(Context, MGPUId) { Context ctx; ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); TestCUDA(ctx, 0); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index e26c8b980649..84be979d85ad 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -61,8 +61,7 @@ TEST(Histogram, SubtractionTrack) { auto page = BuildEllpackPage(&ctx, 64, 4); auto cuts = page->CutsShared(); - FeatureGroups fg{*cuts, true, std::numeric_limits::max(), - sizeof(GradientPairPrecise)}; + FeatureGroups fg{*cuts, true, std::numeric_limits::max()}; auto fg_acc = fg.DeviceAccessor(ctx.Device()); auto n_total_bins = cuts->TotalBins(); @@ -102,14 +101,7 @@ std::vector 
GetHostHistGpair() { void TestBuildHist(bool use_shared_memory_histograms) { int const kNRows = 16, kNCols = 8; - Context ctx{MakeCUDACtx(0)}; - - TrainParam param; - Args args{ - {"max_depth", "6"}, - {"max_leaves", "0"}, - }; - param.Init(args); + auto ctx = MakeCUDACtx(0); auto page = BuildEllpackPage(&ctx, kNRows, kNCols); BatchParam batch_param{}; @@ -129,7 +121,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { auto quantiser = std::make_unique(&ctx, gpair.ConstDeviceSpan(), MetaInfo()); auto shm_size = use_shared_memory_histograms ? dh::MaxSharedMemoryOptin(ctx.Ordinal()) : 0; - FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); + FeatureGroups feature_groups(page->Cuts(), page->IsDenseCompressed(), shm_size); DeviceHistogramBuilder builder; builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), @@ -161,7 +153,8 @@ TEST(Histogram, BuildHistSharedMem) { TestBuildHist(true); } -void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) { +namespace { +void TestDeterministicHistogram(bool is_dense, std::size_t shm_size, bool force_global) { Context ctx = MakeCUDACtx(0); size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16; float constexpr kLower = -1e-2, kUpper = 1e2; @@ -183,7 +176,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); gpair.SetDevice(ctx.Device()); - FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); + FeatureGroups feature_groups{page->Cuts(), page->IsDenseCompressed(), shm_size}; auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; @@ -211,8 +204,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) std::vector new_histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(new_histogram_h.data(), d_new_histogram.data(), - num_bins * sizeof(GradientPairInt64), - cudaMemcpyDeviceToHost)); + num_bins * sizeof(GradientPairInt64), cudaMemcpyDeviceToHost)); for (size_t j = 0; j < new_histogram_h.size(); ++j) { ASSERT_EQ(new_histogram_h[j].GetQuantisedGrad(), histogram_h[j].GetQuantisedGrad()); ASSERT_EQ(new_histogram_h[j].GetQuantisedHess(), histogram_h[j].GetQuantisedHess()); @@ -236,28 +228,30 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) std::vector baseline_h(num_bins); dh::safe_cuda(cudaMemcpy(baseline_h.data(), baseline.data().get(), - num_bins * sizeof(GradientPairInt64), - cudaMemcpyDeviceToHost)); + num_bins * sizeof(GradientPairInt64), cudaMemcpyDeviceToHost)); for (size_t i = 0; i < baseline.size(); ++i) { - EXPECT_NEAR(baseline_h[i].GetQuantisedGrad(), histogram_h[i].GetQuantisedGrad(), + ASSERT_NEAR(baseline_h[i].GetQuantisedGrad(), histogram_h[i].GetQuantisedGrad(), baseline_h[i].GetQuantisedGrad() * 1e-3); } } } } - -TEST(Histogram, GPUDeterministic) { - std::vector is_dense_array{false, true}; - std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; - for (bool is_dense : is_dense_array) { - for (int shm_size : shm_sizes) { - for (bool force_global : {true, false}) { - TestDeterministicHistogram(is_dense, shm_size, force_global); - } - } +class TestGPUDeterministic : public ::testing::TestWithParam> { + protected: + void Run() { + auto [is_dense, shm_size, force_global] = this->GetParam(); + TestDeterministicHistogram(is_dense, shm_size, force_global); } -} +}; +} // anonymous namespace + 
+TEST_P(TestGPUDeterministic, Histogram) { this->Run(); } + +INSTANTIATE_TEST_SUITE_P(Histogram, TestGPUDeterministic, + ::testing::Combine(::testing::Bool(), + ::testing::Values(48 * 1024, 64 * 1024, 160 * 1024), + ::testing::Bool())); void ValidateCategoricalHistogram(size_t n_categories, common::Span onehot, common::Span cat) { @@ -513,13 +507,7 @@ TEST_P(HistogramExternalMemoryTest, ExternalMemory) { std::apply(&HistogramExternalMemoryTest::Run, std::tuple_cat(std::make_tuple(this), GetParam())); } -INSTANTIATE_TEST_SUITE_P(Histogram, HistogramExternalMemoryTest, ::testing::ValuesIn([]() { - std::vector> params; - for (auto global : {true, false}) { - for (auto sparsity : {0.0f, 0.2f, 0.8f}) { - params.emplace_back(sparsity, global); - } - } - return params; - }())); +INSTANTIATE_TEST_SUITE_P(Histogram, HistogramExternalMemoryTest, + ::testing::Combine(::testing::Values(0.0f, 0.2f, 0.8f), + ::testing::Bool())); } // namespace xgboost::tree diff --git a/tests/cpp/tree/test_gpu_approx.cu b/tests/cpp/tree/test_gpu_approx.cu new file mode 100644 index 000000000000..7df60b8cbcd2 --- /dev/null +++ b/tests/cpp/tree/test_gpu_approx.cu @@ -0,0 +1,65 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#include +#include // for Json +#include // for ObjInfo +#include // for RegTree +#include // for TreeUpdater + +#include "../../../src/tree/param.h" // for TrainParam +#include "../collective/test_worker.h" // for BaseMGPUTest +#include "../helpers.h" // for GenerateRandomGradients + +namespace xgboost::tree { +namespace { +RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { + ObjInfo task{ObjInfo::kRegression}; + std::unique_ptr approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)}; + approx_maker->Configure(Args{}); + + TrainParam param; + param.UpdateAllowUnknown(Args{}); + + linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); + gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); + + std::vector> position(1); + RegTree tree; + approx_maker->Update(¶m, &gpair, dmat, common::Span>{position}, + {&tree}); + return tree; +} + +void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) { + auto ctx = MakeCUDACtx(DistGpuIdx()); + + auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + std::unique_ptr sliced{Xy->SliceCol(world_size, rank)}; + + RegTree tree = GetApproxTree(&ctx, sliced.get()); + + Json json{Object{}}; + tree.SaveModel(&json); + Json expected_json{Object{}}; + expected_tree.SaveModel(&expected_json); + ASSERT_EQ(json, expected_json); +} +} // anonymous namespace + +class MGPUApproxTest : public collective::BaseMGPUTest {}; + +TEST_F(MGPUApproxTest, GPUApproxColumnSplit) { + auto constexpr kRows = 32; + auto constexpr kCols = 16; + + Context ctx(MakeCUDACtx(0)); + auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true); + RegTree expected_tree = GetApproxTree(&ctx, dmat.get()); + + this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, true); + this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, false); +} +} // namespace xgboost::tree diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py new file mode 100644 index 000000000000..4e0f0bcc2f89 --- /dev/null +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py @@ -0,0 
+1,31 @@ +"""Copyright 2024, XGBoost contributors""" + +import pytest +from dask_cuda import LocalCUDACluster +from distributed import Client + +import xgboost as xgb +from xgboost.testing.dask import check_external_memory + + +@pytest.mark.parametrize("is_qdm", [True, False]) +def test_external_memory(is_qdm: bool) -> None: + n_workers = 2 + with LocalCUDACluster(n_workers=2) as cluster: + with Client(cluster) as client: + args = client.sync( + xgb.dask._get_rabit_args, + 2, + None, + client, + ) + + futs = client.map( + check_external_memory, + range(n_workers), + n_workers=n_workers, + device="cuda", + comm_args=args, + is_qdm=is_qdm, + ) + client.gather(futs)
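
Illustrative note (not part of the patch): the `FeatureGroups` constructor changed above splits features into groups whose histogram bins fit into one thread block's shared memory, with the budget computed as `shm_size / sizeof(GradientPairInt64)`; that division also matches the 48 KiB to 3072 bins figure in the doc comment when each bin takes 16 bytes. The standalone C++ sketch below, using the hypothetical names `Groups` and `SplitIntoGroups`, shows that partitioning under the `HistogramCuts::Ptrs()` cut-pointer layout. It is a simplification for illustration, not the exact XGBoost implementation.

// Illustrative sketch only, not part of the patch. It mimics splitting features
// into groups whose histogram bins fit into a shared-memory budget of
// shm_bytes / 16 (GradientPairInt64 assumed to be two 64-bit integers).
#include <algorithm>  // for max
#include <cstddef>    // for size_t
#include <cstdint>    // for uint32_t, int64_t
#include <iostream>
#include <vector>

struct Groups {
  std::vector<std::size_t> feature_segments;  // group boundaries in feature index space
  std::vector<std::uint32_t> bin_segments;    // group boundaries in global bin space
  std::uint32_t max_group_bins{0};            // largest group, sizes the shared-memory histogram
};

// cut_ptrs follows the HistogramCuts::Ptrs() layout: cut_ptrs[i] is the first
// global bin of feature i, cut_ptrs.back() is the total number of bins.
Groups SplitIntoGroups(std::vector<std::uint32_t> const& cut_ptrs, std::size_t shm_bytes) {
  constexpr std::size_t kBytesPerBin = 2 * sizeof(std::int64_t);  // assumption: 16 bytes per bin
  std::size_t const max_shmem_bins = shm_bytes / kBytesPerBin;

  Groups g;
  g.feature_segments.push_back(0);
  g.bin_segments.push_back(0);
  for (std::size_t fidx = 1; fidx + 1 < cut_ptrs.size(); ++fidx) {
    // Close the current group when including the next feature would exceed the budget.
    if (cut_ptrs[fidx + 1] - g.bin_segments.back() > max_shmem_bins) {
      g.feature_segments.push_back(fidx);
      g.bin_segments.push_back(cut_ptrs[fidx]);
    }
  }
  g.feature_segments.push_back(cut_ptrs.size() - 1);
  g.bin_segments.push_back(cut_ptrs.back());
  for (std::size_t i = 1; i < g.bin_segments.size(); ++i) {
    g.max_group_bins = std::max(g.max_group_bins, g.bin_segments[i] - g.bin_segments[i - 1]);
  }
  return g;
}

int main() {
  // Four features with 256 bins each; pretend only 512 bins fit into shared memory.
  std::vector<std::uint32_t> cut_ptrs{0, 256, 512, 768, 1024};
  auto g = SplitIntoGroups(cut_ptrs, /*shm_bytes=*/512 * 16);
  for (std::size_t i = 0; i + 1 < g.feature_segments.size(); ++i) {
    std::cout << "group " << i << ": features [" << g.feature_segments[i] << ", "
              << g.feature_segments[i + 1] << "), bins [" << g.bin_segments[i] << ", "
              << g.bin_segments[i + 1] << ")\n";
  }
  std::cout << "max_group_bins = " << g.max_group_bins << "\n";
  return 0;
}

With four 256-bin features and a 512-bin budget, the example prints two groups of two features each and `max_group_bins = 512`.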
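
A second sketch along the same lines (again not part of the patch; names hypothetical): with the dense-compressed layout, each ELLPACK entry holds a feature-local bin and a single shared null marker flags missing values, so the histogram kernels recover the global bin by adding `feature_segments[fidx]`, as the `kCompressed` branch of `HistogramAgent` does.

// Illustrative sketch only. Recovering a global bin index from a dense-compressed
// ELLPACK entry that stores feature-local bins plus a shared null marker.
#include <cstddef>   // for size_t
#include <cstdint>   // for int32_t
#include <optional>  // for optional
#include <vector>    // for vector

using bst_bin_t = std::int32_t;

std::optional<bst_bin_t> GlobalBin(bst_bin_t local_bin, std::size_t fidx,
                                   std::vector<bst_bin_t> const& feature_segments,
                                   bst_bin_t null_value) {
  if (local_bin == null_value) {
    return std::nullopt;  // missing entry: the kernel skips the atomic add
  }
  return local_bin + feature_segments[fidx];  // feature-local bin -> global bin
}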