diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 04b489d8b061..69176994bc0b 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -178,7 +178,7 @@ class MetaInfo { * in vertical federated learning, since each worker loads its own list of columns, * we need to sum them. */ - void SynchronizeNumberOfColumns(); + void SynchronizeNumberOfColumns(Context const* ctx); /*! \brief Whether the data is split row-wise. */ bool IsRowSplit() const { diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 901c9ae914de..8806818fbc22 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -582,20 +582,20 @@ auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) { // NOL return TensorView{data, in_shape, ctx->Device()}; } -template -LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span data, S &&...shape) { +template +LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span data, S &&...shape) { std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); return TensorView{data, in_shape, device}; } -template -auto MakeTensorView(Context const *ctx, common::Span data, S &&...shape) { +template +auto MakeTensorView(Context const *ctx, common::Span data, S &&...shape) { return MakeTensorView(ctx->Device(), data, std::forward(shape)...); } -template -auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&...shape) { +template +auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&...shape) { std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); return TensorView{data, in_shape, ctx->Device(), order}; diff --git a/plugin/federated/federated_coll.cc b/plugin/federated/federated_coll.cc index 7c25eeba5ad5..980992d61100 100644 --- a/plugin/federated/federated_coll.cc +++ b/plugin/federated/federated_coll.cc @@ -29,7 +29,7 @@ namespace { auto stub = fed->Handle(); BroadcastRequest request; - request.set_sequence_number(*sequence_number++); + request.set_sequence_number((*sequence_number)++); request.set_rank(comm.Rank()); if (comm.Rank() != root) { request.set_send_buffer(nullptr, 0); @@ -90,9 +90,9 @@ Coll *FederatedColl::MakeCUDAVar() { [[nodiscard]] Result FederatedColl::Broadcast(Comm const &comm, common::Span data, std::int32_t root) { if (comm.Rank() == root) { - return BroadcastImpl(comm, &sequence_number_, data, root); + return BroadcastImpl(comm, &this->sequence_number_, data, root); } else { - return BroadcastImpl(comm, &sequence_number_, data, root); + return BroadcastImpl(comm, &this->sequence_number_, data, root); } } diff --git a/src/collective/allreduce.cc b/src/collective/allreduce.cc index 6948f6758016..65c06686860b 100644 --- a/src/collective/allreduce.cc +++ b/src/collective/allreduce.cc @@ -62,6 +62,9 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span data, Result RingAllreduce(Comm const& comm, common::Span data, Func const& op, ArrayInterfaceHandler::Type type) { + if (comm.World() == 1) { + return Success(); + } return DispatchDType(type, [&](auto t) { using T = decltype(t); // Divide the data into segments according to the number of workers. 
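Side note on the `RingAllreduce` hunk above: the added `if (comm.World() == 1) return Success();` guard short-circuits the collective when only one worker participates, since the local buffer is already the global result. The snippet below is an illustrative, standalone sketch of that fast-path pattern only; `Comm`, `Result`, and `RingAllreduceSketch` are simplified stand-ins and not the actual XGBoost collective API.

```cpp
#include <functional>
#include <vector>

struct Comm {
  int world{1};                        // number of participating workers
  int World() const { return world; }
};

struct Result {
  bool ok{true};
};
inline Result Success() { return {}; }

// Reduce `data` across workers with `op`; with a single worker there is nothing
// to exchange, so we return immediately and skip the ring entirely.
Result RingAllreduceSketch(Comm const& comm, std::vector<double>* data,
                           std::function<double(double, double)> const& op) {
  if (comm.World() == 1) {
    return Success();                  // single-worker fast path, as in the hunk above
  }
  // ... scatter-reduce followed by allgather over the ring would go here ...
  (void)op;
  (void)data;
  return Success();
}

int main() {
  Comm comm;                           // defaults to a world size of 1
  std::vector<double> grad{1.0, 2.0, 3.0};
  auto res = RingAllreduceSketch(comm, &grad, std::plus<double>{});
  return res.ok ? 0 : 1;
}
```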
diff --git a/src/collective/comm.cu b/src/collective/comm.cu index 09faf31cd641..09edc522df16 100644 --- a/src/collective/comm.cu +++ b/src/collective/comm.cu @@ -10,6 +10,7 @@ #include // for stringstream #include // for vector +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/device_helpers.cuh" // for DefaultStream #include "../common/type.h" // for EraseType #include "broadcast.h" // for Broadcast @@ -60,7 +61,7 @@ Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const { NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl) : Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(), root.TaskID()}, - stream_{dh::DefaultStream()} { + stream_{ctx->CUDACtx()->Stream()} { this->world_ = root.World(); this->rank_ = root.Rank(); this->domain_ = root.Domain(); diff --git a/src/collective/comm_group.cc b/src/collective/comm_group.cc index 57050084324d..3d2e24492b04 100644 --- a/src/collective/comm_group.cc +++ b/src/collective/comm_group.cc @@ -105,7 +105,7 @@ CommGroup::CommGroup() } std::unique_ptr& GlobalCommGroup() { - static std::unique_ptr sptr; + static thread_local std::unique_ptr sptr; if (!sptr) { Json config{Null{}}; sptr.reset(CommGroup::Create(config)); diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 89b3ad2e6a17..74336ac61a82 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -480,7 +480,8 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { // Configure allocator with maximum cached bin size of ~1GB and no limit on // maximum cached bytes - thread_local cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29); + thread_local std::unique_ptr allocator{ + std::make_unique(2, 9, 29)}; return *allocator; } pointer allocate(size_t n) { // NOLINT diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 65ab18630e1f..f101247920a4 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -51,7 +51,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins for (auto const &page : m->GetBatches()) { container.PushRowPage(page, info, hessian); } - container.MakeCuts(m->Info(), &out); + container.MakeCuts(ctx, m->Info(), &out); } else { SortedSketchContainer container{ctx, max_bins, @@ -61,7 +61,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins for (auto const &page : m->GetBatches(ctx)) { container.PushColPage(page, info, hessian); } - container.MakeCuts(m->Info(), &out); + container.MakeCuts(ctx, m->Info(), &out); } return out; diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 1f06c2a6fdf4..fbe6356bf501 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b } } - sketch_container.MakeCuts(&cuts, p_fmat->Info().IsColumnSplit()); + sketch_container.MakeCuts(ctx, &cuts, p_fmat->Info().IsColumnSplit()); return cuts; } } // namespace xgboost::common diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 5250abd0f69b..0490add264ac 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -11,9 +11,7 @@ #include "categorical.h" #include "hist_util.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { template SketchContainerImpl::SketchContainerImpl(Context const *ctx, std::vector 
columns_size, @@ -129,7 +127,7 @@ struct QuantileAllreduce { * \param rank rank of target worker * \param fidx feature idx */ - auto Values(int32_t rank, bst_feature_t fidx) const { + [[nodiscard]] auto Values(int32_t rank, bst_feature_t fidx) const { // get span for worker auto wsize = worker_indptr[rank + 1] - worker_indptr[rank]; auto worker_values = global_values.subspan(worker_indptr[rank], wsize); @@ -145,7 +143,7 @@ struct QuantileAllreduce { template void SketchContainerImpl::GatherSketchInfo( - MetaInfo const& info, + Context const *, MetaInfo const &info, std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches) { @@ -206,7 +204,7 @@ void SketchContainerImpl::GatherSketchInfo( } template -void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { +void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo const& info) { auto world_size = collective::GetWorldSize(); auto rank = collective::GetRank(); if (world_size == 1 || info.IsColumnSplit()) { @@ -274,16 +272,15 @@ void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { template void SketchContainerImpl::AllReduce( - MetaInfo const& info, - std::vector *p_reduced, - std::vector* p_num_cuts) { + Context const *ctx, MetaInfo const &info, + std::vector *p_reduced, std::vector *p_num_cuts) { monitor_.Start(__func__); size_t n_columns = sketches_.size(); collective::Allreduce(&n_columns, 1); CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers"; - AllreduceCategories(info); + AllreduceCategories(ctx, info); auto& num_cuts = *p_num_cuts; CHECK_EQ(num_cuts.size(), 0); @@ -324,7 +321,7 @@ void SketchContainerImpl::AllReduce( std::vector sketches_scan((n_columns + 1) * world, 0); std::vector global_sketches; - this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches); + this->GatherSketchInfo(ctx, info, reduced, &worker_segments, &sketches_scan, &global_sketches); std::vector final_sketches(n_columns); @@ -383,11 +380,12 @@ auto AddCategories(std::set const &categories, HistogramCuts *cuts) { } template -void SketchContainerImpl::MakeCuts(MetaInfo const &info, HistogramCuts *p_cuts) { +void SketchContainerImpl::MakeCuts(Context const *ctx, MetaInfo const &info, + HistogramCuts *p_cuts) { monitor_.Start(__func__); std::vector reduced; std::vector num_cuts; - this->AllReduce(info, &reduced, &num_cuts); + this->AllReduce(ctx, info, &reduced, &num_cuts); p_cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f); std::vector final_summaries(reduced.size()); @@ -496,5 +494,4 @@ void SortedSketchContainer::PushColPage(SparsePage const &page, MetaInfo const & }); monitor_.Stop(__func__); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 2bf6070d5e89..4b110f5e0164 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -22,9 +22,7 @@ #include "transform_iterator.h" // MakeIndexTransformIter #include "xgboost/span.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { using WQSketch = HostSketchContainer::WQSketch; using SketchEntry = WQSketch::Entry; @@ -501,7 +499,7 @@ void SketchContainer::FixError() { }); } -void SketchContainer::AllReduce(bool is_column_split) { +void SketchContainer::AllReduce(Context const*, bool is_column_split) { dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto world = collective::GetWorldSize(); if (world == 1 || 
is_column_split) { @@ -582,13 +580,13 @@ struct InvalidCatOp { }; } // anonymous namespace -void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { +void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); dh::safe_cuda(cudaSetDevice(device_.ordinal)); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. - this->AllReduce(is_column_split); + this->AllReduce(ctx, is_column_split); // Prune to final number of bins. this->Prune(num_bins_ + 1); @@ -731,5 +729,4 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { p_cuts->SetCategorical(this->has_categorical_, max_cat); timer_.Stop(__func__); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index b478347824c8..f7124f079b6d 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -151,9 +151,9 @@ class SketchContainer { Span that); /* \brief Merge quantiles from other GPU workers. */ - void AllReduce(bool is_column_split); + void AllReduce(Context const* ctx, bool is_column_split); /* \brief Create the final histogram cut values. */ - void MakeCuts(HistogramCuts* cuts, bool is_column_split); + void MakeCuts(Context const* ctx, HistogramCuts* cuts, bool is_column_split); Span Data() const { return {this->Current().data().get(), this->Current().size()}; diff --git a/src/common/quantile.h b/src/common/quantile.h index 47db5f875af5..0af93a03e021 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -827,13 +827,14 @@ class SketchContainerImpl { return group_ind; } // Gather sketches from all workers. - void GatherSketchInfo(MetaInfo const& info, + void GatherSketchInfo(Context const *ctx, MetaInfo const &info, std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches); // Merge sketches from all workers. - void AllReduce(MetaInfo const& info, std::vector *p_reduced, + void AllReduce(Context const *ctx, MetaInfo const &info, + std::vector *p_reduced, std::vector *p_num_cuts); template @@ -887,11 +888,11 @@ class SketchContainerImpl { /* \brief Push a CSR matrix. */ void PushRowPage(SparsePage const &page, MetaInfo const &info, Span hessian = {}); - void MakeCuts(MetaInfo const& info, HistogramCuts* cuts); + void MakeCuts(Context const *ctx, MetaInfo const &info, HistogramCuts *cuts); private: // Merge all categories from other workers. 
- void AllreduceCategories(MetaInfo const& info); + void AllreduceCategories(Context const* ctx, MetaInfo const& info); }; class HostSketchContainer : public SketchContainerImpl> { diff --git a/src/data/data.cc b/src/data/data.cc index 7e70fff3fff3..50f64a4066a0 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -745,7 +745,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col } } -void MetaInfo::SynchronizeNumberOfColumns() { +void MetaInfo::SynchronizeNumberOfColumns(Context const*) { if (IsColumnSplit()) { collective::Allreduce(&num_col_, 1); } else { diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 45f6286fb606..e5aa98278c8e 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -95,7 +95,7 @@ void GetCutsFromRef(Context const* ctx, std::shared_ptr ref, bst_featur namespace { // Synchronize feature type in case of empty DMatrix -void SyncFeatureType(std::vector* p_h_ft) { +void SyncFeatureType(Context const*, std::vector* p_h_ft) { if (!collective::IsDistributed()) { return; } @@ -193,7 +193,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, // From here on Info() has the correct data shape Info().num_row_ = accumulated_rows; Info().num_nonzero_ = nnz; - Info().SynchronizeNumberOfColumns(); + Info().SynchronizeNumberOfColumns(ctx); CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) { return f > accumulated_rows; })) << "Something went wrong during iteration."; @@ -213,9 +213,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, while (iter.Next()) { if (!p_sketch) { h_ft = proxy->Info().feature_types.ConstHostVector(); - SyncFeatureType(&h_ft); - p_sketch.reset(new common::HostSketchContainer{ctx, p.max_bin, h_ft, column_sizes, - !proxy->Info().group_ptr_.empty()}); + SyncFeatureType(ctx, &h_ft); + p_sketch = std::make_unique(ctx, p.max_bin, h_ft, column_sizes, + !proxy->Info().group_ptr_.empty()); } HostAdapterDispatch(proxy, [&](auto const& batch) { proxy->Info().num_nonzero_ = batch_nnz[i]; @@ -230,7 +230,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, CHECK_EQ(accumulated_rows, Info().num_row_); CHECK(p_sketch); - p_sketch->MakeCuts(Info(), &cuts); + p_sketch->MakeCuts(ctx, Info(), &cuts); } if (!h_ft.empty()) { CHECK_EQ(h_ft.size(), n_features); diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 2fffd516bfb6..09a3976d785c 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -105,7 +105,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, sketch_containers.clear(); sketch_containers.shrink_to_fit(); - final_sketch.MakeCuts(&cuts, this->info_.IsColumnSplit()); + final_sketch.MakeCuts(ctx, &cuts, this->info_.IsColumnSplit()); } else { GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts); } @@ -167,7 +167,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, iter.Reset(); // Synchronise worker columns - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(ctx); } BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 3814d74d2cec..2bf81892f013 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -283,7 +283,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread, // Synchronise worker columns info_.data_split_mode = 
data_split_mode; ReindexFeatures(&ctx); - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); if (adapter->NumRows() == kAdapterUnknownSize) { using IteratorAdapterT = diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index e41d5939463f..e5b4d18f77db 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -42,7 +42,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr info_.num_row_ = adapter->NumRows(); // Synchronise worker columns info_.data_split_mode = data_split_mode; - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); this->fmat_ctx_ = ctx; } diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index 042a75c56a85..f1754c1b5a5d 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -97,7 +97,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p this->info_.num_col_ = n_features; this->info_.num_nonzero_ = nnz; - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); CHECK_NE(info_.num_col_, 0); fmat_ctx_ = ctx; diff --git a/src/learner.cc b/src/learner.cc index 08c59ba601c0..6b0fd7e4b8d7 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -209,7 +209,7 @@ struct LearnerModelParamLegacy : public dmlc::Parameter return dmlc::Parameter::UpdateAllowUnknown(kwargs); } // sanity check - void Validate() { + void Validate(Context const*) { if (!collective::IsDistributed()) { return; } @@ -434,7 +434,7 @@ class LearnerConfiguration : public Learner { } // Update the shared model parameter this->ConfigureModelParamWithoutBaseScore(); - mparam_.Validate(); + mparam_.Validate(&ctx_); } CHECK(!std::isnan(mparam_.base_score)); CHECK(!std::isinf(mparam_.base_score)); diff --git a/src/metric/auc.cu b/src/metric/auc.cu index a4838d783d69..8b8349e1bd25 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -199,9 +199,9 @@ void Transpose(common::Span in, common::Span out, size_t m, }); } -double ScaleClasses(common::Span results, common::Span local_area, - common::Span tp, common::Span auc, size_t n_classes) { - dh::XGBDeviceAllocator alloc; +double ScaleClasses(Context const *ctx, common::Span results, + common::Span local_area, common::Span tp, + common::Span auc, size_t n_classes) { if (collective::IsDistributed()) { int32_t device = dh::CurrentDevice(); CHECK_EQ(dh::CudaGetPointerDevice(results.data()), device); @@ -218,8 +218,8 @@ double ScaleClasses(common::Span results, common::Span local_are double tp_sum; double auc_sum; thrust::tie(auc_sum, tp_sum) = - thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes, - Pair{0.0, 0.0}, PairPlus{}); + thrust::reduce(ctx->CUDACtx()->CTP(), reduce_in, reduce_in + n_classes, Pair{0.0, 0.0}, + PairPlus{}); if (tp_sum != 0 && !std::isnan(auc_sum)) { auc_sum /= tp_sum; } else { @@ -309,10 +309,10 @@ void SegmentedReduceAUC(common::Span d_unique_idx, * up each class in all kernels. */ template -double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, +double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info, common::Span d_class_ptr, size_t n_classes, std::shared_ptr cache, Fn area_fn) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); /** * Sorted idx */ @@ -320,7 +320,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, // Index is sorted within class. 
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - auto labels = info.labels.View(device); + auto labels = info.labels.View(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); size_t n_samples = labels.Shape(0); @@ -328,12 +328,11 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, if (n_samples == 0) { dh::TemporaryArray resutls(n_classes * 4, 0.0f); auto d_results = dh::ToSpan(resutls); - dh::LaunchN(n_classes * 4, - [=] XGBOOST_DEVICE(size_t i) { d_results[i] = 0.0f; }); + dh::LaunchN(n_classes * 4, [=] XGBOOST_DEVICE(size_t i) { d_results[i] = 0.0f; }); auto local_area = d_results.subspan(0, n_classes); auto tp = d_results.subspan(2 * n_classes, n_classes); auto auc = d_results.subspan(3 * n_classes, n_classes); - return ScaleClasses(d_results, local_area, tp, auc, n_classes); + return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes); } /** @@ -437,7 +436,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, tp[c] = 1.0f; } }); - return ScaleClasses(d_results, local_area, tp, auc, n_classes); + return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes); } void MultiClassSortedIdx(Context const *ctx, common::Span predts, @@ -472,8 +471,7 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span predts, size_t /*class_id*/) { return TrapezoidArea(fp_prev, fp, tp_prev, tp); }; - return GPUMultiClassAUCOVR(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache, - fn); + return GPUMultiClassAUCOVR(ctx, info, dh::ToSpan(class_ptr), n_classes, cache, fn); } namespace { @@ -697,7 +695,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, d_totals[class_id].first); }; - return GPUMultiClassAUCOVR(info, ctx->Device(), d_class_ptr, n_classes, cache, fn); + return GPUMultiClassAUCOVR(ctx, info, d_class_ptr, n_classes, cache, fn); } template diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index feabedfab795..f245f3e06306 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -215,7 +215,7 @@ struct EvalError { has_param_ = false; } } - const char *Name() const { + [[nodiscard]] const char *Name() const { static thread_local std::string name; if (has_param_) { std::ostringstream os; @@ -228,7 +228,7 @@ struct EvalError { } } - XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const { // assume label is in [0,1] return pred > threshold_ ? 
1.0f - label : label; } @@ -370,7 +370,7 @@ struct EvalEWiseBase : public MetricNoCache { return Policy::GetFinal(dat[0], dat[1]); } - const char* Name() const override { return policy_.Name(); } + [[nodiscard]] const char* Name() const override { return policy_.Name(); } private: Policy policy_; diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index 41495164ce7e..6762aec32d46 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -162,7 +162,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig { return collective::GlobalRatio(info, sum_metric, static_cast(ngroups)); } - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return name.c_str(); } @@ -294,7 +294,7 @@ class EvalRankWithCache : public Metric { }; namespace { -double Finalize(MetaInfo const& info, double score, double sw) { +double Finalize(Context const*, MetaInfo const& info, double score, double sw) { std::array dat{score, sw}; collective::GlobalSum(info, &dat); std::tie(score, sw) = std::tuple_cat(dat); @@ -323,7 +323,7 @@ class EvalPrecision : public EvalRankWithCache { if (ctx_->IsCUDA()) { auto pre = cuda_impl::PreScore(ctx_, info, predt, p_cache); - return Finalize(info, pre.Residue(), pre.Weights()); + return Finalize(ctx_, info, pre.Residue(), pre.Weights()); } auto gptr = p_cache->DataGroupPtr(ctx_); @@ -352,7 +352,7 @@ class EvalPrecision : public EvalRankWithCache { } auto sum = std::accumulate(pre.cbegin(), pre.cend(), 0.0); - return Finalize(info, sum, sw); + return Finalize(ctx_, info, sum, sw); } }; @@ -369,7 +369,7 @@ class EvalNDCG : public EvalRankWithCache { std::shared_ptr p_cache) override { if (ctx_->IsCUDA()) { auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache); - return Finalize(info, ndcg.Residue(), ndcg.Weights()); + return Finalize(ctx_, info, ndcg.Residue(), ndcg.Weights()); } // group local ndcg @@ -415,7 +415,7 @@ class EvalNDCG : public EvalRankWithCache { sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0); } auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0); - return Finalize(info, ndcg, sum_w); + return Finalize(ctx_, info, ndcg, sum_w); } }; @@ -427,7 +427,7 @@ class EvalMAPScore : public EvalRankWithCache { std::shared_ptr p_cache) override { if (ctx_->IsCUDA()) { auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache); - return Finalize(info, map.Residue(), map.Weights()); + return Finalize(ctx_, info, map.Residue(), map.Weights()); } auto gptr = p_cache->DataGroupPtr(ctx_); @@ -469,7 +469,7 @@ class EvalMAPScore : public EvalRankWithCache { sw += weight[i]; } auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0); - return Finalize(info, sum, sw); + return Finalize(ctx_, info, sum, sw); } }; diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index 0625af25ad22..c13702a1917f 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -217,7 +217,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache { return Policy::GetFinal(dat[0], dat[1]); } - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return policy_.Name(); } diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 26d8f3440a8f..20305850a7f6 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -189,7 +189,7 @@ struct SparsePageView { explicit SparsePageView(SparsePage const *p) : 
base_rowid{p->base_rowid} { view = p->GetView(); } SparsePage::Inst operator[](size_t i) { return view[i]; } - size_t Size() const { return view.Size(); } + [[nodiscard]] size_t Size() const { return view.Size(); } }; struct SingleInstanceView { @@ -250,7 +250,7 @@ struct GHistIndexMatrixView { } return ret; } - size_t Size() const { return page_.Size(); } + [[nodiscard]] size_t Size() const { return page_.Size(); } }; template @@ -290,7 +290,7 @@ class AdapterView { return ret; } - size_t Size() const { return adapter_->NumRows(); } + [[nodiscard]] size_t Size() const { return adapter_->NumRows(); } bst_row_t const static base_rowid = 0; // NOLINT }; @@ -408,31 +408,33 @@ class ColumnSplitHelper { ColumnSplitHelper(ColumnSplitHelper &&) noexcept = delete; ColumnSplitHelper &operator=(ColumnSplitHelper &&) noexcept = delete; - void PredictDMatrix(DMatrix *p_fmat, std::vector *out_preds) { + void PredictDMatrix(Context const *ctx, DMatrix *p_fmat, std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; for (auto const &batch : p_fmat->GetBatches()) { CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * model_.learner_model_param->num_output_group); - PredictBatchKernel(SparsePageView{&batch}, out_preds); + PredictBatchKernel(ctx, SparsePageView{&batch}, out_preds); } } - void PredictInstance(SparsePage::Inst const &inst, std::vector *out_preds) { + void PredictInstance(Context const *ctx, SparsePage::Inst const &inst, + std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; - PredictBatchKernel(SingleInstanceView{inst}, out_preds); + PredictBatchKernel(ctx, SingleInstanceView{inst}, out_preds); } - void PredictLeaf(DMatrix *p_fmat, std::vector *out_preds) { + void PredictLeaf(Context const* ctx, DMatrix *p_fmat, std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; for (auto const &batch : p_fmat->GetBatches()) { CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * (tree_end_ - tree_begin_)); - PredictBatchKernel(SparsePageView{&batch}, out_preds); + PredictBatchKernel(ctx, SparsePageView{&batch}, + out_preds); } } @@ -453,12 +455,13 @@ class ColumnSplitHelper { std::fill(missing_storage_.begin(), missing_storage_.end(), 0); } - std::size_t BitIndex(std::size_t tree_id, std::size_t row_id, std::size_t node_id) const { + [[nodiscard]] std::size_t BitIndex(std::size_t tree_id, std::size_t row_id, + std::size_t node_id) const { size_t tree_index = tree_id - tree_begin_; return tree_offsets_[tree_index] * n_rows_ + row_id * tree_sizes_[tree_index] + node_id; } - void AllreduceBitVectors() { + void AllreduceBitVectors(Context const*) { collective::Allreduce(decision_storage_.data(), decision_storage_.size()); collective::Allreduce(missing_storage_.data(), @@ -547,7 +550,7 @@ class ColumnSplitHelper { } template - void PredictBatchKernel(DataView batch, std::vector *out_preds) { + void PredictBatchKernel(Context const* ctx, DataView batch, std::vector *out_preds) { auto const num_group = model_.learner_model_param->num_output_group; // parallel over local batch @@ -568,7 +571,7 @@ class ColumnSplitHelper { FVecDrop(block_size, fvec_offset, &feat_vecs_); }); - AllreduceBitVectors(); + AllreduceBitVectors(ctx); // auto block_id has the same type as `n_blocks`. 
common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) { @@ -646,7 +649,7 @@ class CPUPredictor : public Predictor { << "Predict DMatrix with column split" << MTNotImplemented(); ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end); - helper.PredictDMatrix(p_fmat, out_preds); + helper.PredictDMatrix(ctx_, p_fmat, out_preds); return; } @@ -779,7 +782,7 @@ class CPUPredictor : public Predictor { << "Predict instance with column split" << MTNotImplemented(); ColumnSplitHelper helper(this->ctx_->Threads(), model, 0, ntree_limit); - helper.PredictInstance(inst, out_preds); + helper.PredictInstance(ctx_, inst, out_preds); return; } @@ -811,7 +814,7 @@ class CPUPredictor : public Predictor { << "Predict leaf with column split" << MTNotImplemented(); ColumnSplitHelper helper(n_threads, model, 0, ntree_limit); - helper.PredictLeaf(p_fmat, &preds); + helper.PredictLeaf(ctx_, p_fmat, &preds); return; } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index e41248e29749..7fad0739758a 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -62,9 +62,7 @@ struct TreeView { cats.node_ptr = tree_cat_ptrs; } - __device__ bool HasCategoricalSplit() const { - return !cats.categories.empty(); - } + [[nodiscard]] __device__ bool HasCategoricalSplit() const { return !cats.categories.empty(); } }; struct SparsePageView { @@ -77,7 +75,7 @@ struct SparsePageView { common::Span row_ptr, bst_feature_t num_features) : d_data{data}, d_row_ptr{row_ptr}, num_features(num_features) {} - __device__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const { // Binary search auto begin_ptr = d_data.begin() + d_row_ptr[ridx]; auto end_ptr = d_data.begin() + d_row_ptr[ridx + 1]; @@ -105,8 +103,8 @@ struct SparsePageView { // Value is missing return nanf(""); } - XGBOOST_DEVICE size_t NumRows() const { return d_row_ptr.size() - 1; } - XGBOOST_DEVICE size_t NumCols() const { return num_features; } + [[nodiscard]] XGBOOST_DEVICE size_t NumRows() const { return d_row_ptr.size() - 1; } + [[nodiscard]] XGBOOST_DEVICE size_t NumCols() const { return num_features; } }; struct SparsePageLoader { @@ -137,7 +135,7 @@ struct SparsePageLoader { __syncthreads(); } } - __device__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const { if (use_shared) { return smem[threadIdx.x * data.num_features + fidx]; } else { @@ -151,7 +149,7 @@ struct EllpackLoader { XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_row_t, size_t, float) : matrix{m} {} - __device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const { auto gidx = matrix.GetBinIndex(ridx, fidx); if (gidx == -1) { return nan(""); diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 627bf4ca4fab..ceb322c28616 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -395,11 +395,11 @@ void GPUHistEvaluator::CopyToHost(const std::vector &nidx) { } } -void GPUHistEvaluator::EvaluateSplits( - const std::vector &nidx, bst_feature_t max_active_features, - common::Span d_inputs, - EvaluateSplitSharedInputs shared_inputs, - common::Span out_entries) { +void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector &nidx, + bst_feature_t 
max_active_features, + common::Span d_inputs, + EvaluateSplitSharedInputs shared_inputs, + common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); dh::TemporaryArray splits_out_storage(d_inputs.size()); @@ -417,19 +417,20 @@ void GPUHistEvaluator::EvaluateSplits( out_splits.size() * sizeof(DeviceSplitCandidate)); // Reduce to get the best candidate from all workers. - dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) { - out_splits[i] = all_candidates[i]; - for (auto rank = 1; rank < world_size; rank++) { - out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; - } - }); + dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(), + [world_size, all_candidates, out_splits] __device__(size_t i) { + out_splits[i] = all_candidates[i]; + for (auto rank = 1; rank < world_size; rank++) { + out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; + } + }); } auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_entries = out_entries; auto device_cats_accessor = this->DeviceCatStorage(nidx); // turn candidate into entry, along with handling sort based split. - dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable { + dh::LaunchN(d_inputs.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t i) mutable { auto const input = d_inputs[i]; auto &split = out_splits[i]; // Subtract parent gain here @@ -464,12 +465,12 @@ void GPUHistEvaluator::EvaluateSplits( this->CopyToHost(nidx); } -GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( - EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) { +GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input, + EvaluateSplitSharedInputs shared_inputs) { dh::device_vector inputs = std::vector{input}; dh::TemporaryArray out_entries(1); - this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs, - dh::ToSpan(out_entries)); + this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), + shared_inputs, dh::ToSpan(out_entries)); GPUExpandEntry root_entry; dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry), cudaMemcpyDeviceToHost)); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 7c61099a1a43..8c387f6324bb 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -193,7 +193,7 @@ class GPUHistEvaluator { /** * \brief Evaluate splits for left and right nodes. */ - void EvaluateSplits(const std::vector &nidx, + void EvaluateSplits(Context const* ctx, const std::vector &nidx, bst_feature_t max_active_features, common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, @@ -201,7 +201,7 @@ class GPUHistEvaluator { /** * \brief Evaluate splits for root node. 
*/ - GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input, + GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs); }; } // namespace tree diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 22eb7ab8185c..c473c9269580 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -16,8 +16,7 @@ #include "row_partitioner.cuh" #include "xgboost/base.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { namespace { struct Pair { GradientPair first; @@ -53,7 +52,8 @@ struct Clip : public thrust::unary_function { * * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree. */ -GradientQuantiser::GradientQuantiser(common::Span gpair, MetaInfo const& info) { +GradientQuantiser::GradientQuantiser(Context const*, common::Span gpair, + MetaInfo const& info) { using GradientSumT = GradientPairPrecise; using T = typename GradientSumT::ValueT; dh::XGBCachingDeviceAllocator alloc; @@ -99,7 +99,6 @@ GradientQuantiser::GradientQuantiser(common::Span gpair, Met static_cast(1) / to_floating_point_.GetHess()); } - XGBOOST_DEV_INLINE void AtomicAddGpairShared(xgboost::GradientPairInt64 *dest, xgboost::GradientPairInt64 const &gpair) { @@ -314,6 +313,4 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& dh::safe_cuda(cudaGetLastError()); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index c693e2e62e52..925c548936f4 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -39,18 +39,20 @@ private: GradientPairPrecise to_floating_point_; public: - GradientQuantiser(common::Span gpair, MetaInfo const& info); - XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { + GradientQuantiser(Context const* ctx, common::Span gpair, MetaInfo const& info); + [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), - gpair.GetHess() * to_fixed_point_.GetHess()); + gpair.GetHess() * to_fixed_point_.GetHess()); return adjusted; } - XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPairPrecise const& gpair) const { + [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 + ToFixedPoint(GradientPairPrecise const& gpair) const { auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), - gpair.GetHess() * to_fixed_point_.GetHess()); + gpair.GetHess() * to_fixed_point_.GetHess()); return adjusted; } - XGBOOST_DEVICE GradientPairPrecise ToFloatingPoint(const GradientPairInt64&gpair) const { + [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise + ToFloatingPoint(const GradientPairInt64& gpair) const { auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad(); auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess(); return {g,h}; diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index f378c78089db..033d2221ec89 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -171,7 +171,8 @@ class HistogramBuilder { } } - void SyncHistogram(RegTree const *p_tree, std::vector const &nodes_to_build, + void SyncHistogram(Context const *, RegTree const *p_tree, + std::vector const &nodes_to_build, std::vector const &nodes_to_trick) { auto n_total_bins = buffer_.TotalBins(); common::BlockedSpace2d space( 
@@ -277,14 +278,14 @@ class MultiHistogramBuilder { } for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) { - this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub); + this->target_builders_[t].SyncHistogram(ctx_, p_tree, nodes, dummy_sub); } } /** * @brief Build histogram for left and right child of valid candidates */ template - void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree, + void BuildHistLeftRight(Context const *ctx, DMatrix *p_fmat, RegTree const *p_tree, std::vector const &partitioners, std::vector const &valid_candidates, linalg::MatrixView gpair, BatchParam const ¶m, @@ -318,7 +319,7 @@ class MultiHistogramBuilder { } for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) { - this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub); + this->target_builders_[t].SyncHistogram(ctx, p_tree, nodes_to_build, nodes_to_sub); } } diff --git a/src/tree/hist/param.cc b/src/tree/hist/param.cc index 602566cd399e..bd8d7a85c407 100644 --- a/src/tree/hist/param.cc +++ b/src/tree/hist/param.cc @@ -12,7 +12,7 @@ namespace xgboost::tree { DMLC_REGISTER_PARAMETER(HistMakerTrainParam); -void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const { +void HistMakerTrainParam::CheckTreesSynchronized(Context const*, RegTree const* local_tree) const { if (!this->debug_synchronize) { return; } diff --git a/src/tree/hist/param.h b/src/tree/hist/param.h index 8757b65e6610..aa9d8cedf644 100644 --- a/src/tree/hist/param.h +++ b/src/tree/hist/param.h @@ -15,7 +15,7 @@ struct HistMakerTrainParam : public XGBoostParameter { bool debug_synchronize{false}; std::size_t max_cached_hist_node{DefaultNodes()}; - void CheckTreesSynchronized(RegTree const* local_tree) const; + void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const; // declare parameters DMLC_DECLARE_PARAMETER(HistMakerTrainParam) { diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 17e020ced5fc..3c37556e1091 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -140,7 +140,7 @@ class GloablApproxBuilder { std::vector const &gpair, common::Span hess) { monitor_->Start(__func__); this->histogram_builder_.BuildHistLeftRight( - p_fmat, p_tree, partitioner_, valid_candidates, + ctx_, p_fmat, p_tree, partitioner_, valid_candidates, linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess)); monitor_->Stop(__func__); } @@ -300,7 +300,7 @@ class GlobalApproxUpdater : public TreeUpdater { std::size_t t_idx = 0; for (auto p_tree : trees) { this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]); - hist_param_.CheckTreesSynchronized(p_tree); + hist_param_.CheckTreesSynchronized(ctx_, p_tree); ++t_idx; } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 6db201dd5277..3c9c61f88847 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -246,7 +246,7 @@ struct GPUHistMakerDevice { this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, dmat->Info().IsColumnSplit(), ctx_->Device()); - quantiser = std::make_unique(this->gpair, dmat->Info()); + quantiser = std::make_unique(ctx_, this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating row_partitioner = std::make_unique(ctx_->Device(), sample.sample_rows); @@ -276,7 +276,7 @@ struct GPUHistMakerDevice { matrix.min_fvalue, matrix.is_dense && !collective::IsDistributed() }; - auto split = 
this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs); + auto split = this->evaluator_.EvaluateSingleSplit(ctx_, inputs, shared_inputs); return split; } @@ -329,7 +329,7 @@ struct GPUHistMakerDevice { d_node_inputs.data().get(), h_node_inputs.data(), h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); - this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs), + this->evaluator_.EvaluateSplits(ctx_, nidx, max_active_features, dh::ToSpan(d_node_inputs), shared_inputs, dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), @@ -842,7 +842,7 @@ class GPUHistMaker : public TreeUpdater { std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(tree); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); ++t_idx; } dh::safe_cuda(cudaGetLastError()); @@ -985,7 +985,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(tree); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); ++t_idx; } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 50943e1c403a..375b24cfa97a 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -228,8 +228,8 @@ class MultiTargetHistBuilder { std::vector const &valid_candidates, linalg::MatrixView gpair) { monitor_->Start(__func__); - histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair, - HistBatch(param_)); + histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_, valid_candidates, + gpair, HistBatch(param_)); monitor_->Stop(__func__); } @@ -436,8 +436,8 @@ class HistUpdater { std::vector const &valid_candidates, linalg::MatrixView gpair) { monitor_->Start(__func__); - this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, - gpair, HistBatch(param_)); + this->histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_, + valid_candidates, gpair, HistBatch(param_)); monitor_->Stop(__func__); } @@ -537,7 +537,7 @@ class QuantileHistMaker : public TreeUpdater { h_out_position, *tree_it); } - hist_param_.CheckTreesSynchronized(*tree_it); + hist_param_.CheckTreesSynchronized(ctx_, *tree_it); } } diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 4782f9580567..92d8ff753603 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -360,25 +360,27 @@ TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) { } template -auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) { +auto MakeUnweightedCutsForTest(Context const* ctx, Adapter adapter, int32_t num_bins, float missing, + size_t batch_size = 0) { common::HistogramCuts batched_cuts; HostDeviceVector ft; SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), DeviceOrd::CUDA(0)); MetaInfo info; AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); - sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(ctx, &batched_cuts, info.IsColumnSplit()); return batched_cuts; 
} template -void ValidateBatchedCuts(Adapter adapter, int num_bins, DMatrix* dmat, size_t batch_size = 0) { +void ValidateBatchedCuts(Context const* ctx, Adapter adapter, int num_bins, DMatrix* dmat, size_t batch_size = 0) { common::HistogramCuts batched_cuts = MakeUnweightedCutsForTest( - adapter, num_bins, std::numeric_limits::quiet_NaN(), batch_size); + ctx, adapter, num_bins, std::numeric_limits::quiet_NaN(), batch_size); ValidateCuts(batched_cuts, dmat, num_bins); } TEST(HistUtil, AdapterDeviceSketch) { + auto ctx = MakeCUDACtx(0); int rows = 5; int cols = 1; int num_bins = 4; @@ -391,8 +393,8 @@ TEST(HistUtil, AdapterDeviceSketch) { data::CupyAdapter adapter(str); - auto device_cuts = MakeUnweightedCutsForTest(adapter, num_bins, missing); - Context ctx; + auto device_cuts = MakeUnweightedCutsForTest(&ctx, adapter, num_bins, missing); + ctx = ctx.MakeCPU(); auto host_cuts = GetHostCuts(&ctx, &adapter, num_bins, missing); EXPECT_EQ(device_cuts.Values(), host_cuts.Values()); @@ -401,6 +403,7 @@ TEST(HistUtil, AdapterDeviceSketch) { } TEST(HistUtil, AdapterDeviceSketchMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -410,7 +413,8 @@ TEST(HistUtil, AdapterDeviceSketchMemory) { dh::GlobalMemoryLogger().Clear(); ConsoleLogger::Configure({{"verbosity", "3"}}); - auto cuts = MakeUnweightedCutsForTest(adapter, num_bins, std::numeric_limits::quiet_NaN()); + auto cuts = + MakeUnweightedCutsForTest(&ctx, adapter, num_bins, std::numeric_limits::quiet_NaN()); ConsoleLogger::Configure({{"verbosity", "0"}}); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, false); @@ -419,6 +423,7 @@ TEST(HistUtil, AdapterDeviceSketchMemory) { } TEST(HistUtil, AdapterSketchSlidingWindowMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -435,7 +440,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, false); EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05); @@ -444,6 +449,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { } TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -465,7 +471,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { &sketch_container); HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); ConsoleLogger::Configure({{"verbosity", "0"}}); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, true); @@ -475,6 +481,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { void TestCategoricalSketchAdapter(size_t n, size_t num_categories, int32_t num_bins, bool weighted) { + auto ctx = MakeCUDACtx(0); auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories); thrust::device_vector x(h_x); auto adapter = AdapterFromData(x, n, 1); @@ -498,7 +505,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, AdapterDeviceSketch(adapter.Value(), num_bins, info, 
std::numeric_limits::quiet_NaN(), &container); HistogramCuts cuts; - container.MakeCuts(&cuts, info.IsColumnSplit()); + container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); thrust::sort(x.begin(), x.end()); auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin(); @@ -522,6 +529,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, TEST(HistUtil, AdapterDeviceSketchCategorical) { auto categorical_sizes = {2, 6, 8, 12}; int num_bins = 256; + auto ctx = MakeCUDACtx(0); auto sizes = {25, 100, 1000}; for (auto n : sizes) { for (auto num_categories : categorical_sizes) { @@ -529,7 +537,7 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) { auto dmat = GetDMatrixFromData(x, n, 1); auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, n, 1); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); TestCategoricalSketchAdapter(n, num_categories, num_bins, true); TestCategoricalSketchAdapter(n, num_categories, num_bins, false); } @@ -540,13 +548,14 @@ TEST(HistUtil, AdapterDeviceSketchMultipleColumns) { auto bin_sizes = {2, 16, 256, 512}; auto sizes = {100, 1000, 1500}; int num_columns = 5; + auto ctx = MakeCUDACtx(0); for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); auto x_device = thrust::device_vector(x); for (auto num_bins : bin_sizes) { auto adapter = AdapterFromData(x_device, num_rows, num_columns); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); } } } @@ -556,12 +565,13 @@ TEST(HistUtil, AdapterDeviceSketchBatches) { int num_rows = 5000; auto batch_sizes = {0, 100, 1500, 6000}; int num_columns = 5; + auto ctx = MakeCUDACtx(0); for (auto batch_size : batch_sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, num_rows, num_columns); - ValidateBatchedCuts(adapter, num_bins, dmat.get(), batch_size); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get(), batch_size); } } @@ -647,12 +657,12 @@ TEST(HistUtil, SketchingEquivalent) { auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, num_rows, num_columns); common::HistogramCuts adapter_cuts = MakeUnweightedCutsForTest( - adapter, num_bins, std::numeric_limits::quiet_NaN()); + &ctx, adapter, num_bins, std::numeric_limits::quiet_NaN()); EXPECT_EQ(dmat_cuts.Values(), adapter_cuts.Values()); EXPECT_EQ(dmat_cuts.Ptrs(), adapter_cuts.Ptrs()); EXPECT_EQ(dmat_cuts.MinValues(), adapter_cuts.MinValues()); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); } } } @@ -702,7 +712,7 @@ void TestAdapterSketchFromWeights(bool with_group) { .Device(DeviceOrd::CUDA(0)) .GenerateArrayInterface(&storage); MetaInfo info; - Context ctx; + auto ctx = MakeCUDACtx(0); auto& h_weights = info.weights_.HostVector(); if (with_group) { h_weights.resize(kGroups); @@ -731,7 +741,7 @@ void TestAdapterSketchFromWeights(bool with_group) { &sketch_container); common::HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols); if (with_group) { @@ -744,10 +754,9 @@ void TestAdapterSketchFromWeights(bool with_group) { 
ASSERT_EQ(cuts.Ptrs().size(), kCols + 1); ValidateCuts(cuts, dmat.get(), kBins); - auto cuda_ctx = MakeCUDACtx(0); if (with_group) { dmat->Info().weights_ = decltype(dmat->Info().weights_)(); // remove weight - HistogramCuts non_weighted = DeviceSketch(&cuda_ctx, dmat.get(), kBins, 0); + HistogramCuts non_weighted = DeviceSketch(&ctx, dmat.get(), kBins, 0); for (size_t i = 0; i < cuts.Values().size(); ++i) { ASSERT_EQ(cuts.Values()[i], non_weighted.Values()[i]); } @@ -773,7 +782,7 @@ void TestAdapterSketchFromWeights(bool with_group) { SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)}; AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); - sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &weighted, info.IsColumnSplit()); ValidateCuts(weighted, dmat.get(), kBins); } } diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 343f59cda3ea..9fa1566ea130 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -86,7 +86,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) { } HistogramCuts distributed_cuts; - sketch_distributed.MakeCuts(m->Info(), &distributed_cuts); + sketch_distributed.MakeCuts(&ctx, m->Info(), &distributed_cuts); // Generate cuts for single node environment collective::Finalize(); @@ -117,7 +117,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) { } HistogramCuts single_node_cuts; - sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts); + sketch_on_single_node.MakeCuts(&ctx, m->Info(), &single_node_cuts); auto const& sptrs = single_node_cuts.Ptrs(); auto const& dptrs = distributed_cuts.Ptrs(); @@ -220,7 +220,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) { } } - sketch_distributed.MakeCuts(m->Info(), &distributed_cuts); + sketch_distributed.MakeCuts(&ctx, m->Info(), &distributed_cuts); } // Generate cuts for single node environment @@ -243,7 +243,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) { } } - sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts); + sketch_on_single_node.MakeCuts(&ctx, m->Info(), &single_node_cuts); } auto const& sptrs = single_node_cuts.Ptrs(); diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 49353439f21a..26bd05524ded 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -370,6 +370,7 @@ void TestAllReduceBasic() { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { auto const device = DeviceOrd::CUDA(GPUIDX); + auto ctx = MakeCUDACtx(device.ordinal); // Set up single node version; HostDeviceVector ft({}, device); @@ -413,7 +414,7 @@ void TestAllReduceBasic() { AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_distributed); - sketch_distributed.AllReduce(false); + sketch_distributed.AllReduce(&ctx, false); sketch_distributed.Unique(); ASSERT_EQ(sketch_distributed.ColumnsPtr().size(), @@ -517,6 +518,7 @@ void TestSameOnAllWorkers() { MetaInfo const &info) { auto const rank = collective::GetRank(); auto const device = DeviceOrd::CUDA(GPUIDX); + Context ctx = MakeCUDACtx(device.ordinal); HostDeviceVector ft({}, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); HostDeviceVector storage({}, device); @@ -528,7 +530,7 @@ void TestSameOnAllWorkers() { AdapterDeviceSketch(adapter.Value(), n_bins, 
                         info, std::numeric_limits::quiet_NaN(), &sketch_distributed);
-    sketch_distributed.AllReduce(false);
+    sketch_distributed.AllReduce(&ctx, false);
     sketch_distributed.Unique();
     TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h
index 3dd0c3a1fe15..85f2e014bdf2 100644
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -73,6 +73,7 @@ void RunWithFederatedCommunicator(int32_t world_size, std::string const& server_
   auto run = [&](auto rank) {
     Json config{JsonObject()};
     config["xgboost_communicator"] = String("federated");
+    config["federated_secure"] = false;
     config["federated_server_address"] = String(server_address);
     config["federated_world_size"] = world_size;
     config["federated_rank"] = rank;
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 6922577485ba..04f1d35b4002 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -2,6 +2,7 @@
  * Copyright (c) 2017-2023, XGBoost contributors
  */
 #include
+#include
 #include  // for Learner
 #include  // for LogCheck_NE, CHECK_NE, LogCheck_EQ
 #include  // for ObjFunction
@@ -81,7 +82,9 @@ TEST(Learner, ParameterValidation) {
   // whitespace
   learner->SetParam("tree method", "exact");
-  EXPECT_THROW(learner->Configure(), dmlc::Error);
+  EXPECT_THAT([&] { learner->Configure(); },
+              ::testing::ThrowsMessage(
+                  ::testing::HasSubstr(R"("tree method" contains whitespace)")));
 }
 
 TEST(Learner, CheckGroup) {
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index 7d5f15a1c47e..862bc6bfcca9 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -19,14 +19,15 @@ auto ZeroParam() {
 }
 }  // anonymous namespace
-inline GradientQuantiser DummyRoundingFactor() {
+inline GradientQuantiser DummyRoundingFactor(Context const* ctx) {
   thrust::device_vector gpair(1);
   gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return {dh::ToSpan(gpair), MetaInfo()};
+  return {ctx, dh::ToSpan(gpair), MetaInfo()};
 }
-thrust::device_vector ConvertToInteger(std::vector x) {
-  auto r = DummyRoundingFactor();
+thrust::device_vector ConvertToInteger(Context const* ctx,
+                                       std::vector x) {
+  auto r = DummyRoundingFactor(ctx);
   std::vector y(x.size());
   for (std::size_t i = 0; i < x.size(); i++) {
     y[i] = r.ToFixedPoint(GradientPair(x[i]));
   }
@@ -41,11 +42,12 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
   cuts_.cut_ptrs_.SetDevice(ctx.Device());
   cuts_.cut_values_.SetDevice(ctx.Device());
   cuts_.min_vals_.SetDevice(ctx.Device());
-  thrust::device_vector feature_histogram{ConvertToInteger(feature_histogram_)};
+  thrust::device_vector feature_histogram{
+      ConvertToInteger(&ctx, feature_histogram_)};
   dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical);
   auto d_feature_types = dh::ToSpan(feature_types);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   EvaluateSplitInputs input{1, 0, quantiser.ToFixedPoint(parent_sum_), dh::ToSpan(feature_set),
                             dh::ToSpan(feature_histogram)};
   EvaluateSplitSharedInputs shared_inputs{param,
@@ -60,7 +62,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
   evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false,
                   ctx.Device());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   ASSERT_EQ(result.thresh, 1);
   this->CheckResult(result.loss_chg, result.findex, result.fvalue, result.is_cat,
@@ -90,7 +92,7 @@ TEST(GpuHist, PartitionBasic) {
       *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end());
   cuts.SetCategorical(true, max_cat);
   d_feature_types = dh::ToSpan(feature_types);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   EvaluateSplitSharedInputs shared_inputs{
       param,
       quantiser,
@@ -108,10 +110,10 @@ TEST(GpuHist, PartitionBasic) {
     // -1.0s go right
     // -3.0s go left
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}});
     EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.dir, kLeftDir);
     EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000"));
@@ -122,10 +124,10 @@ TEST(GpuHist, PartitionBasic) {
     // -1.0s go right
     // -3.0s go left
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-7.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}});
     EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.dir, kLeftDir);
     EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000"));
@@ -134,10 +136,10 @@ TEST(GpuHist, PartitionBasic) {
   {
     // All -1.0, gain from splitting should be 0.0
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-3.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
     EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     EXPECT_EQ(result.dir, kLeftDir);
     EXPECT_FLOAT_EQ(result.loss_chg, 0.0f);
     EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
@@ -147,10 +149,10 @@ TEST(GpuHist, PartitionBasic) {
   // value
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 6.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
     EvaluateSplitInputs input{3, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000"));
     EXPECT_EQ(result.dir, kLeftDir);
@@ -160,10 +162,10 @@ TEST(GpuHist, PartitionBasic) {
     // -1.0s go right
     // -3.0s go left
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}});
     EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.dir, kLeftDir);
     EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000"));
@@ -173,10 +175,10 @@ TEST(GpuHist, PartitionBasic) {
     // -1.0s go right
     // -3.0s go left
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}});
     EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000"));
     EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
@@ -205,7 +207,7 @@ TEST(GpuHist, PartitionTwoFeatures) {
       *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end());
   cuts.SetCategorical(true, max_cat);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   EvaluateSplitSharedInputs shared_inputs{param,
                                           quantiser,
                                           d_feature_types,
@@ -220,10 +222,10 @@ TEST(GpuHist, PartitionTwoFeatures) {
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
     auto feature_histogram = ConvertToInteger(
-        {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+        &ctx, {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
     EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.findex, 1);
     EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000"));
@@ -233,10 +235,10 @@ TEST(GpuHist, PartitionTwoFeatures) {
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
     auto feature_histogram = ConvertToInteger(
-        {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}});
+        &ctx, {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}});
     EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.findex, 1);
     EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000"));
@@ -266,7 +268,7 @@ TEST(GpuHist, PartitionTwoNodes) {
       *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end());
   cuts.SetCategorical(true, max_cat);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   EvaluateSplitSharedInputs shared_inputs{param,
                                           quantiser,
                                           d_feature_types,
@@ -283,15 +285,16 @@ TEST(GpuHist, PartitionTwoNodes) {
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
     auto feature_histogram_a = ConvertToInteger(
-        {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+        &ctx, {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
     thrust::device_vector inputs(2);
     inputs[0] = EvaluateSplitInputs{0, 0, parent_sum, dh::ToSpan(feature_set),
                                     dh::ToSpan(feature_histogram_a)};
-    auto feature_histogram_b = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+    auto feature_histogram_b = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
     inputs[1] = EvaluateSplitInputs{1, 0, parent_sum, dh::ToSpan(feature_set),
                                     dh::ToSpan(feature_histogram_b)};
     thrust::device_vector results(2);
-    evaluator.EvaluateSplits({0, 1}, 1, dh::ToSpan(inputs), shared_inputs, dh::ToSpan(results));
+    evaluator.EvaluateSplits(&ctx, {0, 1}, 1, dh::ToSpan(inputs), shared_inputs,
+                             dh::ToSpan(results));
     EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(0)[0]),
               std::bitset<32>("10000000000000000000000000000000"));
     EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(1)[0]),
@@ -301,7 +304,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
   auto ctx = MakeCUDACtx(0);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
@@ -311,7 +314,8 @@ void TestEvaluateSingleSplit(bool is_categorical) {
   thrust::device_vector feature_set = std::vector{0, 1};
   // Setup gradients so that second feature gets higher gain
-  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
+  auto feature_histogram =
+      ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
   dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical);
   common::Span d_feature_types;
@@ -336,7 +340,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                           ctx.Device()};
   evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
                   ctx.Device());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
   if (is_categorical) {
@@ -352,7 +356,8 @@
 TEST(GpuHist, EvaluateSingleSplit) { TestEvaluateSingleSplit(false); }
 TEST(GpuHist, EvaluateSingleCategoricalSplit) { TestEvaluateSingleSplit(true); }
 TEST(GpuHist, EvaluateSingleSplitMissing) {
-  auto quantiser = DummyRoundingFactor();
+  auto ctx = MakeCUDACtx(0);
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
@@ -361,7 +366,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
   thrust::device_vector feature_segments = std::vector{0, 2};
   thrust::device_vector feature_values = std::vector{1.0, 2.0};
   thrust::device_vector feature_min_values = std::vector{0.0};
-  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}});
+  auto feature_histogram = ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}});
   EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                             dh::ToSpan(feature_histogram)};
   EvaluateSplitSharedInputs shared_inputs{param,
@@ -373,7 +378,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
                                           false};
   GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 0);
   EXPECT_EQ(result.fvalue, 1.0);
@@ -383,14 +388,15 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
 }
 TEST(GpuHist, EvaluateSingleSplitEmpty) {
+  auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
   GPUHistEvaluator evaluator(tparam, 1, FstCU());
   DeviceSplitCandidate result =
       evaluator
           .EvaluateSingleSplit(
-              EvaluateSplitInputs{},
+              &ctx, EvaluateSplitInputs{},
               EvaluateSplitSharedInputs{
-                  GPUTrainingParam(tparam), DummyRoundingFactor(), {}, {}, {}, {}, false})
+                  GPUTrainingParam(tparam), DummyRoundingFactor(&ctx), {}, {}, {}, {}, false})
           .split;
   EXPECT_EQ(result.findex, -1);
   EXPECT_LT(result.loss_chg, 0.0f);
@@ -398,7 +404,8 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) {
 // Feature 0 has a better split, but the algorithm must select feature 1
 TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
-  auto quantiser = DummyRoundingFactor();
+  auto ctx = MakeCUDACtx(0);
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
@@ -408,7 +415,8 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
   thrust::device_vector feature_segments = std::vector{0, 2, 4};
   thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector feature_min_values = std::vector{0.0, 10.0};
-  auto feature_histogram = ConvertToInteger({{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  auto feature_histogram =
+      ConvertToInteger(&ctx, {{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
   EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                             dh::ToSpan(feature_histogram)};
   EvaluateSplitSharedInputs shared_inputs{param,
@@ -420,7 +428,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
                                           false};
   GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
   EXPECT_EQ(result.fvalue, 11.0);
@@ -430,7 +438,8 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
 // Features 0 and 1 have identical gain, the algorithm must select 0
 TEST(GpuHist, EvaluateSingleSplitBreakTies) {
-  auto quantiser = DummyRoundingFactor();
+  auto ctx = MakeCUDACtx(0);
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
@@ -440,7 +449,8 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
   thrust::device_vector feature_segments = std::vector{0, 2, 4};
   thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector feature_min_values = std::vector{0.0, 10.0};
-  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  auto feature_histogram =
+      ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
   EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                             dh::ToSpan(feature_histogram)};
   EvaluateSplitSharedInputs shared_inputs{param,
@@ -452,15 +462,16 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
                                           false};
   GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 0);
   EXPECT_EQ(result.fvalue, 1.0);
 }
 TEST(GpuHist, EvaluateSplits) {
+  auto ctx = MakeCUDACtx(0);
   thrust::device_vector out_splits(2);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
@@ -471,9 +482,9 @@ TEST(GpuHist, EvaluateSplits) {
   thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector feature_min_values = std::vector{0.0, 0.0};
   auto feature_histogram_left =
-      ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
+      ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
   auto feature_histogram_right =
-      ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+      ConvertToInteger(&ctx, {{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
   EvaluateSplitInputs input_left{1, 0, parent_sum, dh::ToSpan(feature_set),
                                  dh::ToSpan(feature_histogram_left)};
   EvaluateSplitInputs input_right{2, 0, parent_sum, dh::ToSpan(feature_set),
@@ -514,7 +525,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
   evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device());
   // Convert the sample histogram to fixed point
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   thrust::host_vector h_hist;
   for (auto e : hist_[0]) { h_hist.push_back(quantiser.ToFixedPoint(e)); }
@@ -531,7 +542,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
                                           cuts_.cut_values_.ConstDeviceSpan(),
                                           cuts_.min_vals_.ConstDeviceSpan(),
                                           false};
-  auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  auto split = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   ASSERT_NEAR(split.loss_chg, best_score_, 1e-2);
 }
@@ -541,7 +552,7 @@
 namespace {
 void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
   auto ctx = MakeCUDACtx(GPUIDX);
   auto rank = collective::GetRank();
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
@@ -552,8 +563,8 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
   thrust::device_vector feature_set = std::vector{0, 1};
   // Setup gradients so that second feature gets higher gain
-  auto feature_histogram = rank == 0 ? ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}})
-                                     : ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}});
+  auto feature_histogram = rank == 0 ? ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}})
+                                     : ConvertToInteger(&ctx, {{-1.0, 0.5}, {1.0, 0.5}});
   dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical);
   common::Span d_feature_types;
@@ -576,7 +587,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
   GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()};
   evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1) << "rank: " << rank;
   if (is_categorical) {
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 0c91cf21e7f1..f7f2e27ea1be 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -37,7 +37,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
     FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size,
                                  sizeof(GradientPairInt64));
-    auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
+    auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
     BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
                            feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
                            d_histogram, quantiser);
@@ -51,7 +51,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
       dh::device_vector new_histogram(num_bins);
       auto d_new_histogram = dh::ToSpan(new_histogram);
-      auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
+      auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
       BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
                              feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
                              d_new_histogram, quantiser);
@@ -129,7 +129,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   dh::device_vector cat_hist(num_categories);
   auto gpair = GenerateRandomGradients(kRows, 0, 2);
   gpair.SetDevice(DeviceOrd::CUDA(0));
-  auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
+  auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
   /**
    * Generate hist with cat data.
    */
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 8949b5f4be2a..25a800367c49 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -181,7 +181,7 @@ void TestSyncHist(bool is_distributed) {
   histogram.Buffer().Reset(1, n_nodes, space, target_hists);
   // sync hist
-  histogram.SyncHistogram(&tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);
+  histogram.SyncHistogram(&ctx, &tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);
   using GHistRowT = common::GHistRow;
   auto check_hist = [](const GHistRowT parent, const GHistRowT left, const GHistRowT right,
@@ -266,7 +266,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
     histogram.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
                         linalg::MakeTensorView(&ctx, gpair, gpair.size()), force_read_by_column);
   }
-  histogram.SyncHistogram(&tree, nodes_to_build, {});
+  histogram.SyncHistogram(&ctx, &tree, nodes_to_build, {});
   // Check if number of histogram bins is correct
   ASSERT_EQ(histogram.Histogram()[nid].size(), gmat.cut.Ptrs().back());
@@ -366,7 +366,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
                         linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
                         force_read_by_column);
   }
-  cat_hist.SyncHistogram(&tree, nodes_to_build, {});
+  cat_hist.SyncHistogram(&ctx, &tree, nodes_to_build, {});
   /**
    * Generate hist with one hot encoded data.
@@ -382,7 +382,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
                            linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
                            force_read_by_column);
   }
-  onehot_hist.SyncHistogram(&tree, nodes_to_build, {});
+  onehot_hist.SyncHistogram(&ctx, &tree, nodes_to_build, {});
   auto cat = cat_hist.Histogram()[0];
   auto onehot = onehot_hist.Histogram()[0];
@@ -451,7 +451,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
                             force_read_by_column);
       ++page_idx;
     }
-    multi_build.SyncHistogram(&tree, nodes, {});
+    multi_build.SyncHistogram(ctx, &tree, nodes, {});
     multi_page = multi_build.Histogram()[RegTree::kRoot];
   }
@@ -480,7 +480,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
     single_build.BuildHist(0, space, gmat, row_set_collection, nodes,
                            linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()),
                            force_read_by_column);
-    single_build.SyncHistogram(&tree, nodes, {});
+    single_build.SyncHistogram(ctx, &tree, nodes, {});
     single_page = single_build.Histogram()[RegTree::kRoot];
   }
@@ -570,7 +570,7 @@ class OverflowTest : public ::testing::TestWithParam> {
     CHECK_NE(partitioners.front()[tree.RightChild(best.nid)].Size(), 0);
     hist_builder.BuildHistLeftRight(
-        Xy.get(), &tree, partitioners, valid_candidates,
+        &ctx, Xy.get(), &tree, partitioners, valid_candidates,
        linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1), batch);
     if (limit) {
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index accfbae082c7..6f937351ea23 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -111,7 +111,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   maker.hist.AllocateHistograms({0});
   maker.gpair = gpair.DeviceSpan();
-  maker.quantiser = std::make_unique(maker.gpair, MetaInfo());
+  maker.quantiser = std::make_unique(&ctx, maker.gpair, MetaInfo());
   maker.page = page.get();
   maker.InitFeatureGroupsOnce();
@@ -162,12 +162,6 @@ HistogramCutsWrapper GetHostCutMatrix () {
   return cmat;
 }
 
-inline GradientQuantiser DummyRoundingFactor() {
-  thrust::device_vector gpair(1);
-  gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return {dh::ToSpan(gpair), MetaInfo()};
-}
-
 void TestHistogramIndexImpl() {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory