diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 04b489d8b061..69176994bc0b 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -178,7 +178,7 @@ class MetaInfo { * in vertical federated learning, since each worker loads its own list of columns, * we need to sum them. */ - void SynchronizeNumberOfColumns(); + void SynchronizeNumberOfColumns(Context const* ctx); /*! \brief Whether the data is split row-wise. */ bool IsRowSplit() const { diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 901c9ae914de..8806818fbc22 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -582,20 +582,20 @@ auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) { // NOL return TensorView{data, in_shape, ctx->Device()}; } -template -LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span data, S &&...shape) { +template +LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span data, S &&...shape) { std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); return TensorView{data, in_shape, device}; } -template -auto MakeTensorView(Context const *ctx, common::Span data, S &&...shape) { +template +auto MakeTensorView(Context const *ctx, common::Span data, S &&...shape) { return MakeTensorView(ctx->Device(), data, std::forward(shape)...); } -template -auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&...shape) { +template +auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&...shape) { std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); return TensorView{data, in_shape, ctx->Device(), order}; diff --git a/plugin/federated/federated_coll.cc b/plugin/federated/federated_coll.cc index 7c25eeba5ad5..980992d61100 100644 --- a/plugin/federated/federated_coll.cc +++ b/plugin/federated/federated_coll.cc @@ -29,7 +29,7 @@ namespace { auto stub = fed->Handle(); BroadcastRequest request; - request.set_sequence_number(*sequence_number++); + request.set_sequence_number((*sequence_number)++); request.set_rank(comm.Rank()); if (comm.Rank() != root) { request.set_send_buffer(nullptr, 0); @@ -90,9 +90,9 @@ Coll *FederatedColl::MakeCUDAVar() { [[nodiscard]] Result FederatedColl::Broadcast(Comm const &comm, common::Span data, std::int32_t root) { if (comm.Rank() == root) { - return BroadcastImpl(comm, &sequence_number_, data, root); + return BroadcastImpl(comm, &this->sequence_number_, data, root); } else { - return BroadcastImpl(comm, &sequence_number_, data, root); + return BroadcastImpl(comm, &this->sequence_number_, data, root); } } diff --git a/src/collective/allreduce.cc b/src/collective/allreduce.cc index 6948f6758016..65c06686860b 100644 --- a/src/collective/allreduce.cc +++ b/src/collective/allreduce.cc @@ -62,6 +62,9 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span data, Result RingAllreduce(Comm const& comm, common::Span data, Func const& op, ArrayInterfaceHandler::Type type) { + if (comm.World() == 1) { + return Success(); + } return DispatchDType(type, [&](auto t) { using T = decltype(t); // Divide the data into segments according to the number of workers. 
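Side note on the `RingAllreduce` hunk above: the added `if (comm.World() == 1) return Success();` guard short-circuits the collective when only one worker participates, since the local buffer is already the global result. The snippet below is an illustrative, standalone sketch of that fast-path pattern only; `Comm`, `Result`, and `RingAllreduceSketch` are simplified stand-ins and not the actual XGBoost collective API.

```cpp
#include <functional>
#include <vector>

struct Comm {
  int world{1};                        // number of participating workers
  int World() const { return world; }
};

struct Result {
  bool ok{true};
};
inline Result Success() { return {}; }

// Reduce `data` across workers with `op`; with a single worker there is nothing
// to exchange, so we return immediately and skip the ring entirely.
Result RingAllreduceSketch(Comm const& comm, std::vector<double>* data,
                           std::function<double(double, double)> const& op) {
  if (comm.World() == 1) {
    return Success();                  // single-worker fast path, as in the hunk above
  }
  // ... scatter-reduce followed by allgather over the ring would go here ...
  (void)op;
  (void)data;
  return Success();
}

int main() {
  Comm comm;                           // defaults to a world size of 1
  std::vector<double> grad{1.0, 2.0, 3.0};
  auto res = RingAllreduceSketch(comm, &grad, std::plus<double>{});
  return res.ok ? 0 : 1;
}
```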
diff --git a/src/collective/comm.cu b/src/collective/comm.cu index 09faf31cd641..09edc522df16 100644 --- a/src/collective/comm.cu +++ b/src/collective/comm.cu @@ -10,6 +10,7 @@ #include // for stringstream #include // for vector +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/device_helpers.cuh" // for DefaultStream #include "../common/type.h" // for EraseType #include "broadcast.h" // for Broadcast @@ -60,7 +61,7 @@ Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const { NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr pimpl) : Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(), root.TaskID()}, - stream_{dh::DefaultStream()} { + stream_{ctx->CUDACtx()->Stream()} { this->world_ = root.World(); this->rank_ = root.Rank(); this->domain_ = root.Domain(); diff --git a/src/collective/comm_group.cc b/src/collective/comm_group.cc index 57050084324d..3d2e24492b04 100644 --- a/src/collective/comm_group.cc +++ b/src/collective/comm_group.cc @@ -105,7 +105,7 @@ CommGroup::CommGroup() } std::unique_ptr& GlobalCommGroup() { - static std::unique_ptr sptr; + static thread_local std::unique_ptr sptr; if (!sptr) { Json config{Null{}}; sptr.reset(CommGroup::Create(config)); diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 89b3ad2e6a17..74336ac61a82 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -480,7 +480,8 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { // Configure allocator with maximum cached bin size of ~1GB and no limit on // maximum cached bytes - thread_local cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29); + thread_local std::unique_ptr allocator{ + std::make_unique(2, 9, 29)}; return *allocator; } pointer allocate(size_t n) { // NOLINT diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 65ab18630e1f..f101247920a4 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -51,7 +51,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins for (auto const &page : m->GetBatches()) { container.PushRowPage(page, info, hessian); } - container.MakeCuts(m->Info(), &out); + container.MakeCuts(ctx, m->Info(), &out); } else { SortedSketchContainer container{ctx, max_bins, @@ -61,7 +61,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins for (auto const &page : m->GetBatches(ctx)) { container.PushColPage(page, info, hessian); } - container.MakeCuts(m->Info(), &out); + container.MakeCuts(ctx, m->Info(), &out); } return out; diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 1f06c2a6fdf4..fbe6356bf501 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b } } - sketch_container.MakeCuts(&cuts, p_fmat->Info().IsColumnSplit()); + sketch_container.MakeCuts(ctx, &cuts, p_fmat->Info().IsColumnSplit()); return cuts; } } // namespace xgboost::common diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 5250abd0f69b..0490add264ac 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -11,9 +11,7 @@ #include "categorical.h" #include "hist_util.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { template SketchContainerImpl::SketchContainerImpl(Context const *ctx, std::vector 
columns_size, @@ -129,7 +127,7 @@ struct QuantileAllreduce { * \param rank rank of target worker * \param fidx feature idx */ - auto Values(int32_t rank, bst_feature_t fidx) const { + [[nodiscard]] auto Values(int32_t rank, bst_feature_t fidx) const { // get span for worker auto wsize = worker_indptr[rank + 1] - worker_indptr[rank]; auto worker_values = global_values.subspan(worker_indptr[rank], wsize); @@ -145,7 +143,7 @@ struct QuantileAllreduce { template void SketchContainerImpl::GatherSketchInfo( - MetaInfo const& info, + Context const *, MetaInfo const &info, std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches) { @@ -206,7 +204,7 @@ void SketchContainerImpl::GatherSketchInfo( } template -void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { +void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo const& info) { auto world_size = collective::GetWorldSize(); auto rank = collective::GetRank(); if (world_size == 1 || info.IsColumnSplit()) { @@ -274,16 +272,15 @@ void SketchContainerImpl::AllreduceCategories(MetaInfo const& info) { template void SketchContainerImpl::AllReduce( - MetaInfo const& info, - std::vector *p_reduced, - std::vector* p_num_cuts) { + Context const *ctx, MetaInfo const &info, + std::vector *p_reduced, std::vector *p_num_cuts) { monitor_.Start(__func__); size_t n_columns = sketches_.size(); collective::Allreduce(&n_columns, 1); CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers"; - AllreduceCategories(info); + AllreduceCategories(ctx, info); auto& num_cuts = *p_num_cuts; CHECK_EQ(num_cuts.size(), 0); @@ -324,7 +321,7 @@ void SketchContainerImpl::AllReduce( std::vector sketches_scan((n_columns + 1) * world, 0); std::vector global_sketches; - this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches); + this->GatherSketchInfo(ctx, info, reduced, &worker_segments, &sketches_scan, &global_sketches); std::vector final_sketches(n_columns); @@ -383,11 +380,12 @@ auto AddCategories(std::set const &categories, HistogramCuts *cuts) { } template -void SketchContainerImpl::MakeCuts(MetaInfo const &info, HistogramCuts *p_cuts) { +void SketchContainerImpl::MakeCuts(Context const *ctx, MetaInfo const &info, + HistogramCuts *p_cuts) { monitor_.Start(__func__); std::vector reduced; std::vector num_cuts; - this->AllReduce(info, &reduced, &num_cuts); + this->AllReduce(ctx, info, &reduced, &num_cuts); p_cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f); std::vector final_summaries(reduced.size()); @@ -496,5 +494,4 @@ void SortedSketchContainer::PushColPage(SparsePage const &page, MetaInfo const & }); monitor_.Stop(__func__); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 2bf6070d5e89..4b110f5e0164 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -22,9 +22,7 @@ #include "transform_iterator.h" // MakeIndexTransformIter #include "xgboost/span.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { using WQSketch = HostSketchContainer::WQSketch; using SketchEntry = WQSketch::Entry; @@ -501,7 +499,7 @@ void SketchContainer::FixError() { }); } -void SketchContainer::AllReduce(bool is_column_split) { +void SketchContainer::AllReduce(Context const*, bool is_column_split) { dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto world = collective::GetWorldSize(); if (world == 1 || 
is_column_split) { @@ -582,13 +580,13 @@ struct InvalidCatOp { }; } // anonymous namespace -void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { +void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); dh::safe_cuda(cudaSetDevice(device_.ordinal)); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. - this->AllReduce(is_column_split); + this->AllReduce(ctx, is_column_split); // Prune to final number of bins. this->Prune(num_bins_ + 1); @@ -731,5 +729,4 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { p_cuts->SetCategorical(this->has_categorical_, max_cat); timer_.Stop(__func__); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index b478347824c8..f7124f079b6d 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -151,9 +151,9 @@ class SketchContainer { Span that); /* \brief Merge quantiles from other GPU workers. */ - void AllReduce(bool is_column_split); + void AllReduce(Context const* ctx, bool is_column_split); /* \brief Create the final histogram cut values. */ - void MakeCuts(HistogramCuts* cuts, bool is_column_split); + void MakeCuts(Context const* ctx, HistogramCuts* cuts, bool is_column_split); Span Data() const { return {this->Current().data().get(), this->Current().size()}; diff --git a/src/common/quantile.h b/src/common/quantile.h index 47db5f875af5..0af93a03e021 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -827,13 +827,14 @@ class SketchContainerImpl { return group_ind; } // Gather sketches from all workers. - void GatherSketchInfo(MetaInfo const& info, + void GatherSketchInfo(Context const *ctx, MetaInfo const &info, std::vector const &reduced, std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches); // Merge sketches from all workers. - void AllReduce(MetaInfo const& info, std::vector *p_reduced, + void AllReduce(Context const *ctx, MetaInfo const &info, + std::vector *p_reduced, std::vector *p_num_cuts); template @@ -887,11 +888,11 @@ class SketchContainerImpl { /* \brief Push a CSR matrix. */ void PushRowPage(SparsePage const &page, MetaInfo const &info, Span hessian = {}); - void MakeCuts(MetaInfo const& info, HistogramCuts* cuts); + void MakeCuts(Context const *ctx, MetaInfo const &info, HistogramCuts *cuts); private: // Merge all categories from other workers. 
- void AllreduceCategories(MetaInfo const& info); + void AllreduceCategories(Context const* ctx, MetaInfo const& info); }; class HostSketchContainer : public SketchContainerImpl> { diff --git a/src/data/data.cc b/src/data/data.cc index 7e70fff3fff3..50f64a4066a0 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -745,7 +745,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col } } -void MetaInfo::SynchronizeNumberOfColumns() { +void MetaInfo::SynchronizeNumberOfColumns(Context const*) { if (IsColumnSplit()) { collective::Allreduce(&num_col_, 1); } else { diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 45f6286fb606..e5aa98278c8e 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -95,7 +95,7 @@ void GetCutsFromRef(Context const* ctx, std::shared_ptr ref, bst_featur namespace { // Synchronize feature type in case of empty DMatrix -void SyncFeatureType(std::vector* p_h_ft) { +void SyncFeatureType(Context const*, std::vector* p_h_ft) { if (!collective::IsDistributed()) { return; } @@ -193,7 +193,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, // From here on Info() has the correct data shape Info().num_row_ = accumulated_rows; Info().num_nonzero_ = nnz; - Info().SynchronizeNumberOfColumns(); + Info().SynchronizeNumberOfColumns(ctx); CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) { return f > accumulated_rows; })) << "Something went wrong during iteration."; @@ -213,9 +213,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, while (iter.Next()) { if (!p_sketch) { h_ft = proxy->Info().feature_types.ConstHostVector(); - SyncFeatureType(&h_ft); - p_sketch.reset(new common::HostSketchContainer{ctx, p.max_bin, h_ft, column_sizes, - !proxy->Info().group_ptr_.empty()}); + SyncFeatureType(ctx, &h_ft); + p_sketch = std::make_unique(ctx, p.max_bin, h_ft, column_sizes, + !proxy->Info().group_ptr_.empty()); } HostAdapterDispatch(proxy, [&](auto const& batch) { proxy->Info().num_nonzero_ = batch_nnz[i]; @@ -230,7 +230,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, CHECK_EQ(accumulated_rows, Info().num_row_); CHECK(p_sketch); - p_sketch->MakeCuts(Info(), &cuts); + p_sketch->MakeCuts(ctx, Info(), &cuts); } if (!h_ft.empty()) { CHECK_EQ(h_ft.size(), n_features); diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 2fffd516bfb6..09a3976d785c 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -105,7 +105,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, sketch_containers.clear(); sketch_containers.shrink_to_fit(); - final_sketch.MakeCuts(&cuts, this->info_.IsColumnSplit()); + final_sketch.MakeCuts(ctx, &cuts, this->info_.IsColumnSplit()); } else { GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts); } @@ -167,7 +167,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, iter.Reset(); // Synchronise worker columns - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(ctx); } BatchSet IterativeDMatrix::GetEllpackBatches(Context const* ctx, diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 3814d74d2cec..2bf81892f013 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -283,7 +283,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread, // Synchronise worker columns info_.data_split_mode = 
data_split_mode; ReindexFeatures(&ctx); - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); if (adapter->NumRows() == kAdapterUnknownSize) { using IteratorAdapterT = diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index e41d5939463f..e5b4d18f77db 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -42,7 +42,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr info_.num_row_ = adapter->NumRows(); // Synchronise worker columns info_.data_split_mode = data_split_mode; - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); this->fmat_ctx_ = ctx; } diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index 042a75c56a85..f1754c1b5a5d 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -97,7 +97,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p this->info_.num_col_ = n_features; this->info_.num_nonzero_ = nnz; - info_.SynchronizeNumberOfColumns(); + info_.SynchronizeNumberOfColumns(&ctx); CHECK_NE(info_.num_col_, 0); fmat_ctx_ = ctx; diff --git a/src/learner.cc b/src/learner.cc index 08c59ba601c0..6b0fd7e4b8d7 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -209,7 +209,7 @@ struct LearnerModelParamLegacy : public dmlc::Parameter return dmlc::Parameter::UpdateAllowUnknown(kwargs); } // sanity check - void Validate() { + void Validate(Context const*) { if (!collective::IsDistributed()) { return; } @@ -434,7 +434,7 @@ class LearnerConfiguration : public Learner { } // Update the shared model parameter this->ConfigureModelParamWithoutBaseScore(); - mparam_.Validate(); + mparam_.Validate(&ctx_); } CHECK(!std::isnan(mparam_.base_score)); CHECK(!std::isinf(mparam_.base_score)); diff --git a/src/metric/auc.cu b/src/metric/auc.cu index a4838d783d69..8b8349e1bd25 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -199,9 +199,9 @@ void Transpose(common::Span in, common::Span out, size_t m, }); } -double ScaleClasses(common::Span results, common::Span local_area, - common::Span tp, common::Span auc, size_t n_classes) { - dh::XGBDeviceAllocator alloc; +double ScaleClasses(Context const *ctx, common::Span results, + common::Span local_area, common::Span tp, + common::Span auc, size_t n_classes) { if (collective::IsDistributed()) { int32_t device = dh::CurrentDevice(); CHECK_EQ(dh::CudaGetPointerDevice(results.data()), device); @@ -218,8 +218,8 @@ double ScaleClasses(common::Span results, common::Span local_are double tp_sum; double auc_sum; thrust::tie(auc_sum, tp_sum) = - thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes, - Pair{0.0, 0.0}, PairPlus{}); + thrust::reduce(ctx->CUDACtx()->CTP(), reduce_in, reduce_in + n_classes, Pair{0.0, 0.0}, + PairPlus{}); if (tp_sum != 0 && !std::isnan(auc_sum)) { auc_sum /= tp_sum; } else { @@ -309,10 +309,10 @@ void SegmentedReduceAUC(common::Span d_unique_idx, * up each class in all kernels. */ template -double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, +double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info, common::Span d_class_ptr, size_t n_classes, std::shared_ptr cache, Fn area_fn) { - dh::safe_cuda(cudaSetDevice(device.ordinal)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); /** * Sorted idx */ @@ -320,7 +320,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, // Index is sorted within class. 
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - auto labels = info.labels.View(device); + auto labels = info.labels.View(ctx->Device()); auto weights = info.weights_.ConstDeviceSpan(); size_t n_samples = labels.Shape(0); @@ -328,12 +328,11 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, if (n_samples == 0) { dh::TemporaryArray resutls(n_classes * 4, 0.0f); auto d_results = dh::ToSpan(resutls); - dh::LaunchN(n_classes * 4, - [=] XGBOOST_DEVICE(size_t i) { d_results[i] = 0.0f; }); + dh::LaunchN(n_classes * 4, [=] XGBOOST_DEVICE(size_t i) { d_results[i] = 0.0f; }); auto local_area = d_results.subspan(0, n_classes); auto tp = d_results.subspan(2 * n_classes, n_classes); auto auc = d_results.subspan(3 * n_classes, n_classes); - return ScaleClasses(d_results, local_area, tp, auc, n_classes); + return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes); } /** @@ -437,7 +436,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, tp[c] = 1.0f; } }); - return ScaleClasses(d_results, local_area, tp, auc, n_classes); + return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes); } void MultiClassSortedIdx(Context const *ctx, common::Span predts, @@ -472,8 +471,7 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span predts, size_t /*class_id*/) { return TrapezoidArea(fp_prev, fp, tp_prev, tp); }; - return GPUMultiClassAUCOVR(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache, - fn); + return GPUMultiClassAUCOVR(ctx, info, dh::ToSpan(class_ptr), n_classes, cache, fn); } namespace { @@ -697,7 +695,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, d_totals[class_id].first); }; - return GPUMultiClassAUCOVR(info, ctx->Device(), d_class_ptr, n_classes, cache, fn); + return GPUMultiClassAUCOVR(ctx, info, d_class_ptr, n_classes, cache, fn); } template diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index feabedfab795..f245f3e06306 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -215,7 +215,7 @@ struct EvalError { has_param_ = false; } } - const char *Name() const { + [[nodiscard]] const char *Name() const { static thread_local std::string name; if (has_param_) { std::ostringstream os; @@ -228,7 +228,7 @@ struct EvalError { } } - XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const { // assume label is in [0,1] return pred > threshold_ ? 
1.0f - label : label; } @@ -370,7 +370,7 @@ struct EvalEWiseBase : public MetricNoCache { return Policy::GetFinal(dat[0], dat[1]); } - const char* Name() const override { return policy_.Name(); } + [[nodiscard]] const char* Name() const override { return policy_.Name(); } private: Policy policy_; diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index 41495164ce7e..6762aec32d46 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -162,7 +162,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig { return collective::GlobalRatio(info, sum_metric, static_cast(ngroups)); } - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return name.c_str(); } @@ -294,7 +294,7 @@ class EvalRankWithCache : public Metric { }; namespace { -double Finalize(MetaInfo const& info, double score, double sw) { +double Finalize(Context const*, MetaInfo const& info, double score, double sw) { std::array dat{score, sw}; collective::GlobalSum(info, &dat); std::tie(score, sw) = std::tuple_cat(dat); @@ -323,7 +323,7 @@ class EvalPrecision : public EvalRankWithCache { if (ctx_->IsCUDA()) { auto pre = cuda_impl::PreScore(ctx_, info, predt, p_cache); - return Finalize(info, pre.Residue(), pre.Weights()); + return Finalize(ctx_, info, pre.Residue(), pre.Weights()); } auto gptr = p_cache->DataGroupPtr(ctx_); @@ -352,7 +352,7 @@ class EvalPrecision : public EvalRankWithCache { } auto sum = std::accumulate(pre.cbegin(), pre.cend(), 0.0); - return Finalize(info, sum, sw); + return Finalize(ctx_, info, sum, sw); } }; @@ -369,7 +369,7 @@ class EvalNDCG : public EvalRankWithCache { std::shared_ptr p_cache) override { if (ctx_->IsCUDA()) { auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache); - return Finalize(info, ndcg.Residue(), ndcg.Weights()); + return Finalize(ctx_, info, ndcg.Residue(), ndcg.Weights()); } // group local ndcg @@ -415,7 +415,7 @@ class EvalNDCG : public EvalRankWithCache { sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0); } auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0); - return Finalize(info, ndcg, sum_w); + return Finalize(ctx_, info, ndcg, sum_w); } }; @@ -427,7 +427,7 @@ class EvalMAPScore : public EvalRankWithCache { std::shared_ptr p_cache) override { if (ctx_->IsCUDA()) { auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache); - return Finalize(info, map.Residue(), map.Weights()); + return Finalize(ctx_, info, map.Residue(), map.Weights()); } auto gptr = p_cache->DataGroupPtr(ctx_); @@ -469,7 +469,7 @@ class EvalMAPScore : public EvalRankWithCache { sw += weight[i]; } auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0); - return Finalize(info, sum, sw); + return Finalize(ctx_, info, sum, sw); } }; diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index 0625af25ad22..c13702a1917f 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -217,7 +217,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache { return Policy::GetFinal(dat[0], dat[1]); } - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return policy_.Name(); } diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 26d8f3440a8f..20305850a7f6 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -189,7 +189,7 @@ struct SparsePageView { explicit SparsePageView(SparsePage const *p) : 
base_rowid{p->base_rowid} { view = p->GetView(); } SparsePage::Inst operator[](size_t i) { return view[i]; } - size_t Size() const { return view.Size(); } + [[nodiscard]] size_t Size() const { return view.Size(); } }; struct SingleInstanceView { @@ -250,7 +250,7 @@ struct GHistIndexMatrixView { } return ret; } - size_t Size() const { return page_.Size(); } + [[nodiscard]] size_t Size() const { return page_.Size(); } }; template @@ -290,7 +290,7 @@ class AdapterView { return ret; } - size_t Size() const { return adapter_->NumRows(); } + [[nodiscard]] size_t Size() const { return adapter_->NumRows(); } bst_row_t const static base_rowid = 0; // NOLINT }; @@ -408,31 +408,33 @@ class ColumnSplitHelper { ColumnSplitHelper(ColumnSplitHelper &&) noexcept = delete; ColumnSplitHelper &operator=(ColumnSplitHelper &&) noexcept = delete; - void PredictDMatrix(DMatrix *p_fmat, std::vector *out_preds) { + void PredictDMatrix(Context const *ctx, DMatrix *p_fmat, std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; for (auto const &batch : p_fmat->GetBatches()) { CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * model_.learner_model_param->num_output_group); - PredictBatchKernel(SparsePageView{&batch}, out_preds); + PredictBatchKernel(ctx, SparsePageView{&batch}, out_preds); } } - void PredictInstance(SparsePage::Inst const &inst, std::vector *out_preds) { + void PredictInstance(Context const *ctx, SparsePage::Inst const &inst, + std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; - PredictBatchKernel(SingleInstanceView{inst}, out_preds); + PredictBatchKernel(ctx, SingleInstanceView{inst}, out_preds); } - void PredictLeaf(DMatrix *p_fmat, std::vector *out_preds) { + void PredictLeaf(Context const* ctx, DMatrix *p_fmat, std::vector *out_preds) { CHECK(xgboost::collective::IsDistributed()) << "column-split prediction is only supported for distributed training"; for (auto const &batch : p_fmat->GetBatches()) { CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * (tree_end_ - tree_begin_)); - PredictBatchKernel(SparsePageView{&batch}, out_preds); + PredictBatchKernel(ctx, SparsePageView{&batch}, + out_preds); } } @@ -453,12 +455,13 @@ class ColumnSplitHelper { std::fill(missing_storage_.begin(), missing_storage_.end(), 0); } - std::size_t BitIndex(std::size_t tree_id, std::size_t row_id, std::size_t node_id) const { + [[nodiscard]] std::size_t BitIndex(std::size_t tree_id, std::size_t row_id, + std::size_t node_id) const { size_t tree_index = tree_id - tree_begin_; return tree_offsets_[tree_index] * n_rows_ + row_id * tree_sizes_[tree_index] + node_id; } - void AllreduceBitVectors() { + void AllreduceBitVectors(Context const*) { collective::Allreduce(decision_storage_.data(), decision_storage_.size()); collective::Allreduce(missing_storage_.data(), @@ -547,7 +550,7 @@ class ColumnSplitHelper { } template - void PredictBatchKernel(DataView batch, std::vector *out_preds) { + void PredictBatchKernel(Context const* ctx, DataView batch, std::vector *out_preds) { auto const num_group = model_.learner_model_param->num_output_group; // parallel over local batch @@ -568,7 +571,7 @@ class ColumnSplitHelper { FVecDrop(block_size, fvec_offset, &feat_vecs_); }); - AllreduceBitVectors(); + AllreduceBitVectors(ctx); // auto block_id has the same type as `n_blocks`. 
common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) { @@ -646,7 +649,7 @@ class CPUPredictor : public Predictor { << "Predict DMatrix with column split" << MTNotImplemented(); ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end); - helper.PredictDMatrix(p_fmat, out_preds); + helper.PredictDMatrix(ctx_, p_fmat, out_preds); return; } @@ -779,7 +782,7 @@ class CPUPredictor : public Predictor { << "Predict instance with column split" << MTNotImplemented(); ColumnSplitHelper helper(this->ctx_->Threads(), model, 0, ntree_limit); - helper.PredictInstance(inst, out_preds); + helper.PredictInstance(ctx_, inst, out_preds); return; } @@ -811,7 +814,7 @@ class CPUPredictor : public Predictor { << "Predict leaf with column split" << MTNotImplemented(); ColumnSplitHelper helper(n_threads, model, 0, ntree_limit); - helper.PredictLeaf(p_fmat, &preds); + helper.PredictLeaf(ctx_, p_fmat, &preds); return; } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index e41248e29749..7fad0739758a 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -62,9 +62,7 @@ struct TreeView { cats.node_ptr = tree_cat_ptrs; } - __device__ bool HasCategoricalSplit() const { - return !cats.categories.empty(); - } + [[nodiscard]] __device__ bool HasCategoricalSplit() const { return !cats.categories.empty(); } }; struct SparsePageView { @@ -77,7 +75,7 @@ struct SparsePageView { common::Span row_ptr, bst_feature_t num_features) : d_data{data}, d_row_ptr{row_ptr}, num_features(num_features) {} - __device__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const { // Binary search auto begin_ptr = d_data.begin() + d_row_ptr[ridx]; auto end_ptr = d_data.begin() + d_row_ptr[ridx + 1]; @@ -105,8 +103,8 @@ struct SparsePageView { // Value is missing return nanf(""); } - XGBOOST_DEVICE size_t NumRows() const { return d_row_ptr.size() - 1; } - XGBOOST_DEVICE size_t NumCols() const { return num_features; } + [[nodiscard]] XGBOOST_DEVICE size_t NumRows() const { return d_row_ptr.size() - 1; } + [[nodiscard]] XGBOOST_DEVICE size_t NumCols() const { return num_features; } }; struct SparsePageLoader { @@ -137,7 +135,7 @@ struct SparsePageLoader { __syncthreads(); } } - __device__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const { if (use_shared) { return smem[threadIdx.x * data.num_features + fidx]; } else { @@ -151,7 +149,7 @@ struct EllpackLoader { XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_row_t, size_t, float) : matrix{m} {} - __device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const { auto gidx = matrix.GetBinIndex(ridx, fidx); if (gidx == -1) { return nan(""); diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 627bf4ca4fab..ceb322c28616 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -395,11 +395,11 @@ void GPUHistEvaluator::CopyToHost(const std::vector &nidx) { } } -void GPUHistEvaluator::EvaluateSplits( - const std::vector &nidx, bst_feature_t max_active_features, - common::Span d_inputs, - EvaluateSplitSharedInputs shared_inputs, - common::Span out_entries) { +void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector &nidx, + bst_feature_t 
max_active_features, + common::Span d_inputs, + EvaluateSplitSharedInputs shared_inputs, + common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); dh::TemporaryArray splits_out_storage(d_inputs.size()); @@ -417,19 +417,20 @@ void GPUHistEvaluator::EvaluateSplits( out_splits.size() * sizeof(DeviceSplitCandidate)); // Reduce to get the best candidate from all workers. - dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) { - out_splits[i] = all_candidates[i]; - for (auto rank = 1; rank < world_size; rank++) { - out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; - } - }); + dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(), + [world_size, all_candidates, out_splits] __device__(size_t i) { + out_splits[i] = all_candidates[i]; + for (auto rank = 1; rank < world_size; rank++) { + out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; + } + }); } auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_entries = out_entries; auto device_cats_accessor = this->DeviceCatStorage(nidx); // turn candidate into entry, along with handling sort based split. - dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable { + dh::LaunchN(d_inputs.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t i) mutable { auto const input = d_inputs[i]; auto &split = out_splits[i]; // Subtract parent gain here @@ -464,12 +465,12 @@ void GPUHistEvaluator::EvaluateSplits( this->CopyToHost(nidx); } -GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( - EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) { +GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input, + EvaluateSplitSharedInputs shared_inputs) { dh::device_vector inputs = std::vector{input}; dh::TemporaryArray out_entries(1); - this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs, - dh::ToSpan(out_entries)); + this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), + shared_inputs, dh::ToSpan(out_entries)); GPUExpandEntry root_entry; dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry), cudaMemcpyDeviceToHost)); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 7c61099a1a43..8c387f6324bb 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -193,7 +193,7 @@ class GPUHistEvaluator { /** * \brief Evaluate splits for left and right nodes. */ - void EvaluateSplits(const std::vector &nidx, + void EvaluateSplits(Context const* ctx, const std::vector &nidx, bst_feature_t max_active_features, common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, @@ -201,7 +201,7 @@ class GPUHistEvaluator { /** * \brief Evaluate splits for root node. 
*/ - GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input, + GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs); }; } // namespace tree diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 22eb7ab8185c..c473c9269580 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -16,8 +16,7 @@ #include "row_partitioner.cuh" #include "xgboost/base.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { namespace { struct Pair { GradientPair first; @@ -53,7 +52,8 @@ struct Clip : public thrust::unary_function { * * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree. */ -GradientQuantiser::GradientQuantiser(common::Span gpair, MetaInfo const& info) { +GradientQuantiser::GradientQuantiser(Context const*, common::Span gpair, + MetaInfo const& info) { using GradientSumT = GradientPairPrecise; using T = typename GradientSumT::ValueT; dh::XGBCachingDeviceAllocator alloc; @@ -99,7 +99,6 @@ GradientQuantiser::GradientQuantiser(common::Span gpair, Met static_cast(1) / to_floating_point_.GetHess()); } - XGBOOST_DEV_INLINE void AtomicAddGpairShared(xgboost::GradientPairInt64 *dest, xgboost::GradientPairInt64 const &gpair) { @@ -314,6 +313,4 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& dh::safe_cuda(cudaGetLastError()); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index c693e2e62e52..925c548936f4 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -39,18 +39,20 @@ private: GradientPairPrecise to_floating_point_; public: - GradientQuantiser(common::Span gpair, MetaInfo const& info); - XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { + GradientQuantiser(Context const* ctx, common::Span gpair, MetaInfo const& info); + [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), - gpair.GetHess() * to_fixed_point_.GetHess()); + gpair.GetHess() * to_fixed_point_.GetHess()); return adjusted; } - XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPairPrecise const& gpair) const { + [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 + ToFixedPoint(GradientPairPrecise const& gpair) const { auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), - gpair.GetHess() * to_fixed_point_.GetHess()); + gpair.GetHess() * to_fixed_point_.GetHess()); return adjusted; } - XGBOOST_DEVICE GradientPairPrecise ToFloatingPoint(const GradientPairInt64&gpair) const { + [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise + ToFloatingPoint(const GradientPairInt64& gpair) const { auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad(); auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess(); return {g,h}; diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index f378c78089db..033d2221ec89 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -171,7 +171,8 @@ class HistogramBuilder { } } - void SyncHistogram(RegTree const *p_tree, std::vector const &nodes_to_build, + void SyncHistogram(Context const *, RegTree const *p_tree, + std::vector const &nodes_to_build, std::vector const &nodes_to_trick) { auto n_total_bins = buffer_.TotalBins(); common::BlockedSpace2d space( 
@@ -277,14 +278,14 @@ class MultiHistogramBuilder { } for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) { - this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub); + this->target_builders_[t].SyncHistogram(ctx_, p_tree, nodes, dummy_sub); } } /** * @brief Build histogram for left and right child of valid candidates */ template - void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree, + void BuildHistLeftRight(Context const *ctx, DMatrix *p_fmat, RegTree const *p_tree, std::vector const &partitioners, std::vector const &valid_candidates, linalg::MatrixView gpair, BatchParam const ¶m, @@ -318,7 +319,7 @@ class MultiHistogramBuilder { } for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) { - this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub); + this->target_builders_[t].SyncHistogram(ctx, p_tree, nodes_to_build, nodes_to_sub); } } diff --git a/src/tree/hist/param.cc b/src/tree/hist/param.cc index 602566cd399e..bd8d7a85c407 100644 --- a/src/tree/hist/param.cc +++ b/src/tree/hist/param.cc @@ -12,7 +12,7 @@ namespace xgboost::tree { DMLC_REGISTER_PARAMETER(HistMakerTrainParam); -void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const { +void HistMakerTrainParam::CheckTreesSynchronized(Context const*, RegTree const* local_tree) const { if (!this->debug_synchronize) { return; } diff --git a/src/tree/hist/param.h b/src/tree/hist/param.h index 8757b65e6610..aa9d8cedf644 100644 --- a/src/tree/hist/param.h +++ b/src/tree/hist/param.h @@ -15,7 +15,7 @@ struct HistMakerTrainParam : public XGBoostParameter { bool debug_synchronize{false}; std::size_t max_cached_hist_node{DefaultNodes()}; - void CheckTreesSynchronized(RegTree const* local_tree) const; + void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const; // declare parameters DMLC_DECLARE_PARAMETER(HistMakerTrainParam) { diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 17e020ced5fc..3c37556e1091 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -140,7 +140,7 @@ class GloablApproxBuilder { std::vector const &gpair, common::Span hess) { monitor_->Start(__func__); this->histogram_builder_.BuildHistLeftRight( - p_fmat, p_tree, partitioner_, valid_candidates, + ctx_, p_fmat, p_tree, partitioner_, valid_candidates, linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess)); monitor_->Stop(__func__); } @@ -300,7 +300,7 @@ class GlobalApproxUpdater : public TreeUpdater { std::size_t t_idx = 0; for (auto p_tree : trees) { this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]); - hist_param_.CheckTreesSynchronized(p_tree); + hist_param_.CheckTreesSynchronized(ctx_, p_tree); ++t_idx; } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 6db201dd5277..3c9c61f88847 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -246,7 +246,7 @@ struct GPUHistMakerDevice { this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, dmat->Info().IsColumnSplit(), ctx_->Device()); - quantiser = std::make_unique(this->gpair, dmat->Info()); + quantiser = std::make_unique(ctx_, this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating row_partitioner = std::make_unique(ctx_->Device(), sample.sample_rows); @@ -276,7 +276,7 @@ struct GPUHistMakerDevice { matrix.min_fvalue, matrix.is_dense && !collective::IsDistributed() }; - auto split = 
this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs); + auto split = this->evaluator_.EvaluateSingleSplit(ctx_, inputs, shared_inputs); return split; } @@ -329,7 +329,7 @@ struct GPUHistMakerDevice { d_node_inputs.data().get(), h_node_inputs.data(), h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); - this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs), + this->evaluator_.EvaluateSplits(ctx_, nidx, max_active_features, dh::ToSpan(d_node_inputs), shared_inputs, dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), @@ -842,7 +842,7 @@ class GPUHistMaker : public TreeUpdater { std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(tree); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); ++t_idx; } dh::safe_cuda(cudaGetLastError()); @@ -985,7 +985,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(tree); + this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); ++t_idx; } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 50943e1c403a..375b24cfa97a 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -228,8 +228,8 @@ class MultiTargetHistBuilder { std::vector const &valid_candidates, linalg::MatrixView gpair) { monitor_->Start(__func__); - histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair, - HistBatch(param_)); + histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_, valid_candidates, + gpair, HistBatch(param_)); monitor_->Stop(__func__); } @@ -436,8 +436,8 @@ class HistUpdater { std::vector const &valid_candidates, linalg::MatrixView gpair) { monitor_->Start(__func__); - this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, - gpair, HistBatch(param_)); + this->histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_, + valid_candidates, gpair, HistBatch(param_)); monitor_->Stop(__func__); } @@ -537,7 +537,7 @@ class QuantileHistMaker : public TreeUpdater { h_out_position, *tree_it); } - hist_param_.CheckTreesSynchronized(*tree_it); + hist_param_.CheckTreesSynchronized(ctx_, *tree_it); } } diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 4782f9580567..92d8ff753603 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -360,25 +360,27 @@ TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) { } template -auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) { +auto MakeUnweightedCutsForTest(Context const* ctx, Adapter adapter, int32_t num_bins, float missing, + size_t batch_size = 0) { common::HistogramCuts batched_cuts; HostDeviceVector ft; SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), DeviceOrd::CUDA(0)); MetaInfo info; AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); - sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(ctx, &batched_cuts, info.IsColumnSplit()); return batched_cuts; 
} template -void ValidateBatchedCuts(Adapter adapter, int num_bins, DMatrix* dmat, size_t batch_size = 0) { +void ValidateBatchedCuts(Context const* ctx, Adapter adapter, int num_bins, DMatrix* dmat, size_t batch_size = 0) { common::HistogramCuts batched_cuts = MakeUnweightedCutsForTest( - adapter, num_bins, std::numeric_limits::quiet_NaN(), batch_size); + ctx, adapter, num_bins, std::numeric_limits::quiet_NaN(), batch_size); ValidateCuts(batched_cuts, dmat, num_bins); } TEST(HistUtil, AdapterDeviceSketch) { + auto ctx = MakeCUDACtx(0); int rows = 5; int cols = 1; int num_bins = 4; @@ -391,8 +393,8 @@ TEST(HistUtil, AdapterDeviceSketch) { data::CupyAdapter adapter(str); - auto device_cuts = MakeUnweightedCutsForTest(adapter, num_bins, missing); - Context ctx; + auto device_cuts = MakeUnweightedCutsForTest(&ctx, adapter, num_bins, missing); + ctx = ctx.MakeCPU(); auto host_cuts = GetHostCuts(&ctx, &adapter, num_bins, missing); EXPECT_EQ(device_cuts.Values(), host_cuts.Values()); @@ -401,6 +403,7 @@ TEST(HistUtil, AdapterDeviceSketch) { } TEST(HistUtil, AdapterDeviceSketchMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -410,7 +413,8 @@ TEST(HistUtil, AdapterDeviceSketchMemory) { dh::GlobalMemoryLogger().Clear(); ConsoleLogger::Configure({{"verbosity", "3"}}); - auto cuts = MakeUnweightedCutsForTest(adapter, num_bins, std::numeric_limits::quiet_NaN()); + auto cuts = + MakeUnweightedCutsForTest(&ctx, adapter, num_bins, std::numeric_limits::quiet_NaN()); ConsoleLogger::Configure({{"verbosity", "0"}}); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, false); @@ -419,6 +423,7 @@ TEST(HistUtil, AdapterDeviceSketchMemory) { } TEST(HistUtil, AdapterSketchSlidingWindowMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -435,7 +440,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, false); EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05); @@ -444,6 +449,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { } TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -465,7 +471,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { &sketch_container); HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); ConsoleLogger::Configure({{"verbosity", "0"}}); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, true); @@ -475,6 +481,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { void TestCategoricalSketchAdapter(size_t n, size_t num_categories, int32_t num_bins, bool weighted) { + auto ctx = MakeCUDACtx(0); auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories); thrust::device_vector x(h_x); auto adapter = AdapterFromData(x, n, 1); @@ -498,7 +505,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, AdapterDeviceSketch(adapter.Value(), num_bins, info, 
std::numeric_limits::quiet_NaN(), &container); HistogramCuts cuts; - container.MakeCuts(&cuts, info.IsColumnSplit()); + container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); thrust::sort(x.begin(), x.end()); auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin(); @@ -522,6 +529,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, TEST(HistUtil, AdapterDeviceSketchCategorical) { auto categorical_sizes = {2, 6, 8, 12}; int num_bins = 256; + auto ctx = MakeCUDACtx(0); auto sizes = {25, 100, 1000}; for (auto n : sizes) { for (auto num_categories : categorical_sizes) { @@ -529,7 +537,7 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) { auto dmat = GetDMatrixFromData(x, n, 1); auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, n, 1); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); TestCategoricalSketchAdapter(n, num_categories, num_bins, true); TestCategoricalSketchAdapter(n, num_categories, num_bins, false); } @@ -540,13 +548,14 @@ TEST(HistUtil, AdapterDeviceSketchMultipleColumns) { auto bin_sizes = {2, 16, 256, 512}; auto sizes = {100, 1000, 1500}; int num_columns = 5; + auto ctx = MakeCUDACtx(0); for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); auto x_device = thrust::device_vector(x); for (auto num_bins : bin_sizes) { auto adapter = AdapterFromData(x_device, num_rows, num_columns); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); } } } @@ -556,12 +565,13 @@ TEST(HistUtil, AdapterDeviceSketchBatches) { int num_rows = 5000; auto batch_sizes = {0, 100, 1500, 6000}; int num_columns = 5; + auto ctx = MakeCUDACtx(0); for (auto batch_size : batch_sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, num_rows, num_columns); - ValidateBatchedCuts(adapter, num_bins, dmat.get(), batch_size); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get(), batch_size); } } @@ -647,12 +657,12 @@ TEST(HistUtil, SketchingEquivalent) { auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, num_rows, num_columns); common::HistogramCuts adapter_cuts = MakeUnweightedCutsForTest( - adapter, num_bins, std::numeric_limits::quiet_NaN()); + &ctx, adapter, num_bins, std::numeric_limits::quiet_NaN()); EXPECT_EQ(dmat_cuts.Values(), adapter_cuts.Values()); EXPECT_EQ(dmat_cuts.Ptrs(), adapter_cuts.Ptrs()); EXPECT_EQ(dmat_cuts.MinValues(), adapter_cuts.MinValues()); - ValidateBatchedCuts(adapter, num_bins, dmat.get()); + ValidateBatchedCuts(&ctx, adapter, num_bins, dmat.get()); } } } @@ -702,7 +712,7 @@ void TestAdapterSketchFromWeights(bool with_group) { .Device(DeviceOrd::CUDA(0)) .GenerateArrayInterface(&storage); MetaInfo info; - Context ctx; + auto ctx = MakeCUDACtx(0); auto& h_weights = info.weights_.HostVector(); if (with_group) { h_weights.resize(kGroups); @@ -731,7 +741,7 @@ void TestAdapterSketchFromWeights(bool with_group) { &sketch_container); common::HistogramCuts cuts; - sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &cuts, info.IsColumnSplit()); auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols); if (with_group) { @@ -744,10 +754,9 @@ void TestAdapterSketchFromWeights(bool with_group) { 
ASSERT_EQ(cuts.Ptrs().size(), kCols + 1); ValidateCuts(cuts, dmat.get(), kBins); - auto cuda_ctx = MakeCUDACtx(0); if (with_group) { dmat->Info().weights_ = decltype(dmat->Info().weights_)(); // remove weight - HistogramCuts non_weighted = DeviceSketch(&cuda_ctx, dmat.get(), kBins, 0); + HistogramCuts non_weighted = DeviceSketch(&ctx, dmat.get(), kBins, 0); for (size_t i = 0; i < cuts.Values().size(); ++i) { ASSERT_EQ(cuts.Values()[i], non_weighted.Values()[i]); } @@ -773,7 +782,7 @@ void TestAdapterSketchFromWeights(bool with_group) { SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)}; AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); - sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); + sketch_container.MakeCuts(&ctx, &weighted, info.IsColumnSplit()); ValidateCuts(weighted, dmat.get(), kBins); } } diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 343f59cda3ea..9fa1566ea130 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -86,7 +86,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) { } HistogramCuts distributed_cuts; - sketch_distributed.MakeCuts(m->Info(), &distributed_cuts); + sketch_distributed.MakeCuts(&ctx, m->Info(), &distributed_cuts); // Generate cuts for single node environment collective::Finalize(); @@ -117,7 +117,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) { } HistogramCuts single_node_cuts; - sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts); + sketch_on_single_node.MakeCuts(&ctx, m->Info(), &single_node_cuts); auto const& sptrs = single_node_cuts.Ptrs(); auto const& dptrs = distributed_cuts.Ptrs(); @@ -220,7 +220,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) { } } - sketch_distributed.MakeCuts(m->Info(), &distributed_cuts); + sketch_distributed.MakeCuts(&ctx, m->Info(), &distributed_cuts); } // Generate cuts for single node environment @@ -243,7 +243,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) { } } - sketch_on_single_node.MakeCuts(m->Info(), &single_node_cuts); + sketch_on_single_node.MakeCuts(&ctx, m->Info(), &single_node_cuts); } auto const& sptrs = single_node_cuts.Ptrs(); diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 49353439f21a..26bd05524ded 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -370,6 +370,7 @@ void TestAllReduceBasic() { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { auto const device = DeviceOrd::CUDA(GPUIDX); + auto ctx = MakeCUDACtx(device.ordinal); // Set up single node version; HostDeviceVector ft({}, device); @@ -413,7 +414,7 @@ void TestAllReduceBasic() { AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_distributed); - sketch_distributed.AllReduce(false); + sketch_distributed.AllReduce(&ctx, false); sketch_distributed.Unique(); ASSERT_EQ(sketch_distributed.ColumnsPtr().size(), @@ -517,6 +518,7 @@ void TestSameOnAllWorkers() { MetaInfo const &info) { auto const rank = collective::GetRank(); auto const device = DeviceOrd::CUDA(GPUIDX); + Context ctx = MakeCUDACtx(device.ordinal); HostDeviceVector ft({}, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); HostDeviceVector storage({}, device); @@ -528,7 +530,7 @@ void TestSameOnAllWorkers() { AdapterDeviceSketch(adapter.Value(), n_bins, 
                         info, std::numeric_limits::quiet_NaN(), &sketch_distributed);
-    sketch_distributed.AllReduce(false);
+    sketch_distributed.AllReduce(&ctx, false);
     sketch_distributed.Unique();
     TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true);
diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h
index 3dd0c3a1fe15..85f2e014bdf2 100644
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -73,6 +73,7 @@ void RunWithFederatedCommunicator(int32_t world_size, std::string const& server_
   auto run = [&](auto rank) {
     Json config{JsonObject()};
     config["xgboost_communicator"] = String("federated");
+    config["federated_secure"] = false;
     config["federated_server_address"] = String(server_address);
     config["federated_world_size"] = world_size;
     config["federated_rank"] = rank;
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 6922577485ba..04f1d35b4002 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -2,6 +2,7 @@
  * Copyright (c) 2017-2023, XGBoost contributors
  */
 #include
+#include
 #include  // for Learner
 #include  // for LogCheck_NE, CHECK_NE, LogCheck_EQ
 #include  // for ObjFunction
@@ -81,7 +82,9 @@ TEST(Learner, ParameterValidation) {
   // whitespace
   learner->SetParam("tree method", "exact");
-  EXPECT_THROW(learner->Configure(), dmlc::Error);
+  EXPECT_THAT([&] { learner->Configure(); },
+              ::testing::ThrowsMessage(
+                  ::testing::HasSubstr(R"("tree method" contains whitespace)")));
 }
 
 TEST(Learner, CheckGroup) {
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index 7d5f15a1c47e..862bc6bfcca9 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -19,14 +19,15 @@ auto ZeroParam() {
 }
 }  // anonymous namespace
-inline GradientQuantiser DummyRoundingFactor() {
+inline GradientQuantiser DummyRoundingFactor(Context const* ctx) {
   thrust::device_vector gpair(1);
   gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return {dh::ToSpan(gpair), MetaInfo()};
+  return {ctx, dh::ToSpan(gpair), MetaInfo()};
 }
-thrust::device_vector ConvertToInteger(std::vector x) {
-  auto r = DummyRoundingFactor();
+thrust::device_vector ConvertToInteger(Context const* ctx,
+                                       std::vector x) {
+  auto r = DummyRoundingFactor(ctx);
   std::vector y(x.size());
   for (std::size_t i = 0; i < x.size(); i++) {
     y[i] = r.ToFixedPoint(GradientPair(x[i]));
   }
@@ -41,11 +42,12 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
   cuts_.cut_ptrs_.SetDevice(ctx.Device());
   cuts_.cut_values_.SetDevice(ctx.Device());
   cuts_.min_vals_.SetDevice(ctx.Device());
-  thrust::device_vector feature_histogram{ConvertToInteger(feature_histogram_)};
+  thrust::device_vector feature_histogram{
+      ConvertToInteger(&ctx, feature_histogram_)};
   dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical);
   auto d_feature_types = dh::ToSpan(feature_types);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   EvaluateSplitInputs input{1, 0, quantiser.ToFixedPoint(parent_sum_), dh::ToSpan(feature_set),
                             dh::ToSpan(feature_histogram)};
   EvaluateSplitSharedInputs shared_inputs{param,
@@ -60,7 +62,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
   evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false,
                   ctx.Device());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   ASSERT_EQ(result.thresh, 1);
   this->CheckResult(result.loss_chg, result.findex, result.fvalue, result.is_cat,
@@ -90,7 +92,7 @@ TEST(GpuHist, PartitionBasic) {
       *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end());
   cuts.SetCategorical(true, max_cat);
   d_feature_types = dh::ToSpan(feature_types);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   EvaluateSplitSharedInputs shared_inputs{
       param,
       quantiser,
@@ -108,10 +110,10 @@ TEST(GpuHist, PartitionBasic) {
     // -1.0s go right
     // -3.0s go left
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}});
     EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.dir, kLeftDir);
     EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000"));
@@ -122,10 +124,10 @@ TEST(GpuHist, PartitionBasic) {
     // -1.0s go right
     // -3.0s go left
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-7.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}});
     EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.dir, kLeftDir);
     EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000"));
@@ -134,10 +136,10 @@ TEST(GpuHist, PartitionBasic) {
   {
     // All -1.0, gain from splitting should be 0.0
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-3.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
     EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     EXPECT_EQ(result.dir, kLeftDir);
     EXPECT_FLOAT_EQ(result.loss_chg, 0.0f);
     EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
@@ -147,10 +149,10 @@ TEST(GpuHist, PartitionBasic) {
   // value
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 6.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}});
     EvaluateSplitInputs input{3, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000"));
     EXPECT_EQ(result.dir, kLeftDir);
@@ -160,10 +162,10 @@ TEST(GpuHist, PartitionBasic) {
     // -1.0s go right
     // -3.0s go left
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}});
     EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.dir, kLeftDir);
     EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000"));
@@ -173,10 +175,10 @@ TEST(GpuHist, PartitionBasic) {
     // -1.0s go right
     // -3.0s go left
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-5.0, 3.0});
-    auto feature_histogram = ConvertToInteger({{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}});
+    auto feature_histogram = ConvertToInteger(&ctx, {{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}});
     EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000"));
     EXPECT_EQ(result.left_sum + result.right_sum, parent_sum);
@@ -205,7 +207,7 @@ TEST(GpuHist, PartitionTwoFeatures) {
       *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end());
   cuts.SetCategorical(true, max_cat);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   EvaluateSplitSharedInputs shared_inputs{param,
                                           quantiser,
                                           d_feature_types,
@@ -220,10 +222,10 @@ TEST(GpuHist, PartitionTwoFeatures) {
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
     auto feature_histogram = ConvertToInteger(
-        {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+        &ctx, {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
     EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.findex, 1);
     EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000"));
@@ -233,10 +235,10 @@ TEST(GpuHist, PartitionTwoFeatures) {
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
     auto feature_histogram = ConvertToInteger(
-        {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}});
+        &ctx, {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}});
     EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                               dh::ToSpan(feature_histogram)};
-    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+    DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
     auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]);
     EXPECT_EQ(result.findex, 1);
     EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000"));
@@ -266,7 +268,7 @@ TEST(GpuHist, PartitionTwoNodes) {
       *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end());
   cuts.SetCategorical(true, max_cat);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   EvaluateSplitSharedInputs shared_inputs{param,
                                           quantiser,
                                           d_feature_types,
@@ -283,15 +285,16 @@ TEST(GpuHist, PartitionTwoNodes) {
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
     auto feature_histogram_a = ConvertToInteger(
-        {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+        &ctx, {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
     thrust::device_vector inputs(2);
     inputs[0] = EvaluateSplitInputs{0, 0, parent_sum, dh::ToSpan(feature_set),
                                     dh::ToSpan(feature_histogram_a)};
-    auto feature_histogram_b = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
+    auto feature_histogram_b = ConvertToInteger(&ctx, {{-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}});
     inputs[1] = EvaluateSplitInputs{1, 0, parent_sum, dh::ToSpan(feature_set),
                                     dh::ToSpan(feature_histogram_b)};
     thrust::device_vector results(2);
-    evaluator.EvaluateSplits({0, 1}, 1, dh::ToSpan(inputs), shared_inputs, dh::ToSpan(results));
+    evaluator.EvaluateSplits(&ctx, {0, 1}, 1, dh::ToSpan(inputs), shared_inputs,
+                             dh::ToSpan(results));
     EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(0)[0]),
               std::bitset<32>("10000000000000000000000000000000"));
     EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(1)[0]),
@@ -301,7 +304,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
   auto ctx = MakeCUDACtx(0);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
@@ -311,7 +314,8 @@ void TestEvaluateSingleSplit(bool is_categorical) {
   thrust::device_vector feature_set = std::vector{0, 1};
   // Setup gradients so that second feature gets higher gain
-  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
+  auto feature_histogram =
+      ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
   dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical);
   common::Span d_feature_types;
@@ -336,7 +340,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                           ctx.Device()};
   evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
                   ctx.Device());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
   if (is_categorical) {
@@ -352,7 +356,8 @@
 TEST(GpuHist, EvaluateSingleSplit) { TestEvaluateSingleSplit(false); }
 TEST(GpuHist, EvaluateSingleCategoricalSplit) { TestEvaluateSingleSplit(true); }
 TEST(GpuHist, EvaluateSingleSplitMissing) {
-  auto quantiser = DummyRoundingFactor();
+  auto ctx = MakeCUDACtx(0);
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
@@ -361,7 +366,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
   thrust::device_vector feature_segments = std::vector{0, 2};
   thrust::device_vector feature_values = std::vector{1.0, 2.0};
   thrust::device_vector feature_min_values = std::vector{0.0};
-  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}});
+  auto feature_histogram = ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}});
   EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                             dh::ToSpan(feature_histogram)};
   EvaluateSplitSharedInputs shared_inputs{param,
@@ -373,7 +378,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
                                           false};
   GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 0);
   EXPECT_EQ(result.fvalue, 1.0);
@@ -383,14 +388,15 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
 }
 TEST(GpuHist, EvaluateSingleSplitEmpty) {
+  auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
   GPUHistEvaluator evaluator(tparam, 1, FstCU());
   DeviceSplitCandidate result =
       evaluator
           .EvaluateSingleSplit(
-              EvaluateSplitInputs{},
+              &ctx, EvaluateSplitInputs{},
               EvaluateSplitSharedInputs{
-                  GPUTrainingParam(tparam), DummyRoundingFactor(), {}, {}, {}, {}, false})
+                  GPUTrainingParam(tparam), DummyRoundingFactor(&ctx), {}, {}, {}, {}, false})
           .split;
   EXPECT_EQ(result.findex, -1);
   EXPECT_LT(result.loss_chg, 0.0f);
@@ -398,7 +404,8 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) {
 // Feature 0 has a better split, but the algorithm must select feature 1
 TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
-  auto quantiser = DummyRoundingFactor();
+  auto ctx = MakeCUDACtx(0);
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
@@ -408,7 +415,8 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
   thrust::device_vector feature_segments = std::vector{0, 2, 4};
   thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector feature_min_values = std::vector{0.0, 10.0};
-  auto feature_histogram = ConvertToInteger({{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  auto feature_histogram =
+      ConvertToInteger(&ctx, {{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
   EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                             dh::ToSpan(feature_histogram)};
   EvaluateSplitSharedInputs shared_inputs{param,
@@ -420,7 +428,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
                                           false};
   GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
   EXPECT_EQ(result.fvalue, 11.0);
@@ -430,7 +438,8 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
 // Features 0 and 1 have identical gain, the algorithm must select 0
 TEST(GpuHist, EvaluateSingleSplitBreakTies) {
-  auto quantiser = DummyRoundingFactor();
+  auto ctx = MakeCUDACtx(0);
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
@@ -440,7 +449,8 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
   thrust::device_vector feature_segments = std::vector{0, 2, 4};
   thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector feature_min_values = std::vector{0.0, 10.0};
-  auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+  auto feature_histogram =
+      ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
   EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set),
                             dh::ToSpan(feature_histogram)};
   EvaluateSplitSharedInputs shared_inputs{param,
@@ -452,15 +462,16 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
                                           false};
   GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 0);
   EXPECT_EQ(result.fvalue, 1.0);
 }
 TEST(GpuHist, EvaluateSplits) {
+  auto ctx = MakeCUDACtx(0);
   thrust::device_vector out_splits(2);
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   tparam.UpdateAllowUnknown(Args{});
@@ -471,9 +482,9 @@ TEST(GpuHist, EvaluateSplits) {
   thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0};
   thrust::device_vector feature_min_values = std::vector{0.0, 0.0};
   auto feature_histogram_left =
-      ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
+      ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}});
   auto feature_histogram_right =
-      ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
+      ConvertToInteger(&ctx, {{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}});
   EvaluateSplitInputs input_left{1, 0, parent_sum, dh::ToSpan(feature_set),
                                  dh::ToSpan(feature_histogram_left)};
   EvaluateSplitInputs input_right{2, 0, parent_sum, dh::ToSpan(feature_set),
@@ -514,7 +525,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
   evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device());
   // Convert the sample histogram to fixed point
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   thrust::host_vector h_hist;
   for (auto e : hist_[0]) { h_hist.push_back(quantiser.ToFixedPoint(e)); }
@@ -531,7 +542,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
                                           cuts_.cut_values_.ConstDeviceSpan(),
                                           cuts_.min_vals_.ConstDeviceSpan(),
                                           false};
-  auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  auto split = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   ASSERT_NEAR(split.loss_chg, best_score_, 1e-2);
 }
@@ -541,7 +552,7 @@
 namespace {
 void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
   auto ctx = MakeCUDACtx(GPUIDX);
   auto rank = collective::GetRank();
-  auto quantiser = DummyRoundingFactor();
+  auto quantiser = DummyRoundingFactor(&ctx);
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};
@@ -552,8 +563,8 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
   thrust::device_vector feature_set = std::vector{0, 1};
   // Setup gradients so that second feature gets higher gain
-  auto feature_histogram = rank == 0 ? ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}})
-                                     : ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}});
+  auto feature_histogram = rank == 0 ? ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}})
+                                     : ConvertToInteger(&ctx, {{-1.0, 0.5}, {1.0, 0.5}});
   dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical);
   common::Span d_feature_types;
@@ -576,7 +587,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
   GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()};
   evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
-  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
+  DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1) << "rank: " << rank;
   if (is_categorical) {
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 0c91cf21e7f1..f7f2e27ea1be 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -37,7 +37,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
     FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size,
                                  sizeof(GradientPairInt64));
-    auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
+    auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
     BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
                            feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
                            d_histogram, quantiser);
@@ -51,7 +51,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
       dh::device_vector new_histogram(num_bins);
       auto d_new_histogram = dh::ToSpan(new_histogram);
-      auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
+      auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
       BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
                              feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
                              d_new_histogram, quantiser);
@@ -129,7 +129,7 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   dh::device_vector cat_hist(num_categories);
   auto gpair = GenerateRandomGradients(kRows, 0, 2);
   gpair.SetDevice(DeviceOrd::CUDA(0));
-  auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
+  auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo());
   /**
    * Generate hist with cat data.
    */
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 8949b5f4be2a..25a800367c49 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -181,7 +181,7 @@ void TestSyncHist(bool is_distributed) {
   histogram.Buffer().Reset(1, n_nodes, space, target_hists);
   // sync hist
-  histogram.SyncHistogram(&tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);
+  histogram.SyncHistogram(&ctx, &tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick);
   using GHistRowT = common::GHistRow;
   auto check_hist = [](const GHistRowT parent, const GHistRowT left, const GHistRowT right,
@@ -266,7 +266,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_
     histogram.BuildHist(0, space, gidx, row_set_collection, nodes_to_build,
                         linalg::MakeTensorView(&ctx, gpair, gpair.size()), force_read_by_column);
   }
-  histogram.SyncHistogram(&tree, nodes_to_build, {});
+  histogram.SyncHistogram(&ctx, &tree, nodes_to_build, {});
   // Check if number of histogram bins is correct
   ASSERT_EQ(histogram.Histogram()[nid].size(), gmat.cut.Ptrs().back());
@@ -366,7 +366,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
                         linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
                         force_read_by_column);
   }
-  cat_hist.SyncHistogram(&tree, nodes_to_build, {});
+  cat_hist.SyncHistogram(&ctx, &tree, nodes_to_build, {});
   /**
    * Generate hist with one hot encoded data.
@@ -382,7 +382,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
                            linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()),
                            force_read_by_column);
   }
-  onehot_hist.SyncHistogram(&tree, nodes_to_build, {});
+  onehot_hist.SyncHistogram(&ctx, &tree, nodes_to_build, {});
   auto cat = cat_hist.Histogram()[0];
   auto onehot = onehot_hist.Histogram()[0];
@@ -451,7 +451,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
                             force_read_by_column);
       ++page_idx;
     }
-    multi_build.SyncHistogram(&tree, nodes, {});
+    multi_build.SyncHistogram(ctx, &tree, nodes, {});
     multi_page = multi_build.Histogram()[RegTree::kRoot];
   }
@@ -480,7 +480,7 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo
     single_build.BuildHist(0, space, gmat, row_set_collection, nodes,
                            linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()),
                            force_read_by_column);
-    single_build.SyncHistogram(&tree, nodes, {});
+    single_build.SyncHistogram(ctx, &tree, nodes, {});
     single_page = single_build.Histogram()[RegTree::kRoot];
   }
@@ -570,7 +570,7 @@ class OverflowTest : public ::testing::TestWithParam> {
     CHECK_NE(partitioners.front()[tree.RightChild(best.nid)].Size(), 0);
     hist_builder.BuildHistLeftRight(
-        Xy.get(), &tree, partitioners, valid_candidates,
+        &ctx, Xy.get(), &tree, partitioners, valid_candidates,
        linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1), batch);
     if (limit) {
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index accfbae082c7..6f937351ea23 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -111,7 +111,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
   maker.hist.AllocateHistograms({0});
   maker.gpair = gpair.DeviceSpan();
-  maker.quantiser = std::make_unique(maker.gpair, MetaInfo());
+  maker.quantiser = std::make_unique(&ctx, maker.gpair, MetaInfo());
   maker.page = page.get();
   maker.InitFeatureGroupsOnce();
@@ -162,12 +162,6 @@ HistogramCutsWrapper GetHostCutMatrix () {
   return cmat;
 }
 
-inline GradientQuantiser DummyRoundingFactor() {
-  thrust::device_vector gpair(1);
-  gpair[0] = {1000.f, 1000.f};  // Tests should not exceed sum of 1000
-  return {dh::ToSpan(gpair), MetaInfo()};
-}
-
 void TestHistogramIndexImpl() {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory