From aad29dfb31ada247310a6ce1c4a783deeac74052 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Wed, 17 Jul 2024 09:31:44 -0400 Subject: [PATCH 01/26] secure horizontal for GPU --- src/tree/updater_gpu_hist.cu | 45 +++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 19957857218d..3ae440359af3 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -43,6 +43,9 @@ #include "xgboost/task.h" // for ObjInfo #include "xgboost/tree_model.h" +#include "../collective/communicator-inl.h" +#include "../collective/allgather.h" // for AllgatherV + namespace xgboost::tree { #if !defined(GTEST_TEST) DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); @@ -665,7 +668,47 @@ struct GPUHistMakerDevice { // Reduce all in one go // This gives much better latency in a distributed setting // when processing a large batch - this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); + // If secure horizontal, perform AllReduce by calling the encryption plugin + if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { + // Get encryption plugin + decltype(std::declval().EncryptionPlugin()) plugin_; + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + + plugin_ = fed.EncryptionPlugin(); + + // Get the histogram data + std::size_t n = page->Cuts().TotalBins() * 2 * hist_nidx.size(); + auto d_node_hist = hist.GetNodeHistogram(hist_nidx.at(0)).data(); + using ReduceT = typename std::remove_pointer::type::ValueT; + auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); + + // copy the histogram out of GPU memory + common::Span erased = common::EraseType(hist_vec.Values()); + std::vector h_data(erased.size()); + cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); + + // call the encryption plugin + auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; + auto hist_buf = plugin_->BuildEncryptedHistHori(src_hist); + + // allgather + HostDeviceVector hist_entries; + std::vector recv_segments; + auto rc = + collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); + collective::SafeColl(rc); + + // call the encryption plugin to aggregate the histograms + auto hist_aggr = + plugin_->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); + + // copy the aggregated histogram back to GPU memory + cudaMemcpy(erased.data(), hist_aggr.data(), erased.size(), cudaMemcpyHostToDevice); + } + else { + this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); + } for (size_t i = 0; i < subtraction_nidx.size(); i++) { auto build_hist_nidx = hist_nidx.at(i); From b448dffb8fad97670aaf5101c9c275308cef7f1c Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Wed, 17 Jul 2024 18:20:29 -0400 Subject: [PATCH 02/26] more structured refinement --- src/tree/updater_gpu_hist.cu | 107 ++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 40 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3ae440359af3..606b1be5960d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -46,6 +46,7 @@ #include "../collective/communicator-inl.h" #include "../collective/allgather.h" // for AllgatherV +#include namespace xgboost::tree { #if !defined(GTEST_TEST) DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); @@ -623,17 +624,67 @@ struct GPUHistMakerDevice { // num histograms is 
the number of contiguous histograms in memory to reduce over void AllReduceHist(int nidx, int num_histograms) { monitor.Start("AllReduce"); + std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; + auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); + + // print out the first histogram with iterator + //std::vector entry(hist_vec.Values().size()); + //dh::CopyDeviceSpanToVector(&entry, hist_vec.Values()); + //printf("Non-enc: Rank %d Before AllReduce: %ld\n", collective::GetRank(), entry[0]); + auto rc = collective::GlobalSum( - ctx_, info_, - linalg::MakeVec(reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * 2 * num_histograms, ctx_->Device())); + ctx_, info_, hist_vec); SafeColl(rc); + // print out the first histogram with iterator + //dh::CopyDeviceSpanToVector(&entry, hist_vec.Values()); + //printf("Non-enc: Rank %d After AllReduce: %ld\n", collective::GetRank(), entry[0]); + monitor.Stop("AllReduce"); } + void AllReduceHistEncrypted(int nidx, int num_histograms) { + monitor.Start("AllReduceEncrypted"); + // Get encryption plugin + decltype(std::declval().EncryptionPlugin()) plugin; + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + plugin = fed.EncryptionPlugin(); + + // Get the histogram data + std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; + auto d_node_hist = hist.GetNodeHistogram(nidx).data(); + using ReduceT = typename std::remove_pointer::type::ValueT; + auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); + + // copy the histogram out of GPU memory + common::Span erased = common::EraseType(hist_vec.Values()); + std::vector h_data(erased.size()); + cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); + + // call the encryption plugin + auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; + auto hist_buf = plugin->BuildEncryptedHistHori(src_hist); + + // allgather + HostDeviceVector hist_entries; + std::vector recv_segments; + auto rc = + collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); + collective::SafeColl(rc); + + // call the encryption plugin to aggregate the histograms + auto hist_aggr = + plugin->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); + + // copy the aggregated histogram back to GPU memory + cudaMemcpy(erased.data(), hist_aggr.data(), erased.size(), cudaMemcpyHostToDevice); + + monitor.Stop("AllReduceEncrypted"); + } + /** * \brief Build GPU local histograms for the left and right child of some parent node */ @@ -670,41 +721,7 @@ struct GPUHistMakerDevice { // when processing a large batch // If secure horizontal, perform AllReduce by calling the encryption plugin if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { - // Get encryption plugin - decltype(std::declval().EncryptionPlugin()) plugin_; - auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); - auto const &fed = dynamic_cast(comm); - - plugin_ = fed.EncryptionPlugin(); - - // Get the histogram data - std::size_t n = page->Cuts().TotalBins() * 2 * hist_nidx.size(); - auto d_node_hist = hist.GetNodeHistogram(hist_nidx.at(0)).data(); - using ReduceT = typename std::remove_pointer::type::ValueT; - auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, 
ctx_->Device()); - - // copy the histogram out of GPU memory - common::Span erased = common::EraseType(hist_vec.Values()); - std::vector h_data(erased.size()); - cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); - - // call the encryption plugin - auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; - auto hist_buf = plugin_->BuildEncryptedHistHori(src_hist); - - // allgather - HostDeviceVector hist_entries; - std::vector recv_segments; - auto rc = - collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); - collective::SafeColl(rc); - - // call the encryption plugin to aggregate the histograms - auto hist_aggr = - plugin_->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); - - // copy the aggregated histogram back to GPU memory - cudaMemcpy(erased.data(), hist_aggr.data(), erased.size(), cudaMemcpyHostToDevice); + this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); } else { this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); @@ -718,7 +735,12 @@ struct GPUHistMakerDevice { if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { // Calculate other histogram manually this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, 1); + if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { + this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); + } + else { + this->AllReduceHist(subtraction_trick_nidx, 1); + } } } } @@ -792,7 +814,12 @@ struct GPUHistMakerDevice { hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, 1); + if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { + this->AllReduceHistEncrypted(kRootNIdx, 1); + } + else { + this->AllReduceHist(kRootNIdx, 1); + } // Remember root stats auto root_sum = quantiser.ToFloatingPoint(root_sum_quantised); From 7d6d5921c6ba1c29bb86b83786f7420fd7f0a6b3 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 18 Jul 2024 16:26:24 -0400 Subject: [PATCH 03/26] add the missing aggr piece to secure hori GPU --- src/tree/updater_gpu_hist.cu | 92 +++++++++++++++++------------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 606b1be5960d..5600da12d9f4 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -628,61 +628,57 @@ struct GPUHistMakerDevice { auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); - - // print out the first histogram with iterator - //std::vector entry(hist_vec.Values().size()); - //dh::CopyDeviceSpanToVector(&entry, hist_vec.Values()); - //printf("Non-enc: Rank %d Before AllReduce: %ld\n", collective::GetRank(), entry[0]); - auto rc = collective::GlobalSum( ctx_, info_, hist_vec); SafeColl(rc); - - // print out the first histogram with iterator - //dh::CopyDeviceSpanToVector(&entry, hist_vec.Values()); - //printf("Non-enc: Rank %d After AllReduce: %ld\n", collective::GetRank(), entry[0]); - monitor.Stop("AllReduce"); } void AllReduceHistEncrypted(int nidx, int num_histograms) { - monitor.Start("AllReduceEncrypted"); - // Get encryption plugin - decltype(std::declval().EncryptionPlugin()) plugin; - auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); - auto const &fed = 
dynamic_cast(comm); - plugin = fed.EncryptionPlugin(); - - // Get the histogram data - std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; - auto d_node_hist = hist.GetNodeHistogram(nidx).data(); - using ReduceT = typename std::remove_pointer::type::ValueT; - auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); - - // copy the histogram out of GPU memory - common::Span erased = common::EraseType(hist_vec.Values()); - std::vector h_data(erased.size()); - cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); - - // call the encryption plugin - auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; - auto hist_buf = plugin->BuildEncryptedHistHori(src_hist); - - // allgather - HostDeviceVector hist_entries; - std::vector recv_segments; - auto rc = - collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); - collective::SafeColl(rc); - - // call the encryption plugin to aggregate the histograms - auto hist_aggr = - plugin->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); - - // copy the aggregated histogram back to GPU memory - cudaMemcpy(erased.data(), hist_aggr.data(), erased.size(), cudaMemcpyHostToDevice); - - monitor.Stop("AllReduceEncrypted"); + monitor.Start("AllReduceEncrypted"); + // Get encryption plugin + decltype(std::declval().EncryptionPlugin()) plugin; + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + plugin = fed.EncryptionPlugin(); + + // Get the histogram data + std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; + auto d_node_hist = hist.GetNodeHistogram(nidx).data(); + using ReduceT = typename std::remove_pointer::type::ValueT; + auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); + + // copy the histogram out of GPU memory + common::Span erased = common::EraseType(hist_vec.Values()); + std::vector h_data(erased.size()); + cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); + + // call the encryption plugin + auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; + auto hist_buf = plugin->BuildEncryptedHistHori(src_hist); + + // allgather + HostDeviceVector hist_entries; + std::vector recv_segments; + auto rc = collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); + collective::SafeColl(rc); + + // call the encryption plugin to decode the histograms + auto hist_aggr = plugin->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); + + // reinterpret the aggregated histogram as a int64_t and aggregate + auto hist_aggr_64 = common::Span{reinterpret_cast(hist_aggr.data()), hist_aggr.size()}; + int num_ranks = collective::GlobalCommGroup()->World(); + for (size_t i = 0; i < n; i++) { + for (int j = 1; j < num_ranks; j++) { + hist_aggr_64[i] = hist_aggr_64[i] + hist_aggr_64[i + j * n]; + } + } + + // copy the aggregated histogram back to GPU memory + cudaMemcpy(erased.data(), hist_aggr_64.data(), erased.size(), cudaMemcpyHostToDevice); + + monitor.Stop("AllReduceEncrypted"); } /** From 7421cfa55ca77d857387337f223b2b1b80317ef2 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 19 Jul 2024 10:29:58 -0400 Subject: [PATCH 04/26] code lint corrections --- src/tree/updater_gpu_hist.cu | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 
5600da12d9f4..f562de17f69c 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -46,7 +46,6 @@ #include "../collective/communicator-inl.h" #include "../collective/allgather.h" // for AllgatherV -#include namespace xgboost::tree { #if !defined(GTEST_TEST) DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); @@ -660,14 +659,17 @@ struct GPUHistMakerDevice { // allgather HostDeviceVector hist_entries; std::vector recv_segments; - auto rc = collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); + auto rc = collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), + &recv_segments, &hist_entries); collective::SafeColl(rc); // call the encryption plugin to decode the histograms - auto hist_aggr = plugin->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); + auto hist_aggr = plugin->SyncEncryptedHistHori( + common::RestoreType(hist_entries.HostSpan())); // reinterpret the aggregated histogram as a int64_t and aggregate - auto hist_aggr_64 = common::Span{reinterpret_cast(hist_aggr.data()), hist_aggr.size()}; + auto hist_aggr_64 = common::Span{ + reinterpret_cast(hist_aggr.data()), hist_aggr.size()}; int num_ranks = collective::GlobalCommGroup()->World(); for (size_t i = 0; i < n; i++) { for (int j = 1; j < num_ranks; j++) { @@ -718,8 +720,7 @@ struct GPUHistMakerDevice { // If secure horizontal, perform AllReduce by calling the encryption plugin if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); - } - else { + } else { this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); } @@ -733,8 +734,7 @@ struct GPUHistMakerDevice { this->BuildHist(subtraction_trick_nidx); if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); - } - else { + } else { this->AllReduceHist(subtraction_trick_nidx, 1); } } @@ -812,8 +812,7 @@ struct GPUHistMakerDevice { this->BuildHist(kRootNIdx); if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { this->AllReduceHistEncrypted(kRootNIdx, 1); - } - else { + } else { this->AllReduceHist(kRootNIdx, 1); } From 361e17a0f164980d7b1a050d6d063c5e9f6344be Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 25 Jul 2024 10:06:51 -0400 Subject: [PATCH 05/26] remove redundant plugin init --- src/tree/updater_gpu_hist.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 8d1a38e79822..5f4ed304ea9c 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -530,10 +530,9 @@ struct GPUHistMakerDevice { void AllReduceHistEncrypted(int nidx, int num_histograms) { monitor.Start("AllReduceEncrypted"); // Get encryption plugin - decltype(std::declval().EncryptionPlugin()) plugin; auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); auto const &fed = dynamic_cast(comm); - plugin = fed.EncryptionPlugin(); + auto plugin = fed.EncryptionPlugin(); // Get the histogram data std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; From 306fc1a5dda46a4d18c102af8b7da390cb13c818 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 25 Jul 2024 16:16:09 -0400 Subject: [PATCH 06/26] update to avoid pipeline stuck for secure vertical --- src/collective/aggregator.h | 2 +- src/gbm/gbtree.cc | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/collective/aggregator.h 
b/src/collective/aggregator.h index f3d2aa090aa9..8ebd39b900fe 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -199,7 +199,7 @@ void BroadcastGradient(Context const* ctx, MetaInfo const& info, GradFn&& grad_f #if defined(XGBOOST_USE_FEDERATED) // Need to encrypt the gradient before broadcasting. common::Span encrypted; - auto const& comm = GlobalCommGroup()->Ctx(ctx, ctx->Device()); + auto const& comm = GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); auto const& fed = dynamic_cast(comm); if (GetRank() == 0) { // Obtain the gradient diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 26c768fafea7..fe95708c7a52 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -237,7 +237,9 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, } } else if (model_.learner_model_param->OutputLength() == 1u) { TreesOneGroup ret; + //std::cout << "BoostNewTrees" << std::endl; BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret); + //std::cout << "UpdateTreeLeaf" << std::endl; UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret); const size_t num_new_trees = ret.size(); new_trees.push_back(std::move(ret)); @@ -312,7 +314,11 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, // update the trees auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_; - StringView msg{ + + //std::cout << "Num_rows: " << p_fmat->Info().num_row_ << " and n_out: " << n_out << std::endl; + + + StringView msg{ "Mismatching size between number of rows from input data and size of gradient vector."}; if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) { CHECK_EQ(n_out % gpair->Size(), 0) << msg; @@ -325,11 +331,18 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, // Rescale learning rate according to the size of trees auto lr = tree_param_.learning_rate; tree_param_.learning_rate /= static_cast(new_trees.size()); - for (auto& up : updaters_) { + + //std::cout << "Update for Rank " << collective::GetRank() << " with gpair size " << gpair->Size() << " and device " << gpair->Device() << std::endl; + + + for (auto& up : updaters_) { up->Update(&tree_param_, gpair, p_fmat, common::Span>{*out_position}, new_trees); } tree_param_.learning_rate = lr; + + //std::cout << "Update done" << std::endl; + } void GBTree::CommitModel(TreesOneIter&& new_trees) { From 906b0fa42b9d048b3626a7a4c40a8720a715c166 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 25 Jul 2024 16:19:23 -0400 Subject: [PATCH 07/26] update to avoid pipeline stuck for secure vertical --- src/gbm/gbtree.cc | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index fe95708c7a52..26c768fafea7 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -237,9 +237,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, } } else if (model_.learner_model_param->OutputLength() == 1u) { TreesOneGroup ret; - //std::cout << "BoostNewTrees" << std::endl; BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret); - //std::cout << "UpdateTreeLeaf" << std::endl; UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret); const size_t num_new_trees = ret.size(); new_trees.push_back(std::move(ret)); @@ -314,11 +312,7 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, // update the trees auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_; - - //std::cout << "Num_rows: " << 
p_fmat->Info().num_row_ << " and n_out: " << n_out << std::endl; - - - StringView msg{ + StringView msg{ "Mismatching size between number of rows from input data and size of gradient vector."}; if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) { CHECK_EQ(n_out % gpair->Size(), 0) << msg; @@ -331,18 +325,11 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, // Rescale learning rate according to the size of trees auto lr = tree_param_.learning_rate; tree_param_.learning_rate /= static_cast(new_trees.size()); - - //std::cout << "Update for Rank " << collective::GetRank() << " with gpair size " << gpair->Size() << " and device " << gpair->Device() << std::endl; - - - for (auto& up : updaters_) { + for (auto& up : updaters_) { up->Update(&tree_param_, gpair, p_fmat, common::Span>{*out_position}, new_trees); } tree_param_.learning_rate = lr; - - //std::cout << "Update done" << std::endl; - } void GBTree::CommitModel(TreesOneIter&& new_trees) { From 35d23f80b768e857bf672acc7c88a601fb6000a1 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 10:00:52 -0400 Subject: [PATCH 08/26] Update src/tree/updater_gpu_hist.cu Co-authored-by: Jiaming Yuan --- src/tree/updater_gpu_hist.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 5f4ed304ea9c..bdcc90a8bda7 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -528,7 +528,7 @@ struct GPUHistMakerDevice { } void AllReduceHistEncrypted(int nidx, int num_histograms) { - monitor.Start("AllReduceEncrypted"); + monitor.Start(__func__); // Get encryption plugin auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); auto const &fed = dynamic_cast(comm); From 9ab2d8dcd569bef85c1b62a46093e4e1b6b8be11 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 10:00:57 -0400 Subject: [PATCH 09/26] Update src/tree/updater_gpu_hist.cu Co-authored-by: Jiaming Yuan --- src/tree/updater_gpu_hist.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index bdcc90a8bda7..86ffeecf3cf6 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -573,7 +573,7 @@ struct GPUHistMakerDevice { // copy the aggregated histogram back to GPU memory cudaMemcpy(erased.data(), hist_aggr_64.data(), erased.size(), cudaMemcpyHostToDevice); - monitor.Stop("AllReduceEncrypted"); + monitor.Stop(__func__); } /** From 96ab11311fcfd318fc08d62dee257ebdd3bc625d Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 10:17:19 -0400 Subject: [PATCH 10/26] wrap cuda API calls in safe_cuda --- src/tree/updater_gpu_hist.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 86ffeecf3cf6..58439132ee6f 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -543,7 +543,7 @@ struct GPUHistMakerDevice { // copy the histogram out of GPU memory common::Span erased = common::EraseType(hist_vec.Values()); std::vector h_data(erased.size()); - cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); + dh::safe_cuda(cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost)); // call the encryption plugin auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; From 778d6be01ee6ad7a4b1a3351d59fa4eb74b8d672 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 
2024 13:21:14 -0400 Subject: [PATCH 11/26] add include for federatedcomm --- src/tree/updater_gpu_hist.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 58439132ee6f..a19e2c6f45ea 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -46,6 +46,7 @@ #include "../collective/communicator-inl.h" #include "../collective/allgather.h" // for AllgatherV +#include "../../plugin/federated/federated_comm.h" // for FederatedComm namespace xgboost::tree { #if !defined(GTEST_TEST) From 32c301438187b6cf4238c03228b73c364550bcd5 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 14:53:55 -0400 Subject: [PATCH 12/26] add include conditions --- src/tree/updater_gpu_hist.cu | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index a19e2c6f45ea..78e22ea9c7e1 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -46,7 +46,12 @@ #include "../collective/communicator-inl.h" #include "../collective/allgather.h" // for AllgatherV + +#if defined(XGBOOST_USE_FEDERATED) #include "../../plugin/federated/federated_comm.h" // for FederatedComm +#else +#include "../../common/error_msg.h" // for NoFederated +#endif namespace xgboost::tree { #if !defined(GTEST_TEST) @@ -528,6 +533,7 @@ struct GPUHistMakerDevice { monitor.Stop("AllReduce"); } +#if defined(XGBOOST_USE_FEDERATED) void AllReduceHistEncrypted(int nidx, int num_histograms) { monitor.Start(__func__); // Get encryption plugin @@ -576,6 +582,7 @@ struct GPUHistMakerDevice { monitor.Stop(__func__); } +#endif /** * \brief Build GPU local histograms for the left and right child of some parent node @@ -613,7 +620,9 @@ struct GPUHistMakerDevice { // when processing a large batch // If secure horizontal, perform AllReduce by calling the encryption plugin if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { - this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); + #if defined(XGBOOST_USE_FEDERATED) + this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); + #endif } else { this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); } @@ -627,7 +636,9 @@ struct GPUHistMakerDevice { // Calculate other histogram manually this->BuildHist(subtraction_trick_nidx); if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { - this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); + #if defined(XGBOOST_USE_FEDERATED) + this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); + #endif } else { this->AllReduceHist(subtraction_trick_nidx, 1); } @@ -705,7 +716,9 @@ struct GPUHistMakerDevice { hist.AllocateHistograms(ctx_, {kRootNIdx}); this->BuildHist(kRootNIdx); if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { - this->AllReduceHistEncrypted(kRootNIdx, 1); + #if defined(XGBOOST_USE_FEDERATED) + this->AllReduceHistEncrypted(kRootNIdx, 1); + #endif } else { this->AllReduceHist(kRootNIdx, 1); } From 7df59551e88d145d04961958ccc0aa1b56028141 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 15:12:50 -0400 Subject: [PATCH 13/26] add include conditions --- src/tree/updater_gpu_hist.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 78e22ea9c7e1..7d3069e35f8d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -582,6 +582,8 @@ struct 
GPUHistMakerDevice { monitor.Stop(__func__); } +#else + LOG(FATAL) << error::NoFederated(); #endif /** @@ -622,6 +624,8 @@ struct GPUHistMakerDevice { if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { #if defined(XGBOOST_USE_FEDERATED) this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); + #else + LOG(FATAL) << error::NoFederated(); #endif } else { this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); @@ -638,6 +642,8 @@ struct GPUHistMakerDevice { if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { #if defined(XGBOOST_USE_FEDERATED) this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); + #else + LOG(FATAL) << error::NoFederated(); #endif } else { this->AllReduceHist(subtraction_trick_nidx, 1); @@ -718,6 +724,8 @@ struct GPUHistMakerDevice { if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { #if defined(XGBOOST_USE_FEDERATED) this->AllReduceHistEncrypted(kRootNIdx, 1); + #else + LOG(FATAL) << error::NoFederated(); #endif } else { this->AllReduceHist(kRootNIdx, 1); From 3cc863a266c9b4d6b6a16b2158dd9b820eb733b4 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Mon, 29 Jul 2024 10:19:40 -0400 Subject: [PATCH 14/26] correct import error --- src/tree/updater_gpu_hist.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 7d3069e35f8d..088fc199786b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -50,7 +50,7 @@ #if defined(XGBOOST_USE_FEDERATED) #include "../../plugin/federated/federated_comm.h" // for FederatedComm #else -#include "../../common/error_msg.h" // for NoFederated +#include "../common/error_msg.h" // for NoFederated #endif namespace xgboost::tree { @@ -582,8 +582,6 @@ struct GPUHistMakerDevice { monitor.Stop(__func__); } -#else - LOG(FATAL) << error::NoFederated(); #endif /** From f2b876d182f5412f199b9f89bf1da43585612823 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Tue, 30 Jul 2024 18:21:47 -0400 Subject: [PATCH 15/26] implement alternative vertical pipeline in GPU --- src/collective/aggregator.h | 4 ++ src/common/quantile.cu | 23 ++++++++ src/tree/gpu_hist/evaluate_splits.cu | 65 ++++++++++++++--------- src/tree/gpu_hist/histogram.cu | 79 ++++++++++++++++++++++++++-- src/tree/gpu_hist/histogram.cuh | 5 +- src/tree/updater_gpu_hist.cu | 4 +- 6 files changed, 148 insertions(+), 32 deletions(-) diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h index 8ebd39b900fe..ed548a0f173b 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -230,6 +230,10 @@ void BroadcastGradient(Context const* ctx, MetaInfo const& info, GradFn&& grad_f #else LOG(FATAL) << error::NoFederated(); #endif + + // !!!Temporarily turn on regular gradient broadcasting for testing + // encrypted vertical + ApplyWithLabels(ctx, info, out_gpair->Data(), [&] { grad_fn(out_gpair); }); } else { ApplyWithLabels(ctx, info, out_gpair->Data(), [&] { grad_fn(out_gpair); }); } diff --git a/src/common/quantile.cu b/src/common/quantile.cu index d0356ae421c7..d5e927b30414 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -673,6 +673,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i } } + auto secure_vertical = is_column_split && collective::IsEncrypted(); // Set up output cuts for (bst_feature_t i = 0; i < num_columns_; ++i) { size_t column_size = std::max(static_cast(1ul), this->Column(i).size()); @@ -681,6 
+682,11 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i CheckMaxCat(max_values[i].value, column_size); h_out_columns_ptr.push_back(max_values[i].value + 1); // includes both max_cat and 0. } else { + // If vertical and secure mode, we need to sync the max_num_bins aross workers + // to create the same global number of cut point bins for easier future processing + if (secure_vertical) { + collective::SafeColl(collective::Allreduce(ctx, &column_size, collective::Op::kMax)); + } h_out_columns_ptr.push_back( std::min(static_cast(column_size), static_cast(num_bins_))); } @@ -711,6 +717,10 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i out_column[0] = kRtEps; assert(out_column.size() == 1); } + // For secure vertical split, fill all cut values with dummy value + if (secure_vertical) { + out_column[idx] = kRtEps; + } return; } @@ -736,6 +746,19 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i out_column[idx] = in_column[idx+1].value; }); + if (secure_vertical) { + // cut values need to be synced across all workers via Allreduce + // To do: apply same inference indexing as CPU, skip for now + auto cut_values_device = p_cuts->cut_values_.DeviceSpan(); + std::vector cut_values_host(cut_values_device.size()); + dh::CopyDeviceSpanToVector(&cut_values_host, cut_values_device); + auto rc = collective::Allreduce(ctx, &cut_values_host, collective::Op::kSum); + SafeColl(rc); + dh::safe_cuda(cudaMemcpyAsync(cut_values_device.data(), cut_values_host.data(), + cut_values_device.size() * sizeof(float), + cudaMemcpyHostToDevice)); + } + p_cuts->SetCategorical(this->has_categorical_, max_cat); timer_.Stop(__func__); } diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 5e225a13f142..3b5c1b76fa1c 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -6,6 +6,7 @@ #include #include "../../collective/allgather.h" +#include "../../collective/broadcast.h" #include "../../common/categorical.h" #include "../../data/ellpack_page.cuh" #include "evaluate_splits.cuh" @@ -404,34 +405,48 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector splits_out_storage(d_inputs.size()); auto out_splits = dh::ToSpan(splits_out_storage); - this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs, - evaluator, out_splits); - if (is_column_split_) { - // With column-wise data split, we gather the split candidates from all the workers and find the - // global best candidates. - auto const world_size = collective::GetWorldSize(); - dh::TemporaryArray all_candidate_storage(out_splits.size() * world_size); - auto all_candidates = dh::ToSpan(all_candidate_storage); - auto current_rank = - all_candidates.subspan(collective::GetRank() * out_splits.size(), out_splits.size()); - dh::safe_cuda(cudaMemcpyAsync(current_rank.data(), out_splits.data(), - out_splits.size() * sizeof(DeviceSplitCandidate), - cudaMemcpyDeviceToDevice)); - auto rc = collective::Allgather( - ctx, linalg::MakeVec(all_candidates.data(), all_candidates.size(), ctx->Device())); - collective::SafeColl(rc); - - // Reduce to get the best candidate from all workers. 
- dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(), - [world_size, all_candidates, out_splits] __device__(size_t i) { - out_splits[i] = all_candidates[i]; - for (auto rank = 1; rank < world_size; rank++) { - out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; - } - }); + bool is_passive_party = is_column_split_ && collective::IsEncrypted() && collective::GetRank() != 0; + bool is_active_party = !is_passive_party; + // Under secure vertical setting, only the active party is able to evaluate the split + // based on global histogram. Other parties will receive the final best split information + // Hence the below computation is not performed by the passive parties + if (is_active_party) { + this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs, + evaluator, out_splits); } + if (is_column_split_) { + if (!collective::IsEncrypted()) { + // With regular column-wise data split, we gather the split candidates from + // all the workers and find the global best candidates. + auto const world_size = collective::GetWorldSize(); + dh::TemporaryArray all_candidate_storage(out_splits.size() * world_size); + auto all_candidates = dh::ToSpan(all_candidate_storage); + auto current_rank = + all_candidates.subspan(collective::GetRank() * out_splits.size(), out_splits.size()); + dh::safe_cuda(cudaMemcpyAsync(current_rank.data(), out_splits.data(), + out_splits.size() * sizeof(DeviceSplitCandidate), + cudaMemcpyDeviceToDevice)); + auto rc = collective::Allgather( + ctx, linalg::MakeVec(all_candidates.data(), all_candidates.size(), ctx->Device())); + collective::SafeColl(rc); + // Reduce to get the best candidate from all workers. + dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(), + [world_size, all_candidates, out_splits] __device__(size_t i) { + out_splits[i] = all_candidates[i]; + for (auto rank = 1; rank < world_size; rank++) { + out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; + } + }); + } else { + // With encrypted column-wise data split, we distribute the best split candidates + // from Rank 0 to all other workers + auto rc = collective::Broadcast( + ctx, linalg::MakeVec(out_splits.data(), out_splits.size(), ctx->Device()), 0); + collective::SafeColl(rc); + } + } auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_entries = out_entries; auto device_cats_accessor = this->DeviceCatStorage(nidx); diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 372a5c09ba0c..d0d2634a1363 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -14,6 +14,13 @@ #include "row_partitioner.cuh" #include "xgboost/base.h" +#include "../../common/device_helpers.cuh" +#if defined(XGBOOST_USE_FEDERATED) +#include "../../../plugin/federated/federated_hist.h" // for FederataedHistPolicy +#else +#include "../../common/error_msg.h" // for NoFederated +#endif + namespace xgboost::tree { namespace { struct Pair { @@ -309,6 +316,21 @@ class DeviceHistogramBuilderImpl { bool force_global_memory) { this->kernel_ = std::make_unique>(ctx, feature_groups, force_global_memory); this->force_global_memory_ = force_global_memory; + + + std::cout << "Reset DeviceHistogramBuilderImpl" << std::endl; + + // Reset federated plugin + // start of every round, transmit the matrix to plugin + #if defined(XGBOOST_USE_FEDERATED) + // Get encryption plugin + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); + auto const &fed = 
dynamic_cast(comm); + auto plugin = fed.EncryptionPlugin(); + // Reset plugin + //plugin->Reset(); + #endif + } void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, @@ -354,13 +376,64 @@ void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor con this->p_impl_->Reset(ctx, feature_groups, force_global_memory); } -void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx, +void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, common::Span gpair, common::Span ridx, common::Span histogram, - GradientQuantiser rounding) { - this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding); + GradientQuantiser rounding, MetaInfo const& info) { + + auto IsSecureVertical = !info.IsRowSplit() && collective::IsDistributed() && collective::IsEncrypted(); + if (!IsSecureVertical) { + // Regular training, build histogram locally + this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); + } else { + // Encrypted vertical, build histogram using federated plugin + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + auto plugin = fed.EncryptionPlugin(); + // Transmit matrix to plugin + //plugin->TransmitMatrix(matrix); + // Transmit row indices to plugin + //plugin->TransmitRowIndices(ridx); + + // !!!Temporarily turn on regular histogram building for testing + // encrypted vertical + this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); + + // Further histogram sync process - simulated + // only the last stage is needed under plugin system + + // copy histogram data to host + std::vector host_histogram(histogram.size()); + dh::CopyDeviceSpanToVector(&host_histogram, histogram); + // convert to regular vector + std::vector host_histogram_64(histogram.size() * 2); + for (auto i = 0; i < host_histogram.size(); i++) { + host_histogram_64[i * 2] = host_histogram[i].GetQuantisedGrad(); + host_histogram_64[i * 2 + 1] = host_histogram[i].GetQuantisedHess(); + } + // aggregate histograms in float + auto rc = collective::Allreduce(ctx, &host_histogram_64, collective::Op::kSum); + SafeColl(rc); + // convert back to GradientPairInt64 + // only copy to Rank 0, clear other ranks to simulate the plugin scenario + for (auto i = 0; i < host_histogram.size(); i++) { + GradientPairInt64 hist_item(host_histogram_64[i * 2], host_histogram_64[i * 2 + 1]); + GradientPairInt64 hist_item_empty(0, 0); + if (collective::GetRank() != 0) { + hist_item = hist_item_empty; + } else { + host_histogram[i] = hist_item; + } + } + // copy the aggregated histogram back to GPU memory + // at this point, the histogram contains full information from all parties + dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), + histogram.size() * sizeof(GradientPairPrecise), + cudaMemcpyHostToDevice)); + + } } } // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 87c60a8bfdbc..5c0d58f2df46 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -178,11 +178,12 @@ class DeviceHistogramBuilder { void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, bool force_global_memory); - void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, + void BuildHistogram(Context const* 
ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, common::Span gpair, common::Span ridx, - common::Span histogram, GradientQuantiser rounding); + common::Span histogram, GradientQuantiser rounding, + MetaInfo const& info); }; } // namespace xgboost::tree #endif // HISTOGRAM_CUH_ diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 088fc199786b..d588e4db9d2b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -249,9 +249,9 @@ struct GPUHistMakerDevice { void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); - this->histogram_.BuildHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), + this->histogram_.BuildHistogram(ctx_, page->GetDeviceAccessor(ctx_->Device()), feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, - d_node_hist, *quantiser); + d_node_hist, *quantiser, info_); } // Attempt to do subtraction trick From 257251947b9e12ace737ea6693ccffe2e7a089f0 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Tue, 30 Jul 2024 22:22:01 -0400 Subject: [PATCH 16/26] transmit necessary info to plugin - align GPU with CPU --- plugin/federated/federated_plugin.cc | 21 ++++++++ src/tree/gpu_hist/histogram.cu | 77 ++++++++++++++++++++-------- src/tree/gpu_hist/histogram.cuh | 3 +- 3 files changed, 78 insertions(+), 23 deletions(-) diff --git a/plugin/federated/federated_plugin.cc b/plugin/federated/federated_plugin.cc index 8488c59b26db..2e36da05145a 100644 --- a/plugin/federated/federated_plugin.cc +++ b/plugin/federated/federated_plugin.cc @@ -21,6 +21,20 @@ namespace xgboost::collective { void FederatedPluginMock::Reset(common::Span cutptrs, common::Span bin_idx) { + + //print some contents of cutptrs and bin_idx + std::cout << "cutptrs.size() = " << cutptrs.size() << std::endl; + for (int i = 0; i < cutptrs.size(); i++) { + std::cout << cutptrs[i] << " "; + } + std::cout << std::endl; + + std::cout << "bin_idx.size() = " << bin_idx.size() << std::endl; + for (int i = 0; i < 3; i++) { + std::cout << bin_idx[i] << " "; + } + std::cout << std::endl; + this->cuts_.resize(cutptrs.size()); std::copy_n(cutptrs.data(), cutptrs.size(), this->cuts_.data()); @@ -70,6 +84,13 @@ void FederatedPluginMock::Reset(common::Span cutptrs, auto hist_buffer = common::Span{hist_plain_}; std::fill_n(hist_buffer.data(), hist_buffer.size(), 0.0); + // print some contents of rowptrs + std::cout << "rowptrs.size() = " << rowptrs.size() << std::endl; + for (int i = 0; i < rowptrs.size(); i++) { + std::cout << rowptrs[i] << " "; + } + std::cout << std::endl; + CHECK_EQ(rowptrs.size(), sizes.size()); CHECK_EQ(nids.size(), sizes.size()); auto gpair = common::RestoreType(common::Span{grad_}); diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index d0d2634a1363..7479e7b36adc 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -316,21 +316,6 @@ class DeviceHistogramBuilderImpl { bool force_global_memory) { this->kernel_ = std::make_unique>(ctx, feature_groups, force_global_memory); this->force_global_memory_ = force_global_memory; - - - std::cout << "Reset DeviceHistogramBuilderImpl" << std::endl; - - // Reset federated plugin - // start of every round, transmit the matrix to plugin - #if defined(XGBOOST_USE_FEDERATED) - // Get encryption plugin - auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); - auto const &fed = dynamic_cast(comm); - auto plugin = 
fed.EncryptionPlugin(); - // Reset plugin - //plugin->Reset(); - #endif - } void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, @@ -394,17 +379,65 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, auto const &fed = dynamic_cast(comm); auto plugin = fed.EncryptionPlugin(); // Transmit matrix to plugin - //plugin->TransmitMatrix(matrix); - // Transmit row indices to plugin - //plugin->TransmitRowIndices(ridx); + if(!is_aggr_context_initialized_){ + std::cout << "Initialized Plugin Context" << std::endl; + // Get cutptrs + std::vector h_cuts_ptr(matrix.feature_segments.size()); + dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); + common::Span cutptrs = common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); + std::cout << "cutptrs.size() = " << h_cuts_ptr.size() << std::endl; + for (int i = 0; i < h_cuts_ptr.size(); i++) { + std::cout << h_cuts_ptr[i] << " "; + } + std::cout << std::endl; + + // Get bin_idx matrix + + + + //common::Span bin_idx + //plugin->Reset(h_cuts_ptr, bin_idx); + is_aggr_context_initialized_ = true; + } + + std::cout << "Transmitting row indices to plugin" << std::endl; + // print a few samples of ridx + std::vector h_ridx(ridx.size()); + dh::CopyDeviceSpanToVector(&h_ridx, ridx); + std::cout << "ridx.size() = " << h_ridx.size() << std::endl; + for (int i = 0; i < 5; i++) { + std::cout << h_ridx[i] << " "; + } + std::cout << std::endl; + + // Transmit row indices to plugin and get encrypted histogram + //hist_data_ = this->plugin_->BuildEncryptedHistVert(ptrs, sizes, nodes); + + // Perform AllGather + std::cout << "Allgather histograms" << std::endl; + /* + HostDeviceVector hist_entries; + std::vector recv_segments; + collective::SafeColl( + collective::AllgatherV(ctx_, linalg::MakeVec(hist_data_), &recv_segments, &hist_entries)); + + // Call the plugin here to get the resulting histogram. Histogram from all workers are + // gathered to the label owner. 
+ common::Span hist_aggr = + plugin_->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); +*/ + + + + + + // !!!Temporarily turn on regular histogram building for testing - // encrypted vertical + // compute local histograms this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); - // Further histogram sync process - simulated - // only the last stage is needed under plugin system - + // Further histogram sync process - simulated with allreduce // copy histogram data to host std::vector host_histogram(histogram.size()); dh::CopyDeviceSpanToVector(&host_histogram, histogram); diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 5c0d58f2df46..08ba9be856fc 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -175,7 +175,8 @@ class DeviceHistogramBuilder { public: DeviceHistogramBuilder(); ~DeviceHistogramBuilder(); - + // Whether to secure aggregation context has been initialized + bool is_aggr_context_initialized_{false}; void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, bool force_global_memory); void BuildHistogram(Context const* ctx, EllpackDeviceAccessor const& matrix, From e42faaa394a04c8bfe216d9a97867ea8af5eefd7 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Wed, 31 Jul 2024 10:58:14 -0400 Subject: [PATCH 17/26] marked calls to plugin - align GPU with CPU --- plugin/federated/federated_plugin.cc | 8 +++++++- src/tree/gpu_hist/histogram.cu | 28 +++++++++++++++++++--------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/plugin/federated/federated_plugin.cc b/plugin/federated/federated_plugin.cc index 2e36da05145a..6ff5aee4602a 100644 --- a/plugin/federated/federated_plugin.cc +++ b/plugin/federated/federated_plugin.cc @@ -85,9 +85,15 @@ void FederatedPluginMock::Reset(common::Span cutptrs, std::fill_n(hist_buffer.data(), hist_buffer.size(), 0.0); // print some contents of rowptrs + std::cout << "rowptrs.size() = " << rowptrs.size() << std::endl; for (int i = 0; i < rowptrs.size(); i++) { - std::cout << rowptrs[i] << " "; + std::cout << sizes[i] << std::endl; + common::Span row_indices{rowptrs[i], rowptrs[i] + sizes[i]}; + for (int j = 0; j < 5; j++) { + std::cout << row_indices[j] << " "; + } + std::cout << std::endl; } std::cout << std::endl; diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 7479e7b36adc..ae1043666624 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -396,25 +396,35 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, //common::Span bin_idx + //********************************************* //plugin->Reset(h_cuts_ptr, bin_idx); + //********************************************* is_aggr_context_initialized_ = true; } - std::cout << "Transmitting row indices to plugin" << std::endl; - // print a few samples of ridx + + // get row indices from device std::vector h_ridx(ridx.size()); dh::CopyDeviceSpanToVector(&h_ridx, ridx); - std::cout << "ridx.size() = " << h_ridx.size() << std::endl; - for (int i = 0; i < 5; i++) { - std::cout << h_ridx[i] << " "; - } - std::cout << std::endl; + + // wrap info following plugin expectations + std::vector ptrs(1); + std::vector sizes(1); + std::vector nodes(1); + ptrs[0] = reinterpret_cast(h_ridx.data()); + sizes[0] = h_ridx.size(); + nodes[0] = 0; // Transmit row indices to plugin and get encrypted histogram - //hist_data_ = this->plugin_->BuildEncryptedHistVert(ptrs, sizes, nodes); + 
std::cout << "Building encrypted histograms with row indices " << std::endl; + //********************************************* + //auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); +//********************************************* // Perform AllGather std::cout << "Allgather histograms" << std::endl; + + //********************************************* /* HostDeviceVector hist_entries; std::vector recv_segments; @@ -426,7 +436,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, common::Span hist_aggr = plugin_->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); */ - + //********************************************* From 26aaded1b5880622454dba63de803fb9f61897d4 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Wed, 31 Jul 2024 19:02:59 -0400 Subject: [PATCH 18/26] secure vertical GPU fully functional --- plugin/federated/federated_plugin.cc | 27 ------ src/collective/aggregator.h | 11 ++- src/tree/gpu_hist/histogram.cu | 118 +++++++++++++-------------- 3 files changed, 65 insertions(+), 91 deletions(-) diff --git a/plugin/federated/federated_plugin.cc b/plugin/federated/federated_plugin.cc index 6ff5aee4602a..8488c59b26db 100644 --- a/plugin/federated/federated_plugin.cc +++ b/plugin/federated/federated_plugin.cc @@ -21,20 +21,6 @@ namespace xgboost::collective { void FederatedPluginMock::Reset(common::Span cutptrs, common::Span bin_idx) { - - //print some contents of cutptrs and bin_idx - std::cout << "cutptrs.size() = " << cutptrs.size() << std::endl; - for (int i = 0; i < cutptrs.size(); i++) { - std::cout << cutptrs[i] << " "; - } - std::cout << std::endl; - - std::cout << "bin_idx.size() = " << bin_idx.size() << std::endl; - for (int i = 0; i < 3; i++) { - std::cout << bin_idx[i] << " "; - } - std::cout << std::endl; - this->cuts_.resize(cutptrs.size()); std::copy_n(cutptrs.data(), cutptrs.size(), this->cuts_.data()); @@ -84,19 +70,6 @@ void FederatedPluginMock::Reset(common::Span cutptrs, auto hist_buffer = common::Span{hist_plain_}; std::fill_n(hist_buffer.data(), hist_buffer.size(), 0.0); - // print some contents of rowptrs - - std::cout << "rowptrs.size() = " << rowptrs.size() << std::endl; - for (int i = 0; i < rowptrs.size(); i++) { - std::cout << sizes[i] << std::endl; - common::Span row_indices{rowptrs[i], rowptrs[i] + sizes[i]}; - for (int j = 0; j < 5; j++) { - std::cout << row_indices[j] << " "; - } - std::cout << std::endl; - } - std::cout << std::endl; - CHECK_EQ(rowptrs.size(), sizes.size()); CHECK_EQ(nids.size(), sizes.size()); auto gpair = common::RestoreType(common::Span{grad_}); diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h index ed548a0f173b..7f3110601ff7 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -227,13 +227,16 @@ void BroadcastGradient(Context const* ctx, MetaInfo const& info, GradFn&& grad_f SafeColl(rc); // Pass the gradient to the plugin fed.EncryptionPlugin()->SyncEncryptedGradient(encrypted); + + // !!!Temporarily solution + // This step is needed for memory allocation in the case of vertical secure GPU + // make out_gpair data value to all zero to avoid information leak + auto gpair_data = out_gpair->Data(); + gpair_data->Fill(GradientPair{0.0f, 0.0f}); + ApplyWithLabels(ctx, info, gpair_data, [&] { grad_fn(out_gpair); }); #else LOG(FATAL) << error::NoFederated(); #endif - - // !!!Temporarily turn on regular gradient broadcasting for testing - // encrypted vertical - ApplyWithLabels(ctx, info, out_gpair->Data(), [&] { grad_fn(out_gpair); }); } else { 
ApplyWithLabels(ctx, info, out_gpair->Data(), [&] { grad_fn(out_gpair); }); } diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index ae1043666624..d87f5cc077ac 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -14,6 +14,8 @@ #include "row_partitioner.cuh" #include "xgboost/base.h" +#include "../../collective/allgather.h" // for AllgatherV + #include "../../common/device_helpers.cuh" #if defined(XGBOOST_USE_FEDERATED) #include "../../../plugin/federated/federated_hist.h" // for FederataedHistPolicy @@ -361,6 +363,24 @@ void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor con this->p_impl_->Reset(ctx, feature_groups, force_global_memory); } +struct ReadMatrixFunction { + EllpackDeviceAccessor matrix; + int kCols; + bst_float* matrix_data_d; + ReadMatrixFunction(EllpackDeviceAccessor matrix, int kCols, bst_float* matrix_data_d) + : matrix(std::move(matrix)), kCols(kCols), matrix_data_d(matrix_data_d) {} + + __device__ void operator()(size_t global_idx) { + auto row = global_idx / kCols; + auto col = global_idx % kCols; + auto value = matrix.GetBinIndex(row, col); + if (isnan(value)) { + value = -1; + } + matrix_data_d[global_idx] = value; + } +}; + void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, @@ -378,105 +398,83 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); auto const &fed = dynamic_cast(comm); auto plugin = fed.EncryptionPlugin(); + // Transmit matrix to plugin if(!is_aggr_context_initialized_){ - std::cout << "Initialized Plugin Context" << std::endl; // Get cutptrs std::vector h_cuts_ptr(matrix.feature_segments.size()); dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); common::Span cutptrs = common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); - std::cout << "cutptrs.size() = " << h_cuts_ptr.size() << std::endl; - for (int i = 0; i < h_cuts_ptr.size(); i++) { - std::cout << h_cuts_ptr[i] << " "; - } - std::cout << std::endl; // Get bin_idx matrix - - - - //common::Span bin_idx - //********************************************* - //plugin->Reset(h_cuts_ptr, bin_idx); - //********************************************* + auto kRows = matrix.n_rows; + auto kCols = matrix.NumFeatures(); + std::vector h_bin_idx(kRows * kCols); + // Access GPU matrix data + thrust::device_vector matrix_d(kRows * kCols); + dh::LaunchN(kRows * kCols, ReadMatrixFunction(matrix, kCols, matrix_d.data().get())); + thrust::copy(matrix_d.begin(), matrix_d.end(), h_bin_idx.begin()); + common::Span bin_idx = common::Span(h_bin_idx.data(), h_bin_idx.size()); + + // Initialize plugin context + plugin->Reset(h_cuts_ptr, h_bin_idx); is_aggr_context_initialized_ = true; } - // get row indices from device std::vector h_ridx(ridx.size()); dh::CopyDeviceSpanToVector(&h_ridx, ridx); - - // wrap info following plugin expectations + // necessary conversions to fit plugin expectations + std::vector h_ridx_64(ridx.size()); + for (int i = 0; i < ridx.size(); i++) { + h_ridx_64[i] = h_ridx[i]; + } std::vector ptrs(1); std::vector sizes(1); std::vector nodes(1); - ptrs[0] = reinterpret_cast(h_ridx.data()); - sizes[0] = h_ridx.size(); + ptrs[0] = reinterpret_cast(h_ridx_64.data()); + sizes[0] = h_ridx_64.size(); nodes[0] = 0; // Transmit row indices to plugin and get encrypted histogram - std::cout << "Building encrypted histograms with row 
indices " << std::endl; - //********************************************* - //auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); -//********************************************* + auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); // Perform AllGather - std::cout << "Allgather histograms" << std::endl; - - //********************************************* - /* HostDeviceVector hist_entries; std::vector recv_segments; collective::SafeColl( - collective::AllgatherV(ctx_, linalg::MakeVec(hist_data_), &recv_segments, &hist_entries)); + collective::AllgatherV(ctx, linalg::MakeVec(hist_data), &recv_segments, &hist_entries)); // Call the plugin here to get the resulting histogram. Histogram from all workers are // gathered to the label owner. common::Span hist_aggr = - plugin_->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); -*/ - //********************************************* - - - - - + plugin->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); - // !!!Temporarily turn on regular histogram building for testing - // compute local histograms - this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); - - // Further histogram sync process - simulated with allreduce - // copy histogram data to host + // Post process the AllGathered data + auto world_size = collective::GetWorldSize(); std::vector host_histogram(histogram.size()); - dh::CopyDeviceSpanToVector(&host_histogram, histogram); - // convert to regular vector - std::vector host_histogram_64(histogram.size() * 2); - for (auto i = 0; i < host_histogram.size(); i++) { - host_histogram_64[i * 2] = host_histogram[i].GetQuantisedGrad(); - host_histogram_64[i * 2 + 1] = host_histogram[i].GetQuantisedHess(); - } - // aggregate histograms in float - auto rc = collective::Allreduce(ctx, &host_histogram_64, collective::Op::kSum); - SafeColl(rc); - // convert back to GradientPairInt64 - // only copy to Rank 0, clear other ranks to simulate the plugin scenario - for (auto i = 0; i < host_histogram.size(); i++) { - GradientPairInt64 hist_item(host_histogram_64[i * 2], host_histogram_64[i * 2 + 1]); - GradientPairInt64 hist_item_empty(0, 0); + for (auto i = 0; i < histogram.size(); i++) { + double grad = 0.0; + double hess = 0.0; + for (auto rank = 0; rank < world_size; ++rank) { + auto idx = rank * histogram.size() + i; + grad += hist_aggr[idx * 2]; + hess += hist_aggr[idx * 2 + 1]; + } + GradientPairPrecise hist_item(grad, hess); + GradientPairPrecise hist_item_empty(0.0, 0.0); if (collective::GetRank() != 0) { - hist_item = hist_item_empty; + host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); } else { - host_histogram[i] = hist_item; + host_histogram[i] = rounding.ToFixedPoint(hist_item); } } + // copy the aggregated histogram back to GPU memory // at this point, the histogram contains full information from all parties dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), - histogram.size() * sizeof(GradientPairPrecise), + histogram.size() * sizeof(GradientPairInt64), cudaMemcpyHostToDevice)); - } } } // namespace xgboost::tree From aa5b51b43308e953d7519c7173e3f23a1b99a8ec Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 12:09:21 -0400 Subject: [PATCH 19/26] fix code linting and test scripts --- src/tree/gpu_hist/evaluate_splits.cu | 6 +++-- src/tree/gpu_hist/histogram.cu | 21 ++++++++++------- tests/cpp/tree/gpu_hist/test_histogram.cu | 28 +++++++++++------------ tests/cpp/tree/test_gpu_hist.cu 
| 4 ++-- 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 3b5c1b76fa1c..e536fc0ac90b 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -406,7 +406,8 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector splits_out_storage(d_inputs.size()); auto out_splits = dh::ToSpan(splits_out_storage); - bool is_passive_party = is_column_split_ && collective::IsEncrypted() && collective::GetRank() != 0; + bool is_passive_party = is_column_split_ && collective::IsEncrypted() + && collective::GetRank() != 0; bool is_active_party = !is_passive_party; // Under secure vertical setting, only the active party is able to evaluate the split // based on global histogram. Other parties will receive the final best split information @@ -421,7 +422,8 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector all_candidate_storage(out_splits.size() * world_size); + dh::TemporaryArray all_candidate_storage( + out_splits.size() * world_size); auto all_candidates = dh::ToSpan(all_candidate_storage); auto current_rank = all_candidates.subspan(collective::GetRank() * out_splits.size(), out_splits.size()); diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index d87f5cc077ac..def1b3016af3 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -388,11 +388,12 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, common::Span ridx, common::Span histogram, GradientQuantiser rounding, MetaInfo const& info) { - - auto IsSecureVertical = !info.IsRowSplit() && collective::IsDistributed() && collective::IsEncrypted(); + auto IsSecureVertical = !info.IsRowSplit() && collective::IsDistributed() + && collective::IsEncrypted(); if (!IsSecureVertical) { // Regular training, build histogram locally - this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); + this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, + gpair, ridx, histogram, rounding); } else { // Encrypted vertical, build histogram using federated plugin auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); @@ -400,11 +401,12 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, auto plugin = fed.EncryptionPlugin(); // Transmit matrix to plugin - if(!is_aggr_context_initialized_){ + if (!is_aggr_context_initialized_) { // Get cutptrs std::vector h_cuts_ptr(matrix.feature_segments.size()); dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); - common::Span cutptrs = common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); + common::Span cutptrs = + common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); // Get bin_idx matrix auto kRows = matrix.n_rows; @@ -414,7 +416,8 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, thrust::device_vector matrix_d(kRows * kCols); dh::LaunchN(kRows * kCols, ReadMatrixFunction(matrix, kCols, matrix_d.data().get())); thrust::copy(matrix_d.begin(), matrix_d.end(), h_bin_idx.begin()); - common::Span bin_idx = common::Span(h_bin_idx.data(), h_bin_idx.size()); + common::Span bin_idx = + common::Span(h_bin_idx.data(), h_bin_idx.size()); // Initialize plugin context plugin->Reset(h_cuts_ptr, h_bin_idx); @@ -443,12 +446,14 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, HostDeviceVector hist_entries; std::vector recv_segments; collective::SafeColl( - 
collective::AllgatherV(ctx, linalg::MakeVec(hist_data), &recv_segments, &hist_entries)); + collective::AllgatherV(ctx, linalg::MakeVec(hist_data), + &recv_segments, &hist_entries)); // Call the plugin here to get the resulting histogram. Histogram from all workers are // gathered to the label owner. common::Span hist_aggr = - plugin->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); + plugin->SyncEncryptedHistVert( + common::RestoreType(hist_entries.HostSpan())); // Post process the AllGathered data auto world_size = collective::GetWorldSize(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index c9320f616983..f73fff58f05d 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -80,9 +80,9 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - d_histogram, quantiser); + d_histogram, quantiser, MetaInfo()); std::vector histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(histogram_h.data(), d_histogram.data(), @@ -95,9 +95,9 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - d_new_histogram, quantiser); + d_new_histogram, quantiser, MetaInfo()); std::vector new_histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(new_histogram_h.data(), d_new_histogram.data(), @@ -119,9 +119,9 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) dh::device_vector baseline(num_bins); DeviceHistogramBuilder builder; builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - dh::ToSpan(baseline), quantiser); + dh::ToSpan(baseline), quantiser, MetaInfo()); std::vector baseline_h(num_bins); dh::safe_cuda(cudaMemcpy(baseline_h.data(), baseline.data().get(), @@ -185,9 +185,9 @@ void TestGPUHistogramCategorical(size_t num_categories) { FeatureGroups single_group(page->Cuts()); DeviceHistogramBuilder builder; builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - dh::ToSpan(cat_hist), quantiser); + dh::ToSpan(cat_hist), quantiser, MetaInfo()); } /** @@ -201,9 +201,9 @@ void TestGPUHistogramCategorical(size_t num_categories) { FeatureGroups single_group(page->Cuts()); DeviceHistogramBuilder builder; builder.Reset(&ctx, 
single_group.DeviceAccessor(ctx.Device()), false); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - dh::ToSpan(encode_hist), quantiser); + dh::ToSpan(encode_hist), quantiser, MetaInfo()); } std::vector h_cat_hist(cat_hist.size()); @@ -350,9 +350,9 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamDeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, impl->GetDeviceAccessor(ctx.Device()), fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, - d_histogram, quantiser); + d_histogram, quantiser, MetaInfo()); ++k; } ASSERT_EQ(k, n_batches); @@ -373,9 +373,9 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamDeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page.GetDeviceAccessor(ctx.Device()), fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, - d_histogram, quantiser); + d_histogram, quantiser, MetaInfo()); } std::vector h_single(single_hist.size()); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index edd129353bdf..454d05d14df1 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -79,10 +79,10 @@ void TestBuildHist(bool use_shared_memory_histograms) { DeviceHistogramBuilder builder; builder.Reset(&ctx, maker.feature_groups->DeviceAccessor(ctx.Device()), !use_shared_memory_histograms); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), maker.feature_groups->DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), maker.row_partitioner->GetRows(0), maker.hist.GetNodeHistogram(0), - *maker.quantiser); + *maker.quantiser, MetaInfo()); DeviceHistogramStorage<>& d_hist = maker.hist; From 7480ed339752eedc85d2d15b57086e39ea9b61df Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 15:01:11 -0400 Subject: [PATCH 20/26] wrap plugin calls into federated --- src/tree/gpu_hist/histogram.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index def1b3016af3..4aa69949f989 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -395,6 +395,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); } else { + #if defined(XGBOOST_USE_FEDERATED) // Encrypted vertical, build histogram using federated plugin auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); auto const &fed = dynamic_cast(comm); @@ -480,6 +481,9 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), histogram.size() * sizeof(GradientPairInt64), cudaMemcpyHostToDevice)); + #else + LOG(FATAL) << error::NoFederated(); + #endif } } } // namespace xgboost::tree From 4587b2ef6dc8ff7bb8d9dd5d59d372a4e2d9a23e Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 18:55:56 -0400 Subject: [PATCH 21/26] only rank 0 need histogram sync result --- src/tree/gpu_hist/histogram.cu | 49 ++++++++++++++++++---------------- 1 file changed, 26 
insertions(+), 23 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 4aa69949f989..e33b30c8bfac 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -451,36 +451,39 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, &recv_segments, &hist_entries)); // Call the plugin here to get the resulting histogram. Histogram from all workers are - // gathered to the label owner. + // gathered to the label owner. common::Span hist_aggr = plugin->SyncEncryptedHistVert( common::RestoreType(hist_entries.HostSpan())); // Post process the AllGathered data - auto world_size = collective::GetWorldSize(); - std::vector host_histogram(histogram.size()); - for (auto i = 0; i < histogram.size(); i++) { - double grad = 0.0; - double hess = 0.0; - for (auto rank = 0; rank < world_size; ++rank) { - auto idx = rank * histogram.size() + i; - grad += hist_aggr[idx * 2]; - hess += hist_aggr[idx * 2 + 1]; + // This is only needed by Rank 0 + if (collective::GetRank() == 0) { + auto world_size = collective::GetWorldSize(); + std::vector host_histogram(histogram.size()); + for (auto i = 0; i < histogram.size(); i++) { + double grad = 0.0; + double hess = 0.0; + for (auto rank = 0; rank < world_size; ++rank) { + auto idx = rank * histogram.size() + i; + grad += hist_aggr[idx * 2]; + hess += hist_aggr[idx * 2 + 1]; + } + GradientPairPrecise hist_item(grad, hess); + GradientPairPrecise hist_item_empty(0.0, 0.0); + if (collective::GetRank() != 0) { + host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); + } else { + host_histogram[i] = rounding.ToFixedPoint(hist_item); + } } - GradientPairPrecise hist_item(grad, hess); - GradientPairPrecise hist_item_empty(0.0, 0.0); - if (collective::GetRank() != 0) { - host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); - } else { - host_histogram[i] = rounding.ToFixedPoint(hist_item); - } - } - // copy the aggregated histogram back to GPU memory - // at this point, the histogram contains full information from all parties - dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), - histogram.size() * sizeof(GradientPairInt64), - cudaMemcpyHostToDevice)); + // copy the aggregated histogram back to GPU memory + // at this point, the histogram contains full information from all parties + dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), + histogram.size() * sizeof(GradientPairInt64), + cudaMemcpyHostToDevice)); + } #else LOG(FATAL) << error::NoFederated(); #endif From ad213148ef944db5b74d3ed2a03d11e854ee9a78 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 19:01:55 -0400 Subject: [PATCH 22/26] Update histogram.cu --- src/tree/gpu_hist/histogram.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index e33b30c8bfac..68427d21322c 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -451,7 +451,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, &recv_segments, &hist_entries)); // Call the plugin here to get the resulting histogram. Histogram from all workers are - // gathered to the label owner. 
+ // gathered to the label owner common::Span hist_aggr = plugin->SyncEncryptedHistVert( common::RestoreType(hist_entries.HostSpan())); From 397ade38d2906e4c39a88ce8927d6fbaed801d74 Mon Sep 17 00:00:00 2001 From: Zhihong Zhang Date: Thu, 1 Aug 2024 19:06:36 -0400 Subject: [PATCH 23/26] Added check for passive when sync histo for vertical and removed some nested blocks --- src/tree/gpu_hist/histogram.cu | 181 +++++++++++++++++---------------- 1 file changed, 92 insertions(+), 89 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 68427d21322c..93678496cda1 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -394,99 +394,102 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, // Regular training, build histogram locally this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); - } else { + return; + } #if defined(XGBOOST_USE_FEDERATED) - // Encrypted vertical, build histogram using federated plugin - auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); - auto const &fed = dynamic_cast(comm); - auto plugin = fed.EncryptionPlugin(); - - // Transmit matrix to plugin - if (!is_aggr_context_initialized_) { - // Get cutptrs - std::vector h_cuts_ptr(matrix.feature_segments.size()); - dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); - common::Span cutptrs = - common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); - - // Get bin_idx matrix - auto kRows = matrix.n_rows; - auto kCols = matrix.NumFeatures(); - std::vector h_bin_idx(kRows * kCols); - // Access GPU matrix data - thrust::device_vector matrix_d(kRows * kCols); - dh::LaunchN(kRows * kCols, ReadMatrixFunction(matrix, kCols, matrix_d.data().get())); - thrust::copy(matrix_d.begin(), matrix_d.end(), h_bin_idx.begin()); - common::Span bin_idx = - common::Span(h_bin_idx.data(), h_bin_idx.size()); - - // Initialize plugin context - plugin->Reset(h_cuts_ptr, h_bin_idx); - is_aggr_context_initialized_ = true; - } + // Encrypted vertical, build histogram using federated plugin + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + auto plugin = fed.EncryptionPlugin(); + + // Transmit matrix to plugin + if (!is_aggr_context_initialized_) { + // Get cutptrs + std::vector h_cuts_ptr(matrix.feature_segments.size()); + dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); + common::Span cutptrs = + common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); + + // Get bin_idx matrix + auto kRows = matrix.n_rows; + auto kCols = matrix.NumFeatures(); + std::vector h_bin_idx(kRows * kCols); + // Access GPU matrix data + thrust::device_vector matrix_d(kRows * kCols); + dh::LaunchN(kRows * kCols, ReadMatrixFunction(matrix, kCols, matrix_d.data().get())); + thrust::copy(matrix_d.begin(), matrix_d.end(), h_bin_idx.begin()); + common::Span bin_idx = + common::Span(h_bin_idx.data(), h_bin_idx.size()); + + // Initialize plugin context + plugin->Reset(h_cuts_ptr, h_bin_idx); + is_aggr_context_initialized_ = true; + } - // get row indices from device - std::vector h_ridx(ridx.size()); - dh::CopyDeviceSpanToVector(&h_ridx, ridx); - // necessary conversions to fit plugin expectations - std::vector h_ridx_64(ridx.size()); - for (int i = 0; i < ridx.size(); i++) { - h_ridx_64[i] = h_ridx[i]; - } - std::vector ptrs(1); - std::vector sizes(1); - std::vector nodes(1); - ptrs[0] = reinterpret_cast(h_ridx_64.data()); - sizes[0] = 
h_ridx_64.size(); - nodes[0] = 0; - - // Transmit row indices to plugin and get encrypted histogram - auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); - - // Perform AllGather - HostDeviceVector hist_entries; - std::vector recv_segments; - collective::SafeColl( - collective::AllgatherV(ctx, linalg::MakeVec(hist_data), - &recv_segments, &hist_entries)); - - // Call the plugin here to get the resulting histogram. Histogram from all workers are - // gathered to the label owner - common::Span hist_aggr = - plugin->SyncEncryptedHistVert( - common::RestoreType(hist_entries.HostSpan())); - - // Post process the AllGathered data - // This is only needed by Rank 0 - if (collective::GetRank() == 0) { - auto world_size = collective::GetWorldSize(); - std::vector host_histogram(histogram.size()); - for (auto i = 0; i < histogram.size(); i++) { - double grad = 0.0; - double hess = 0.0; - for (auto rank = 0; rank < world_size; ++rank) { - auto idx = rank * histogram.size() + i; - grad += hist_aggr[idx * 2]; - hess += hist_aggr[idx * 2 + 1]; - } - GradientPairPrecise hist_item(grad, hess); - GradientPairPrecise hist_item_empty(0.0, 0.0); - if (collective::GetRank() != 0) { - host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); - } else { - host_histogram[i] = rounding.ToFixedPoint(hist_item); - } - } + // get row indices from device + std::vector h_ridx(ridx.size()); + dh::CopyDeviceSpanToVector(&h_ridx, ridx); + // necessary conversions to fit plugin expectations + std::vector h_ridx_64(ridx.size()); + for (int i = 0; i < ridx.size(); i++) { + h_ridx_64[i] = h_ridx[i]; + } + std::vector ptrs(1); + std::vector sizes(1); + std::vector nodes(1); + ptrs[0] = reinterpret_cast(h_ridx_64.data()); + sizes[0] = h_ridx_64.size(); + nodes[0] = 0; + + // Transmit row indices to plugin and get encrypted histogram + auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); + + // Perform AllGather + HostDeviceVector hist_entries; + std::vector recv_segments; + collective::SafeColl( + collective::AllgatherV(ctx, linalg::MakeVec(hist_data), + &recv_segments, &hist_entries)); + + if (collective::GetRank() != 0) { + // This is only needed for lable owner + return; + } - // copy the aggregated histogram back to GPU memory - // at this point, the histogram contains full information from all parties - dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), - histogram.size() * sizeof(GradientPairInt64), - cudaMemcpyHostToDevice)); + // Call the plugin here to get the resulting histogram. Histogram from all workers are + // gathered to the label owner. 
+ common::Span hist_aggr = + plugin->SyncEncryptedHistVert( + common::RestoreType(hist_entries.HostSpan())); + + // Post process the AllGathered data + auto world_size = collective::GetWorldSize(); + std::vector host_histogram(histogram.size()); + for (auto i = 0; i < histogram.size(); i++) { + double grad = 0.0; + double hess = 0.0; + for (auto rank = 0; rank < world_size; ++rank) { + auto idx = rank * histogram.size() + i; + grad += hist_aggr[idx * 2]; + hess += hist_aggr[idx * 2 + 1]; + } + GradientPairPrecise hist_item(grad, hess); + GradientPairPrecise hist_item_empty(0.0, 0.0); + if (collective::GetRank() != 0) { + host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); + } else { + host_histogram[i] = rounding.ToFixedPoint(hist_item); } - #else - LOG(FATAL) << error::NoFederated(); - #endif } + + // copy the aggregated histogram back to GPU memory + // at this point, the histogram contains full information from all parties + dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), + histogram.size() * sizeof(GradientPairInt64), + cudaMemcpyHostToDevice)); +#else + LOG(FATAL) << error::NoFederated(); +#endif + } } // namespace xgboost::tree From 61d08217ac70c8802ecebe2d010b42d0306cce29 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 19:43:42 -0400 Subject: [PATCH 24/26] Code clean --- src/tree/gpu_hist/histogram.cu | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 93678496cda1..3f608f1c4f64 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -452,11 +452,11 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, &recv_segments, &hist_entries)); if (collective::GetRank() != 0) { - // This is only needed for lable owner + // Below is only needed for lable owner return; } - // Call the plugin here to get the resulting histogram. Histogram from all workers are + // Call the plugin to get the resulting histogram. Histogram from all workers are // gathered to the label owner. 
common::Span hist_aggr = plugin->SyncEncryptedHistVert( @@ -474,12 +474,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, hess += hist_aggr[idx * 2 + 1]; } GradientPairPrecise hist_item(grad, hess); - GradientPairPrecise hist_item_empty(0.0, 0.0); - if (collective::GetRank() != 0) { - host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); - } else { - host_histogram[i] = rounding.ToFixedPoint(hist_item); - } + host_histogram[i] = rounding.ToFixedPoint(hist_item); } // copy the aggregated histogram back to GPU memory @@ -490,6 +485,5 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, #else LOG(FATAL) << error::NoFederated(); #endif - } } // namespace xgboost::tree From ba825213f56f78226c47003e25741b30f0b84db2 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 9 Aug 2024 10:53:25 -0400 Subject: [PATCH 25/26] updates for PR checks --- src/tree/gpu_hist/histogram.cu | 16 ++++++++-------- src/tree/gpu_hist/histogram.cuh | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 3f608f1c4f64..2b52f03a1dbf 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -365,16 +365,16 @@ void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor con struct ReadMatrixFunction { EllpackDeviceAccessor matrix; - int kCols; + int k_cols; bst_float* matrix_data_d; - ReadMatrixFunction(EllpackDeviceAccessor matrix, int kCols, bst_float* matrix_data_d) - : matrix(std::move(matrix)), kCols(kCols), matrix_data_d(matrix_data_d) {} + ReadMatrixFunction(EllpackDeviceAccessor matrix, int k_cols, bst_float* matrix_data_d) + : matrix(std::move(matrix)), k_cols(k_cols), matrix_data_d(matrix_data_d) {} __device__ void operator()(size_t global_idx) { - auto row = global_idx / kCols; - auto col = global_idx % kCols; + auto row = global_idx / k_cols; + auto col = global_idx % k_cols; auto value = matrix.GetBinIndex(row, col); - if (isnan(value)) { + if (isnan(float(value))) { value = -1; } matrix_data_d[global_idx] = value; @@ -403,7 +403,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, auto plugin = fed.EncryptionPlugin(); // Transmit matrix to plugin - if (!is_aggr_context_initialized_) { + if (!is_aggr_context_initialized) { // Get cutptrs std::vector h_cuts_ptr(matrix.feature_segments.size()); dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); @@ -423,7 +423,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, // Initialize plugin context plugin->Reset(h_cuts_ptr, h_bin_idx); - is_aggr_context_initialized_ = true; + is_aggr_context_initialized = true; } // get row indices from device diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 08ba9be856fc..aedcacbb92d8 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -176,7 +176,7 @@ class DeviceHistogramBuilder { DeviceHistogramBuilder(); ~DeviceHistogramBuilder(); // Whether to secure aggregation context has been initialized - bool is_aggr_context_initialized_{false}; + bool is_aggr_context_initialized{false}; void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, bool force_global_memory); void BuildHistogram(Context const* ctx, EllpackDeviceAccessor const& matrix, From 45b4e2ebc49342891fe3db02d3f2ae53a08929ed Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 9 Aug 2024 11:03:22 -0400 Subject: [PATCH 26/26] updates for PR checks --- src/tree/gpu_hist/histogram.cu | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 2b52f03a1dbf..073faa6f5203 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -374,7 +374,7 @@ struct ReadMatrixFunction { auto row = global_idx / k_cols; auto col = global_idx % k_cols; auto value = matrix.GetBinIndex(row, col); - if (isnan(float(value))) { + if (isnan(static_cast<float>(value))) { value = -1; } matrix_data_d[global_idx] = value;
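
Taken together, [PATCH 18/26] through [PATCH 26/26] converge on a fairly small control flow for the secure vertical path in DeviceHistogramBuilder::BuildHistogram. The sketch below condenses that flow for readers following the series; it is illustrative rather than a drop-in replacement for the actual hunks: the wrapper name and parameter list are invented for the example, template arguments stripped from the patch text (HostDeviceVector<std::int8_t>, common::Span<double>, RestoreType<double>) are filled in from context, and the plugin Reset/bin-index upload, the XGBOOST_USE_FEDERATED guard, and the row-index marshalling are omitted. It assumes the same headers the patches add (allgather.h, federated_hist.h, device_helpers.cuh).

// Sketch only: condensed from the hunks above; assumes the federated plugin
// interface used in this series, not a definitive implementation.
template <typename Plugin>
void SecureVerticalHistSketch(Context const* ctx, Plugin* plugin,
                              common::Span<std::uint64_t const*> ptrs,
                              common::Span<std::size_t const> sizes,
                              common::Span<bst_node_t const> nodes,
                              common::Span<GradientPairInt64> histogram,
                              GradientQuantiser const& rounding) {
  // 1. Each party builds its encrypted histogram slice through the plugin.
  auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes);

  // 2. Encrypted buffers from all workers are gathered with AllgatherV.
  HostDeviceVector<std::int8_t> hist_entries;
  std::vector<std::int64_t> recv_segments;
  collective::SafeColl(
      collective::AllgatherV(ctx, linalg::MakeVec(hist_data), &recv_segments, &hist_entries));

  // 3. Only the label owner (rank 0) decrypts and post-processes the result.
  if (collective::GetRank() != 0) {
    return;
  }
  common::Span<double> hist_aggr =
      plugin->SyncEncryptedHistVert(common::RestoreType<double>(hist_entries.HostSpan()));

  // 4. Accumulate the per-rank (grad, hess) pairs, re-quantise, and copy the
  //    full histogram back to GPU memory.
  auto world_size = collective::GetWorldSize();
  std::vector<GradientPairInt64> host_histogram(histogram.size());
  for (std::size_t i = 0; i < histogram.size(); ++i) {
    double grad = 0.0, hess = 0.0;
    for (std::int32_t rank = 0; rank < world_size; ++rank) {
      auto idx = rank * histogram.size() + i;
      grad += hist_aggr[idx * 2];
      hess += hist_aggr[idx * 2 + 1];
    }
    host_histogram[i] = rounding.ToFixedPoint(GradientPairPrecise{grad, hess});
  }
  dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(),
                                histogram.size() * sizeof(GradientPairInt64),
                                cudaMemcpyHostToDevice));
}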