From aad29dfb31ada247310a6ce1c4a783deeac74052 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Wed, 17 Jul 2024 09:31:44 -0400 Subject: [PATCH 01/26] secure horizontal for GPU --- src/tree/updater_gpu_hist.cu | 45 +++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 19957857218d..3ae440359af3 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -43,6 +43,9 @@ #include "xgboost/task.h" // for ObjInfo #include "xgboost/tree_model.h" +#include "../collective/communicator-inl.h" +#include "../collective/allgather.h" // for AllgatherV + namespace xgboost::tree { #if !defined(GTEST_TEST) DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); @@ -665,7 +668,47 @@ struct GPUHistMakerDevice { // Reduce all in one go // This gives much better latency in a distributed setting // when processing a large batch - this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); + // If secure horizontal, perform AllReduce by calling the encryption plugin + if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { + // Get encryption plugin + decltype(std::declval().EncryptionPlugin()) plugin_; + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + + plugin_ = fed.EncryptionPlugin(); + + // Get the histogram data + std::size_t n = page->Cuts().TotalBins() * 2 * hist_nidx.size(); + auto d_node_hist = hist.GetNodeHistogram(hist_nidx.at(0)).data(); + using ReduceT = typename std::remove_pointer::type::ValueT; + auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); + + // copy the histogram out of GPU memory + common::Span erased = common::EraseType(hist_vec.Values()); + std::vector h_data(erased.size()); + cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); + + // call the encryption plugin + auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; + auto hist_buf = plugin_->BuildEncryptedHistHori(src_hist); + + // allgather + HostDeviceVector hist_entries; + std::vector recv_segments; + auto rc = + collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); + collective::SafeColl(rc); + + // call the encryption plugin to aggregate the histograms + auto hist_aggr = + plugin_->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); + + // copy the aggregated histogram back to GPU memory + cudaMemcpy(erased.data(), hist_aggr.data(), erased.size(), cudaMemcpyHostToDevice); + } + else { + this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); + } for (size_t i = 0; i < subtraction_nidx.size(); i++) { auto build_hist_nidx = hist_nidx.at(i); From b448dffb8fad97670aaf5101c9c275308cef7f1c Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Wed, 17 Jul 2024 18:20:29 -0400 Subject: [PATCH 02/26] more structured refinement --- src/tree/updater_gpu_hist.cu | 107 ++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 40 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3ae440359af3..606b1be5960d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -46,6 +46,7 @@ #include "../collective/communicator-inl.h" #include "../collective/allgather.h" // for AllgatherV +#include namespace xgboost::tree { #if !defined(GTEST_TEST) DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); @@ -623,17 +624,67 @@ struct GPUHistMakerDevice { // num histograms is 
the number of contiguous histograms in memory to reduce over void AllReduceHist(int nidx, int num_histograms) { monitor.Start("AllReduce"); + std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; + auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); + + // print out the first histogram with iterator + //std::vector entry(hist_vec.Values().size()); + //dh::CopyDeviceSpanToVector(&entry, hist_vec.Values()); + //printf("Non-enc: Rank %d Before AllReduce: %ld\n", collective::GetRank(), entry[0]); + auto rc = collective::GlobalSum( - ctx_, info_, - linalg::MakeVec(reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * 2 * num_histograms, ctx_->Device())); + ctx_, info_, hist_vec); SafeColl(rc); + // print out the first histogram with iterator + //dh::CopyDeviceSpanToVector(&entry, hist_vec.Values()); + //printf("Non-enc: Rank %d After AllReduce: %ld\n", collective::GetRank(), entry[0]); + monitor.Stop("AllReduce"); } + void AllReduceHistEncrypted(int nidx, int num_histograms) { + monitor.Start("AllReduceEncrypted"); + // Get encryption plugin + decltype(std::declval().EncryptionPlugin()) plugin; + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + plugin = fed.EncryptionPlugin(); + + // Get the histogram data + std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; + auto d_node_hist = hist.GetNodeHistogram(nidx).data(); + using ReduceT = typename std::remove_pointer::type::ValueT; + auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); + + // copy the histogram out of GPU memory + common::Span erased = common::EraseType(hist_vec.Values()); + std::vector h_data(erased.size()); + cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); + + // call the encryption plugin + auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; + auto hist_buf = plugin->BuildEncryptedHistHori(src_hist); + + // allgather + HostDeviceVector hist_entries; + std::vector recv_segments; + auto rc = + collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); + collective::SafeColl(rc); + + // call the encryption plugin to aggregate the histograms + auto hist_aggr = + plugin->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); + + // copy the aggregated histogram back to GPU memory + cudaMemcpy(erased.data(), hist_aggr.data(), erased.size(), cudaMemcpyHostToDevice); + + monitor.Stop("AllReduceEncrypted"); + } + /** * \brief Build GPU local histograms for the left and right child of some parent node */ @@ -670,41 +721,7 @@ struct GPUHistMakerDevice { // when processing a large batch // If secure horizontal, perform AllReduce by calling the encryption plugin if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { - // Get encryption plugin - decltype(std::declval().EncryptionPlugin()) plugin_; - auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); - auto const &fed = dynamic_cast(comm); - - plugin_ = fed.EncryptionPlugin(); - - // Get the histogram data - std::size_t n = page->Cuts().TotalBins() * 2 * hist_nidx.size(); - auto d_node_hist = hist.GetNodeHistogram(hist_nidx.at(0)).data(); - using ReduceT = typename std::remove_pointer::type::ValueT; - auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, 
ctx_->Device()); - - // copy the histogram out of GPU memory - common::Span erased = common::EraseType(hist_vec.Values()); - std::vector h_data(erased.size()); - cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); - - // call the encryption plugin - auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; - auto hist_buf = plugin_->BuildEncryptedHistHori(src_hist); - - // allgather - HostDeviceVector hist_entries; - std::vector recv_segments; - auto rc = - collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); - collective::SafeColl(rc); - - // call the encryption plugin to aggregate the histograms - auto hist_aggr = - plugin_->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); - - // copy the aggregated histogram back to GPU memory - cudaMemcpy(erased.data(), hist_aggr.data(), erased.size(), cudaMemcpyHostToDevice); + this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); } else { this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); @@ -718,7 +735,12 @@ struct GPUHistMakerDevice { if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { // Calculate other histogram manually this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, 1); + if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { + this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); + } + else { + this->AllReduceHist(subtraction_trick_nidx, 1); + } } } } @@ -792,7 +814,12 @@ struct GPUHistMakerDevice { hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, 1); + if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { + this->AllReduceHistEncrypted(kRootNIdx, 1); + } + else { + this->AllReduceHist(kRootNIdx, 1); + } // Remember root stats auto root_sum = quantiser.ToFloatingPoint(root_sum_quantised); From 7d6d5921c6ba1c29bb86b83786f7420fd7f0a6b3 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 18 Jul 2024 16:26:24 -0400 Subject: [PATCH 03/26] add the missing aggr piece to secure hori GPU --- src/tree/updater_gpu_hist.cu | 92 +++++++++++++++++------------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 606b1be5960d..5600da12d9f4 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -628,61 +628,57 @@ struct GPUHistMakerDevice { auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); - - // print out the first histogram with iterator - //std::vector entry(hist_vec.Values().size()); - //dh::CopyDeviceSpanToVector(&entry, hist_vec.Values()); - //printf("Non-enc: Rank %d Before AllReduce: %ld\n", collective::GetRank(), entry[0]); - auto rc = collective::GlobalSum( ctx_, info_, hist_vec); SafeColl(rc); - - // print out the first histogram with iterator - //dh::CopyDeviceSpanToVector(&entry, hist_vec.Values()); - //printf("Non-enc: Rank %d After AllReduce: %ld\n", collective::GetRank(), entry[0]); - monitor.Stop("AllReduce"); } void AllReduceHistEncrypted(int nidx, int num_histograms) { - monitor.Start("AllReduceEncrypted"); - // Get encryption plugin - decltype(std::declval().EncryptionPlugin()) plugin; - auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); - auto const &fed = 
dynamic_cast(comm); - plugin = fed.EncryptionPlugin(); - - // Get the histogram data - std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; - auto d_node_hist = hist.GetNodeHistogram(nidx).data(); - using ReduceT = typename std::remove_pointer::type::ValueT; - auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); - - // copy the histogram out of GPU memory - common::Span erased = common::EraseType(hist_vec.Values()); - std::vector h_data(erased.size()); - cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); - - // call the encryption plugin - auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; - auto hist_buf = plugin->BuildEncryptedHistHori(src_hist); - - // allgather - HostDeviceVector hist_entries; - std::vector recv_segments; - auto rc = - collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); - collective::SafeColl(rc); - - // call the encryption plugin to aggregate the histograms - auto hist_aggr = - plugin->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); - - // copy the aggregated histogram back to GPU memory - cudaMemcpy(erased.data(), hist_aggr.data(), erased.size(), cudaMemcpyHostToDevice); - - monitor.Stop("AllReduceEncrypted"); + monitor.Start("AllReduceEncrypted"); + // Get encryption plugin + decltype(std::declval().EncryptionPlugin()) plugin; + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + plugin = fed.EncryptionPlugin(); + + // Get the histogram data + std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; + auto d_node_hist = hist.GetNodeHistogram(nidx).data(); + using ReduceT = typename std::remove_pointer::type::ValueT; + auto hist_vec = linalg::MakeVec(reinterpret_cast(d_node_hist), n, ctx_->Device()); + + // copy the histogram out of GPU memory + common::Span erased = common::EraseType(hist_vec.Values()); + std::vector h_data(erased.size()); + cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); + + // call the encryption plugin + auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; + auto hist_buf = plugin->BuildEncryptedHistHori(src_hist); + + // allgather + HostDeviceVector hist_entries; + std::vector recv_segments; + auto rc = collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); + collective::SafeColl(rc); + + // call the encryption plugin to decode the histograms + auto hist_aggr = plugin->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); + + // reinterpret the aggregated histogram as a int64_t and aggregate + auto hist_aggr_64 = common::Span{reinterpret_cast(hist_aggr.data()), hist_aggr.size()}; + int num_ranks = collective::GlobalCommGroup()->World(); + for (size_t i = 0; i < n; i++) { + for (int j = 1; j < num_ranks; j++) { + hist_aggr_64[i] = hist_aggr_64[i] + hist_aggr_64[i + j * n]; + } + } + + // copy the aggregated histogram back to GPU memory + cudaMemcpy(erased.data(), hist_aggr_64.data(), erased.size(), cudaMemcpyHostToDevice); + + monitor.Stop("AllReduceEncrypted"); } /** From 7421cfa55ca77d857387337f223b2b1b80317ef2 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 19 Jul 2024 10:29:58 -0400 Subject: [PATCH 04/26] code lint corrections --- src/tree/updater_gpu_hist.cu | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 
5600da12d9f4..f562de17f69c 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -46,7 +46,6 @@ #include "../collective/communicator-inl.h" #include "../collective/allgather.h" // for AllgatherV -#include namespace xgboost::tree { #if !defined(GTEST_TEST) DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); @@ -660,14 +659,17 @@ struct GPUHistMakerDevice { // allgather HostDeviceVector hist_entries; std::vector recv_segments; - auto rc = collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), &recv_segments, &hist_entries); + auto rc = collective::AllgatherV(ctx_, linalg::MakeVec(hist_buf), + &recv_segments, &hist_entries); collective::SafeColl(rc); // call the encryption plugin to decode the histograms - auto hist_aggr = plugin->SyncEncryptedHistHori(common::RestoreType(hist_entries.HostSpan())); + auto hist_aggr = plugin->SyncEncryptedHistHori( + common::RestoreType(hist_entries.HostSpan())); // reinterpret the aggregated histogram as a int64_t and aggregate - auto hist_aggr_64 = common::Span{reinterpret_cast(hist_aggr.data()), hist_aggr.size()}; + auto hist_aggr_64 = common::Span{ + reinterpret_cast(hist_aggr.data()), hist_aggr.size()}; int num_ranks = collective::GlobalCommGroup()->World(); for (size_t i = 0; i < n; i++) { for (int j = 1; j < num_ranks; j++) { @@ -718,8 +720,7 @@ struct GPUHistMakerDevice { // If secure horizontal, perform AllReduce by calling the encryption plugin if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); - } - else { + } else { this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); } @@ -733,8 +734,7 @@ struct GPUHistMakerDevice { this->BuildHist(subtraction_trick_nidx); if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); - } - else { + } else { this->AllReduceHist(subtraction_trick_nidx, 1); } } @@ -812,8 +812,7 @@ struct GPUHistMakerDevice { this->BuildHist(kRootNIdx); if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { this->AllReduceHistEncrypted(kRootNIdx, 1); - } - else { + } else { this->AllReduceHist(kRootNIdx, 1); } From 361e17a0f164980d7b1a050d6d063c5e9f6344be Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 25 Jul 2024 10:06:51 -0400 Subject: [PATCH 05/26] remove redundant plugin init --- src/tree/updater_gpu_hist.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 8d1a38e79822..5f4ed304ea9c 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -530,10 +530,9 @@ struct GPUHistMakerDevice { void AllReduceHistEncrypted(int nidx, int num_histograms) { monitor.Start("AllReduceEncrypted"); // Get encryption plugin - decltype(std::declval().EncryptionPlugin()) plugin; auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); auto const &fed = dynamic_cast(comm); - plugin = fed.EncryptionPlugin(); + auto plugin = fed.EncryptionPlugin(); // Get the histogram data std::size_t n = page->Cuts().TotalBins() * 2 * num_histograms; From 306fc1a5dda46a4d18c102af8b7da390cb13c818 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 25 Jul 2024 16:16:09 -0400 Subject: [PATCH 06/26] update to avoid pipeline stuck for secure vertical --- src/collective/aggregator.h | 2 +- src/gbm/gbtree.cc | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/collective/aggregator.h 
b/src/collective/aggregator.h index f3d2aa090aa9..8ebd39b900fe 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -199,7 +199,7 @@ void BroadcastGradient(Context const* ctx, MetaInfo const& info, GradFn&& grad_f #if defined(XGBOOST_USE_FEDERATED) // Need to encrypt the gradient before broadcasting. common::Span encrypted; - auto const& comm = GlobalCommGroup()->Ctx(ctx, ctx->Device()); + auto const& comm = GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); auto const& fed = dynamic_cast(comm); if (GetRank() == 0) { // Obtain the gradient diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 26c768fafea7..fe95708c7a52 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -237,7 +237,9 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, } } else if (model_.learner_model_param->OutputLength() == 1u) { TreesOneGroup ret; + //std::cout << "BoostNewTrees" << std::endl; BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret); + //std::cout << "UpdateTreeLeaf" << std::endl; UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret); const size_t num_new_trees = ret.size(); new_trees.push_back(std::move(ret)); @@ -312,7 +314,11 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, // update the trees auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_; - StringView msg{ + + //std::cout << "Num_rows: " << p_fmat->Info().num_row_ << " and n_out: " << n_out << std::endl; + + + StringView msg{ "Mismatching size between number of rows from input data and size of gradient vector."}; if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) { CHECK_EQ(n_out % gpair->Size(), 0) << msg; @@ -325,11 +331,18 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, // Rescale learning rate according to the size of trees auto lr = tree_param_.learning_rate; tree_param_.learning_rate /= static_cast(new_trees.size()); - for (auto& up : updaters_) { + + //std::cout << "Update for Rank " << collective::GetRank() << " with gpair size " << gpair->Size() << " and device " << gpair->Device() << std::endl; + + + for (auto& up : updaters_) { up->Update(&tree_param_, gpair, p_fmat, common::Span>{*out_position}, new_trees); } tree_param_.learning_rate = lr; + + //std::cout << "Update done" << std::endl; + } void GBTree::CommitModel(TreesOneIter&& new_trees) { From 906b0fa42b9d048b3626a7a4c40a8720a715c166 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 25 Jul 2024 16:19:23 -0400 Subject: [PATCH 07/26] update to avoid pipeline stuck for secure vertical --- src/gbm/gbtree.cc | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index fe95708c7a52..26c768fafea7 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -237,9 +237,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, } } else if (model_.learner_model_param->OutputLength() == 1u) { TreesOneGroup ret; - //std::cout << "BoostNewTrees" << std::endl; BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret); - //std::cout << "UpdateTreeLeaf" << std::endl; UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret); const size_t num_new_trees = ret.size(); new_trees.push_back(std::move(ret)); @@ -314,11 +312,7 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, // update the trees auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_; - - //std::cout << "Num_rows: " << 
p_fmat->Info().num_row_ << " and n_out: " << n_out << std::endl; - - - StringView msg{ + StringView msg{ "Mismatching size between number of rows from input data and size of gradient vector."}; if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) { CHECK_EQ(n_out % gpair->Size(), 0) << msg; @@ -331,18 +325,11 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, // Rescale learning rate according to the size of trees auto lr = tree_param_.learning_rate; tree_param_.learning_rate /= static_cast(new_trees.size()); - - //std::cout << "Update for Rank " << collective::GetRank() << " with gpair size " << gpair->Size() << " and device " << gpair->Device() << std::endl; - - - for (auto& up : updaters_) { + for (auto& up : updaters_) { up->Update(&tree_param_, gpair, p_fmat, common::Span>{*out_position}, new_trees); } tree_param_.learning_rate = lr; - - //std::cout << "Update done" << std::endl; - } void GBTree::CommitModel(TreesOneIter&& new_trees) { From 35d23f80b768e857bf672acc7c88a601fb6000a1 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 10:00:52 -0400 Subject: [PATCH 08/26] Update src/tree/updater_gpu_hist.cu Co-authored-by: Jiaming Yuan --- src/tree/updater_gpu_hist.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 5f4ed304ea9c..bdcc90a8bda7 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -528,7 +528,7 @@ struct GPUHistMakerDevice { } void AllReduceHistEncrypted(int nidx, int num_histograms) { - monitor.Start("AllReduceEncrypted"); + monitor.Start(__func__); // Get encryption plugin auto const &comm = collective::GlobalCommGroup()->Ctx(ctx_, DeviceOrd::CPU()); auto const &fed = dynamic_cast(comm); From 9ab2d8dcd569bef85c1b62a46093e4e1b6b8be11 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 10:00:57 -0400 Subject: [PATCH 09/26] Update src/tree/updater_gpu_hist.cu Co-authored-by: Jiaming Yuan --- src/tree/updater_gpu_hist.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index bdcc90a8bda7..86ffeecf3cf6 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -573,7 +573,7 @@ struct GPUHistMakerDevice { // copy the aggregated histogram back to GPU memory cudaMemcpy(erased.data(), hist_aggr_64.data(), erased.size(), cudaMemcpyHostToDevice); - monitor.Stop("AllReduceEncrypted"); + monitor.Stop(__func__); } /** From 96ab11311fcfd318fc08d62dee257ebdd3bc625d Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 10:17:19 -0400 Subject: [PATCH 10/26] wrap cuda API calls in safe_cuda --- src/tree/updater_gpu_hist.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 86ffeecf3cf6..58439132ee6f 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -543,7 +543,7 @@ struct GPUHistMakerDevice { // copy the histogram out of GPU memory common::Span erased = common::EraseType(hist_vec.Values()); std::vector h_data(erased.size()); - cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost); + dh::safe_cuda(cudaMemcpy(h_data.data(), erased.data(), erased.size(), cudaMemcpyDeviceToHost)); // call the encryption plugin auto src_hist = common::Span{reinterpret_cast(h_data.data()), n}; From 778d6be01ee6ad7a4b1a3351d59fa4eb74b8d672 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 
2024 13:21:14 -0400 Subject: [PATCH 11/26] add include for federatedcomm --- src/tree/updater_gpu_hist.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 58439132ee6f..a19e2c6f45ea 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -46,6 +46,7 @@ #include "../collective/communicator-inl.h" #include "../collective/allgather.h" // for AllgatherV +#include "../../plugin/federated/federated_comm.h" // for FederatedComm namespace xgboost::tree { #if !defined(GTEST_TEST) From 32c301438187b6cf4238c03228b73c364550bcd5 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 14:53:55 -0400 Subject: [PATCH 12/26] add include conditions --- src/tree/updater_gpu_hist.cu | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index a19e2c6f45ea..78e22ea9c7e1 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -46,7 +46,12 @@ #include "../collective/communicator-inl.h" #include "../collective/allgather.h" // for AllgatherV + +#if defined(XGBOOST_USE_FEDERATED) #include "../../plugin/federated/federated_comm.h" // for FederatedComm +#else +#include "../../common/error_msg.h" // for NoFederated +#endif namespace xgboost::tree { #if !defined(GTEST_TEST) @@ -528,6 +533,7 @@ struct GPUHistMakerDevice { monitor.Stop("AllReduce"); } +#if defined(XGBOOST_USE_FEDERATED) void AllReduceHistEncrypted(int nidx, int num_histograms) { monitor.Start(__func__); // Get encryption plugin @@ -576,6 +582,7 @@ struct GPUHistMakerDevice { monitor.Stop(__func__); } +#endif /** * \brief Build GPU local histograms for the left and right child of some parent node @@ -613,7 +620,9 @@ struct GPUHistMakerDevice { // when processing a large batch // If secure horizontal, perform AllReduce by calling the encryption plugin if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { - this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); + #if defined(XGBOOST_USE_FEDERATED) + this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); + #endif } else { this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); } @@ -627,7 +636,9 @@ struct GPUHistMakerDevice { // Calculate other histogram manually this->BuildHist(subtraction_trick_nidx); if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { - this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); + #if defined(XGBOOST_USE_FEDERATED) + this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); + #endif } else { this->AllReduceHist(subtraction_trick_nidx, 1); } @@ -705,7 +716,9 @@ struct GPUHistMakerDevice { hist.AllocateHistograms(ctx_, {kRootNIdx}); this->BuildHist(kRootNIdx); if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { - this->AllReduceHistEncrypted(kRootNIdx, 1); + #if defined(XGBOOST_USE_FEDERATED) + this->AllReduceHistEncrypted(kRootNIdx, 1); + #endif } else { this->AllReduceHist(kRootNIdx, 1); } From 7df59551e88d145d04961958ccc0aa1b56028141 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 26 Jul 2024 15:12:50 -0400 Subject: [PATCH 13/26] add include conditions --- src/tree/updater_gpu_hist.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 78e22ea9c7e1..7d3069e35f8d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -582,6 +582,8 @@ struct 
GPUHistMakerDevice { monitor.Stop(__func__); } +#else + LOG(FATAL) << error::NoFederated(); #endif /** @@ -622,6 +624,8 @@ struct GPUHistMakerDevice { if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { #if defined(XGBOOST_USE_FEDERATED) this->AllReduceHistEncrypted(hist_nidx.at(0), hist_nidx.size()); + #else + LOG(FATAL) << error::NoFederated(); #endif } else { this->AllReduceHist(hist_nidx.at(0), hist_nidx.size()); @@ -638,6 +642,8 @@ struct GPUHistMakerDevice { if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { #if defined(XGBOOST_USE_FEDERATED) this->AllReduceHistEncrypted(subtraction_trick_nidx, 1); + #else + LOG(FATAL) << error::NoFederated(); #endif } else { this->AllReduceHist(subtraction_trick_nidx, 1); @@ -718,6 +724,8 @@ struct GPUHistMakerDevice { if (collective::IsDistributed() && info_.IsRowSplit() && collective::IsEncrypted()) { #if defined(XGBOOST_USE_FEDERATED) this->AllReduceHistEncrypted(kRootNIdx, 1); + #else + LOG(FATAL) << error::NoFederated(); #endif } else { this->AllReduceHist(kRootNIdx, 1); From 3cc863a266c9b4d6b6a16b2158dd9b820eb733b4 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Mon, 29 Jul 2024 10:19:40 -0400 Subject: [PATCH 14/26] correct import error --- src/tree/updater_gpu_hist.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 7d3069e35f8d..088fc199786b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -50,7 +50,7 @@ #if defined(XGBOOST_USE_FEDERATED) #include "../../plugin/federated/federated_comm.h" // for FederatedComm #else -#include "../../common/error_msg.h" // for NoFederated +#include "../common/error_msg.h" // for NoFederated #endif namespace xgboost::tree { @@ -582,8 +582,6 @@ struct GPUHistMakerDevice { monitor.Stop(__func__); } -#else - LOG(FATAL) << error::NoFederated(); #endif /** From f2b876d182f5412f199b9f89bf1da43585612823 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Tue, 30 Jul 2024 18:21:47 -0400 Subject: [PATCH 15/26] implement alternative vertical pipeline in GPU --- src/collective/aggregator.h | 4 ++ src/common/quantile.cu | 23 ++++++++ src/tree/gpu_hist/evaluate_splits.cu | 65 ++++++++++++++--------- src/tree/gpu_hist/histogram.cu | 79 ++++++++++++++++++++++++++-- src/tree/gpu_hist/histogram.cuh | 5 +- src/tree/updater_gpu_hist.cu | 4 +- 6 files changed, 148 insertions(+), 32 deletions(-) diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h index 8ebd39b900fe..ed548a0f173b 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -230,6 +230,10 @@ void BroadcastGradient(Context const* ctx, MetaInfo const& info, GradFn&& grad_f #else LOG(FATAL) << error::NoFederated(); #endif + + // !!!Temporarily turn on regular gradient broadcasting for testing + // encrypted vertical + ApplyWithLabels(ctx, info, out_gpair->Data(), [&] { grad_fn(out_gpair); }); } else { ApplyWithLabels(ctx, info, out_gpair->Data(), [&] { grad_fn(out_gpair); }); } diff --git a/src/common/quantile.cu b/src/common/quantile.cu index d0356ae421c7..d5e927b30414 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -673,6 +673,7 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i } } + auto secure_vertical = is_column_split && collective::IsEncrypted(); // Set up output cuts for (bst_feature_t i = 0; i < num_columns_; ++i) { size_t column_size = std::max(static_cast(1ul), this->Column(i).size()); @@ -681,6 
+682,11 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i CheckMaxCat(max_values[i].value, column_size); h_out_columns_ptr.push_back(max_values[i].value + 1); // includes both max_cat and 0. } else { + // If vertical and secure mode, we need to sync the max_num_bins aross workers + // to create the same global number of cut point bins for easier future processing + if (secure_vertical) { + collective::SafeColl(collective::Allreduce(ctx, &column_size, collective::Op::kMax)); + } h_out_columns_ptr.push_back( std::min(static_cast(column_size), static_cast(num_bins_))); } @@ -711,6 +717,10 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i out_column[0] = kRtEps; assert(out_column.size() == 1); } + // For secure vertical split, fill all cut values with dummy value + if (secure_vertical) { + out_column[idx] = kRtEps; + } return; } @@ -736,6 +746,19 @@ void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool i out_column[idx] = in_column[idx+1].value; }); + if (secure_vertical) { + // cut values need to be synced across all workers via Allreduce + // To do: apply same inference indexing as CPU, skip for now + auto cut_values_device = p_cuts->cut_values_.DeviceSpan(); + std::vector cut_values_host(cut_values_device.size()); + dh::CopyDeviceSpanToVector(&cut_values_host, cut_values_device); + auto rc = collective::Allreduce(ctx, &cut_values_host, collective::Op::kSum); + SafeColl(rc); + dh::safe_cuda(cudaMemcpyAsync(cut_values_device.data(), cut_values_host.data(), + cut_values_device.size() * sizeof(float), + cudaMemcpyHostToDevice)); + } + p_cuts->SetCategorical(this->has_categorical_, max_cat); timer_.Stop(__func__); } diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 5e225a13f142..3b5c1b76fa1c 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -6,6 +6,7 @@ #include #include "../../collective/allgather.h" +#include "../../collective/broadcast.h" #include "../../common/categorical.h" #include "../../data/ellpack_page.cuh" #include "evaluate_splits.cuh" @@ -404,34 +405,48 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector splits_out_storage(d_inputs.size()); auto out_splits = dh::ToSpan(splits_out_storage); - this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs, - evaluator, out_splits); - if (is_column_split_) { - // With column-wise data split, we gather the split candidates from all the workers and find the - // global best candidates. - auto const world_size = collective::GetWorldSize(); - dh::TemporaryArray all_candidate_storage(out_splits.size() * world_size); - auto all_candidates = dh::ToSpan(all_candidate_storage); - auto current_rank = - all_candidates.subspan(collective::GetRank() * out_splits.size(), out_splits.size()); - dh::safe_cuda(cudaMemcpyAsync(current_rank.data(), out_splits.data(), - out_splits.size() * sizeof(DeviceSplitCandidate), - cudaMemcpyDeviceToDevice)); - auto rc = collective::Allgather( - ctx, linalg::MakeVec(all_candidates.data(), all_candidates.size(), ctx->Device())); - collective::SafeColl(rc); - - // Reduce to get the best candidate from all workers. 
- dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(), - [world_size, all_candidates, out_splits] __device__(size_t i) { - out_splits[i] = all_candidates[i]; - for (auto rank = 1; rank < world_size; rank++) { - out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; - } - }); + bool is_passive_party = is_column_split_ && collective::IsEncrypted() && collective::GetRank() != 0; + bool is_active_party = !is_passive_party; + // Under secure vertical setting, only the active party is able to evaluate the split + // based on global histogram. Other parties will receive the final best split information + // Hence the below computation is not performed by the passive parties + if (is_active_party) { + this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs, + evaluator, out_splits); } + if (is_column_split_) { + if (!collective::IsEncrypted()) { + // With regular column-wise data split, we gather the split candidates from + // all the workers and find the global best candidates. + auto const world_size = collective::GetWorldSize(); + dh::TemporaryArray all_candidate_storage(out_splits.size() * world_size); + auto all_candidates = dh::ToSpan(all_candidate_storage); + auto current_rank = + all_candidates.subspan(collective::GetRank() * out_splits.size(), out_splits.size()); + dh::safe_cuda(cudaMemcpyAsync(current_rank.data(), out_splits.data(), + out_splits.size() * sizeof(DeviceSplitCandidate), + cudaMemcpyDeviceToDevice)); + auto rc = collective::Allgather( + ctx, linalg::MakeVec(all_candidates.data(), all_candidates.size(), ctx->Device())); + collective::SafeColl(rc); + // Reduce to get the best candidate from all workers. + dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(), + [world_size, all_candidates, out_splits] __device__(size_t i) { + out_splits[i] = all_candidates[i]; + for (auto rank = 1; rank < world_size; rank++) { + out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; + } + }); + } else { + // With encrypted column-wise data split, we distribute the best split candidates + // from Rank 0 to all other workers + auto rc = collective::Broadcast( + ctx, linalg::MakeVec(out_splits.data(), out_splits.size(), ctx->Device()), 0); + collective::SafeColl(rc); + } + } auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_entries = out_entries; auto device_cats_accessor = this->DeviceCatStorage(nidx); diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 372a5c09ba0c..d0d2634a1363 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -14,6 +14,13 @@ #include "row_partitioner.cuh" #include "xgboost/base.h" +#include "../../common/device_helpers.cuh" +#if defined(XGBOOST_USE_FEDERATED) +#include "../../../plugin/federated/federated_hist.h" // for FederataedHistPolicy +#else +#include "../../common/error_msg.h" // for NoFederated +#endif + namespace xgboost::tree { namespace { struct Pair { @@ -309,6 +316,21 @@ class DeviceHistogramBuilderImpl { bool force_global_memory) { this->kernel_ = std::make_unique>(ctx, feature_groups, force_global_memory); this->force_global_memory_ = force_global_memory; + + + std::cout << "Reset DeviceHistogramBuilderImpl" << std::endl; + + // Reset federated plugin + // start of every round, transmit the matrix to plugin + #if defined(XGBOOST_USE_FEDERATED) + // Get encryption plugin + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); + auto const &fed = 
dynamic_cast(comm); + auto plugin = fed.EncryptionPlugin(); + // Reset plugin + //plugin->Reset(); + #endif + } void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, @@ -354,13 +376,64 @@ void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor con this->p_impl_->Reset(ctx, feature_groups, force_global_memory); } -void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx, +void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, common::Span gpair, common::Span ridx, common::Span histogram, - GradientQuantiser rounding) { - this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding); + GradientQuantiser rounding, MetaInfo const& info) { + + auto IsSecureVertical = !info.IsRowSplit() && collective::IsDistributed() && collective::IsEncrypted(); + if (!IsSecureVertical) { + // Regular training, build histogram locally + this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); + } else { + // Encrypted vertical, build histogram using federated plugin + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + auto plugin = fed.EncryptionPlugin(); + // Transmit matrix to plugin + //plugin->TransmitMatrix(matrix); + // Transmit row indices to plugin + //plugin->TransmitRowIndices(ridx); + + // !!!Temporarily turn on regular histogram building for testing + // encrypted vertical + this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); + + // Further histogram sync process - simulated + // only the last stage is needed under plugin system + + // copy histogram data to host + std::vector host_histogram(histogram.size()); + dh::CopyDeviceSpanToVector(&host_histogram, histogram); + // convert to regular vector + std::vector host_histogram_64(histogram.size() * 2); + for (auto i = 0; i < host_histogram.size(); i++) { + host_histogram_64[i * 2] = host_histogram[i].GetQuantisedGrad(); + host_histogram_64[i * 2 + 1] = host_histogram[i].GetQuantisedHess(); + } + // aggregate histograms in float + auto rc = collective::Allreduce(ctx, &host_histogram_64, collective::Op::kSum); + SafeColl(rc); + // convert back to GradientPairInt64 + // only copy to Rank 0, clear other ranks to simulate the plugin scenario + for (auto i = 0; i < host_histogram.size(); i++) { + GradientPairInt64 hist_item(host_histogram_64[i * 2], host_histogram_64[i * 2 + 1]); + GradientPairInt64 hist_item_empty(0, 0); + if (collective::GetRank() != 0) { + hist_item = hist_item_empty; + } else { + host_histogram[i] = hist_item; + } + } + // copy the aggregated histogram back to GPU memory + // at this point, the histogram contains full information from all parties + dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), + histogram.size() * sizeof(GradientPairPrecise), + cudaMemcpyHostToDevice)); + + } } } // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 87c60a8bfdbc..5c0d58f2df46 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -178,11 +178,12 @@ class DeviceHistogramBuilder { void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, bool force_global_memory); - void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, + void BuildHistogram(Context const* 
ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, common::Span gpair, common::Span ridx, - common::Span histogram, GradientQuantiser rounding); + common::Span histogram, GradientQuantiser rounding, + MetaInfo const& info); }; } // namespace xgboost::tree #endif // HISTOGRAM_CUH_ diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 088fc199786b..d588e4db9d2b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -249,9 +249,9 @@ struct GPUHistMakerDevice { void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); - this->histogram_.BuildHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), + this->histogram_.BuildHistogram(ctx_, page->GetDeviceAccessor(ctx_->Device()), feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, - d_node_hist, *quantiser); + d_node_hist, *quantiser, info_); } // Attempt to do subtraction trick From 257251947b9e12ace737ea6693ccffe2e7a089f0 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Tue, 30 Jul 2024 22:22:01 -0400 Subject: [PATCH 16/26] transmit necessary info to plugin - align GPU with CPU --- plugin/federated/federated_plugin.cc | 21 ++++++++ src/tree/gpu_hist/histogram.cu | 77 ++++++++++++++++++++-------- src/tree/gpu_hist/histogram.cuh | 3 +- 3 files changed, 78 insertions(+), 23 deletions(-) diff --git a/plugin/federated/federated_plugin.cc b/plugin/federated/federated_plugin.cc index 8488c59b26db..2e36da05145a 100644 --- a/plugin/federated/federated_plugin.cc +++ b/plugin/federated/federated_plugin.cc @@ -21,6 +21,20 @@ namespace xgboost::collective { void FederatedPluginMock::Reset(common::Span cutptrs, common::Span bin_idx) { + + //print some contents of cutptrs and bin_idx + std::cout << "cutptrs.size() = " << cutptrs.size() << std::endl; + for (int i = 0; i < cutptrs.size(); i++) { + std::cout << cutptrs[i] << " "; + } + std::cout << std::endl; + + std::cout << "bin_idx.size() = " << bin_idx.size() << std::endl; + for (int i = 0; i < 3; i++) { + std::cout << bin_idx[i] << " "; + } + std::cout << std::endl; + this->cuts_.resize(cutptrs.size()); std::copy_n(cutptrs.data(), cutptrs.size(), this->cuts_.data()); @@ -70,6 +84,13 @@ void FederatedPluginMock::Reset(common::Span cutptrs, auto hist_buffer = common::Span{hist_plain_}; std::fill_n(hist_buffer.data(), hist_buffer.size(), 0.0); + // print some contents of rowptrs + std::cout << "rowptrs.size() = " << rowptrs.size() << std::endl; + for (int i = 0; i < rowptrs.size(); i++) { + std::cout << rowptrs[i] << " "; + } + std::cout << std::endl; + CHECK_EQ(rowptrs.size(), sizes.size()); CHECK_EQ(nids.size(), sizes.size()); auto gpair = common::RestoreType(common::Span{grad_}); diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index d0d2634a1363..7479e7b36adc 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -316,21 +316,6 @@ class DeviceHistogramBuilderImpl { bool force_global_memory) { this->kernel_ = std::make_unique>(ctx, feature_groups, force_global_memory); this->force_global_memory_ = force_global_memory; - - - std::cout << "Reset DeviceHistogramBuilderImpl" << std::endl; - - // Reset federated plugin - // start of every round, transmit the matrix to plugin - #if defined(XGBOOST_USE_FEDERATED) - // Get encryption plugin - auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); - auto const &fed = dynamic_cast(comm); - auto plugin = 
fed.EncryptionPlugin(); - // Reset plugin - //plugin->Reset(); - #endif - } void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, @@ -394,17 +379,65 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, auto const &fed = dynamic_cast(comm); auto plugin = fed.EncryptionPlugin(); // Transmit matrix to plugin - //plugin->TransmitMatrix(matrix); - // Transmit row indices to plugin - //plugin->TransmitRowIndices(ridx); + if(!is_aggr_context_initialized_){ + std::cout << "Initialized Plugin Context" << std::endl; + // Get cutptrs + std::vector h_cuts_ptr(matrix.feature_segments.size()); + dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); + common::Span cutptrs = common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); + std::cout << "cutptrs.size() = " << h_cuts_ptr.size() << std::endl; + for (int i = 0; i < h_cuts_ptr.size(); i++) { + std::cout << h_cuts_ptr[i] << " "; + } + std::cout << std::endl; + + // Get bin_idx matrix + + + + //common::Span bin_idx + //plugin->Reset(h_cuts_ptr, bin_idx); + is_aggr_context_initialized_ = true; + } + + std::cout << "Transmitting row indices to plugin" << std::endl; + // print a few samples of ridx + std::vector h_ridx(ridx.size()); + dh::CopyDeviceSpanToVector(&h_ridx, ridx); + std::cout << "ridx.size() = " << h_ridx.size() << std::endl; + for (int i = 0; i < 5; i++) { + std::cout << h_ridx[i] << " "; + } + std::cout << std::endl; + + // Transmit row indices to plugin and get encrypted histogram + //hist_data_ = this->plugin_->BuildEncryptedHistVert(ptrs, sizes, nodes); + + // Perform AllGather + std::cout << "Allgather histograms" << std::endl; + /* + HostDeviceVector hist_entries; + std::vector recv_segments; + collective::SafeColl( + collective::AllgatherV(ctx_, linalg::MakeVec(hist_data_), &recv_segments, &hist_entries)); + + // Call the plugin here to get the resulting histogram. Histogram from all workers are + // gathered to the label owner. 
+ common::Span hist_aggr = + plugin_->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); +*/ + + + + + + // !!!Temporarily turn on regular histogram building for testing - // encrypted vertical + // compute local histograms this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); - // Further histogram sync process - simulated - // only the last stage is needed under plugin system - + // Further histogram sync process - simulated with allreduce // copy histogram data to host std::vector host_histogram(histogram.size()); dh::CopyDeviceSpanToVector(&host_histogram, histogram); diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 5c0d58f2df46..08ba9be856fc 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -175,7 +175,8 @@ class DeviceHistogramBuilder { public: DeviceHistogramBuilder(); ~DeviceHistogramBuilder(); - + // Whether to secure aggregation context has been initialized + bool is_aggr_context_initialized_{false}; void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, bool force_global_memory); void BuildHistogram(Context const* ctx, EllpackDeviceAccessor const& matrix, From e42faaa394a04c8bfe216d9a97867ea8af5eefd7 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Wed, 31 Jul 2024 10:58:14 -0400 Subject: [PATCH 17/26] marked calls to plugin - align GPU with CPU --- plugin/federated/federated_plugin.cc | 8 +++++++- src/tree/gpu_hist/histogram.cu | 28 +++++++++++++++++++--------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/plugin/federated/federated_plugin.cc b/plugin/federated/federated_plugin.cc index 2e36da05145a..6ff5aee4602a 100644 --- a/plugin/federated/federated_plugin.cc +++ b/plugin/federated/federated_plugin.cc @@ -85,9 +85,15 @@ void FederatedPluginMock::Reset(common::Span cutptrs, std::fill_n(hist_buffer.data(), hist_buffer.size(), 0.0); // print some contents of rowptrs + std::cout << "rowptrs.size() = " << rowptrs.size() << std::endl; for (int i = 0; i < rowptrs.size(); i++) { - std::cout << rowptrs[i] << " "; + std::cout << sizes[i] << std::endl; + common::Span row_indices{rowptrs[i], rowptrs[i] + sizes[i]}; + for (int j = 0; j < 5; j++) { + std::cout << row_indices[j] << " "; + } + std::cout << std::endl; } std::cout << std::endl; diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 7479e7b36adc..ae1043666624 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -396,25 +396,35 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, //common::Span bin_idx + //********************************************* //plugin->Reset(h_cuts_ptr, bin_idx); + //********************************************* is_aggr_context_initialized_ = true; } - std::cout << "Transmitting row indices to plugin" << std::endl; - // print a few samples of ridx + + // get row indices from device std::vector h_ridx(ridx.size()); dh::CopyDeviceSpanToVector(&h_ridx, ridx); - std::cout << "ridx.size() = " << h_ridx.size() << std::endl; - for (int i = 0; i < 5; i++) { - std::cout << h_ridx[i] << " "; - } - std::cout << std::endl; + + // wrap info following plugin expectations + std::vector ptrs(1); + std::vector sizes(1); + std::vector nodes(1); + ptrs[0] = reinterpret_cast(h_ridx.data()); + sizes[0] = h_ridx.size(); + nodes[0] = 0; // Transmit row indices to plugin and get encrypted histogram - //hist_data_ = this->plugin_->BuildEncryptedHistVert(ptrs, sizes, nodes); + 
std::cout << "Building encrypted histograms with row indices " << std::endl; + //********************************************* + //auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); +//********************************************* // Perform AllGather std::cout << "Allgather histograms" << std::endl; + + //********************************************* /* HostDeviceVector hist_entries; std::vector recv_segments; @@ -426,7 +436,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, common::Span hist_aggr = plugin_->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); */ - + //********************************************* From 26aaded1b5880622454dba63de803fb9f61897d4 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Wed, 31 Jul 2024 19:02:59 -0400 Subject: [PATCH 18/26] secure vertical GPU fully functional --- plugin/federated/federated_plugin.cc | 27 ------ src/collective/aggregator.h | 11 ++- src/tree/gpu_hist/histogram.cu | 118 +++++++++++++-------------- 3 files changed, 65 insertions(+), 91 deletions(-) diff --git a/plugin/federated/federated_plugin.cc b/plugin/federated/federated_plugin.cc index 6ff5aee4602a..8488c59b26db 100644 --- a/plugin/federated/federated_plugin.cc +++ b/plugin/federated/federated_plugin.cc @@ -21,20 +21,6 @@ namespace xgboost::collective { void FederatedPluginMock::Reset(common::Span cutptrs, common::Span bin_idx) { - - //print some contents of cutptrs and bin_idx - std::cout << "cutptrs.size() = " << cutptrs.size() << std::endl; - for (int i = 0; i < cutptrs.size(); i++) { - std::cout << cutptrs[i] << " "; - } - std::cout << std::endl; - - std::cout << "bin_idx.size() = " << bin_idx.size() << std::endl; - for (int i = 0; i < 3; i++) { - std::cout << bin_idx[i] << " "; - } - std::cout << std::endl; - this->cuts_.resize(cutptrs.size()); std::copy_n(cutptrs.data(), cutptrs.size(), this->cuts_.data()); @@ -84,19 +70,6 @@ void FederatedPluginMock::Reset(common::Span cutptrs, auto hist_buffer = common::Span{hist_plain_}; std::fill_n(hist_buffer.data(), hist_buffer.size(), 0.0); - // print some contents of rowptrs - - std::cout << "rowptrs.size() = " << rowptrs.size() << std::endl; - for (int i = 0; i < rowptrs.size(); i++) { - std::cout << sizes[i] << std::endl; - common::Span row_indices{rowptrs[i], rowptrs[i] + sizes[i]}; - for (int j = 0; j < 5; j++) { - std::cout << row_indices[j] << " "; - } - std::cout << std::endl; - } - std::cout << std::endl; - CHECK_EQ(rowptrs.size(), sizes.size()); CHECK_EQ(nids.size(), sizes.size()); auto gpair = common::RestoreType(common::Span{grad_}); diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h index ed548a0f173b..7f3110601ff7 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -227,13 +227,16 @@ void BroadcastGradient(Context const* ctx, MetaInfo const& info, GradFn&& grad_f SafeColl(rc); // Pass the gradient to the plugin fed.EncryptionPlugin()->SyncEncryptedGradient(encrypted); + + // !!!Temporarily solution + // This step is needed for memory allocation in the case of vertical secure GPU + // make out_gpair data value to all zero to avoid information leak + auto gpair_data = out_gpair->Data(); + gpair_data->Fill(GradientPair{0.0f, 0.0f}); + ApplyWithLabels(ctx, info, gpair_data, [&] { grad_fn(out_gpair); }); #else LOG(FATAL) << error::NoFederated(); #endif - - // !!!Temporarily turn on regular gradient broadcasting for testing - // encrypted vertical - ApplyWithLabels(ctx, info, out_gpair->Data(), [&] { grad_fn(out_gpair); }); } else { 
ApplyWithLabels(ctx, info, out_gpair->Data(), [&] { grad_fn(out_gpair); }); } diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index ae1043666624..d87f5cc077ac 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -14,6 +14,8 @@ #include "row_partitioner.cuh" #include "xgboost/base.h" +#include "../../collective/allgather.h" // for AllgatherV + #include "../../common/device_helpers.cuh" #if defined(XGBOOST_USE_FEDERATED) #include "../../../plugin/federated/federated_hist.h" // for FederataedHistPolicy @@ -361,6 +363,24 @@ void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor con this->p_impl_->Reset(ctx, feature_groups, force_global_memory); } +struct ReadMatrixFunction { + EllpackDeviceAccessor matrix; + int kCols; + bst_float* matrix_data_d; + ReadMatrixFunction(EllpackDeviceAccessor matrix, int kCols, bst_float* matrix_data_d) + : matrix(std::move(matrix)), kCols(kCols), matrix_data_d(matrix_data_d) {} + + __device__ void operator()(size_t global_idx) { + auto row = global_idx / kCols; + auto col = global_idx % kCols; + auto value = matrix.GetBinIndex(row, col); + if (isnan(value)) { + value = -1; + } + matrix_data_d[global_idx] = value; + } +}; + void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, @@ -378,105 +398,83 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); auto const &fed = dynamic_cast(comm); auto plugin = fed.EncryptionPlugin(); + // Transmit matrix to plugin if(!is_aggr_context_initialized_){ - std::cout << "Initialized Plugin Context" << std::endl; // Get cutptrs std::vector h_cuts_ptr(matrix.feature_segments.size()); dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); common::Span cutptrs = common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); - std::cout << "cutptrs.size() = " << h_cuts_ptr.size() << std::endl; - for (int i = 0; i < h_cuts_ptr.size(); i++) { - std::cout << h_cuts_ptr[i] << " "; - } - std::cout << std::endl; // Get bin_idx matrix - - - - //common::Span bin_idx - //********************************************* - //plugin->Reset(h_cuts_ptr, bin_idx); - //********************************************* + auto kRows = matrix.n_rows; + auto kCols = matrix.NumFeatures(); + std::vector h_bin_idx(kRows * kCols); + // Access GPU matrix data + thrust::device_vector matrix_d(kRows * kCols); + dh::LaunchN(kRows * kCols, ReadMatrixFunction(matrix, kCols, matrix_d.data().get())); + thrust::copy(matrix_d.begin(), matrix_d.end(), h_bin_idx.begin()); + common::Span bin_idx = common::Span(h_bin_idx.data(), h_bin_idx.size()); + + // Initialize plugin context + plugin->Reset(h_cuts_ptr, h_bin_idx); is_aggr_context_initialized_ = true; } - // get row indices from device std::vector h_ridx(ridx.size()); dh::CopyDeviceSpanToVector(&h_ridx, ridx); - - // wrap info following plugin expectations + // necessary conversions to fit plugin expectations + std::vector h_ridx_64(ridx.size()); + for (int i = 0; i < ridx.size(); i++) { + h_ridx_64[i] = h_ridx[i]; + } std::vector ptrs(1); std::vector sizes(1); std::vector nodes(1); - ptrs[0] = reinterpret_cast(h_ridx.data()); - sizes[0] = h_ridx.size(); + ptrs[0] = reinterpret_cast(h_ridx_64.data()); + sizes[0] = h_ridx_64.size(); nodes[0] = 0; // Transmit row indices to plugin and get encrypted histogram - std::cout << "Building encrypted histograms with row 
indices " << std::endl; - //********************************************* - //auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); -//********************************************* + auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); // Perform AllGather - std::cout << "Allgather histograms" << std::endl; - - //********************************************* - /* HostDeviceVector hist_entries; std::vector recv_segments; collective::SafeColl( - collective::AllgatherV(ctx_, linalg::MakeVec(hist_data_), &recv_segments, &hist_entries)); + collective::AllgatherV(ctx, linalg::MakeVec(hist_data), &recv_segments, &hist_entries)); // Call the plugin here to get the resulting histogram. Histogram from all workers are // gathered to the label owner. common::Span hist_aggr = - plugin_->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); -*/ - //********************************************* - - - - - + plugin->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); - // !!!Temporarily turn on regular histogram building for testing - // compute local histograms - this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); - - // Further histogram sync process - simulated with allreduce - // copy histogram data to host + // Post process the AllGathered data + auto world_size = collective::GetWorldSize(); std::vector host_histogram(histogram.size()); - dh::CopyDeviceSpanToVector(&host_histogram, histogram); - // convert to regular vector - std::vector host_histogram_64(histogram.size() * 2); - for (auto i = 0; i < host_histogram.size(); i++) { - host_histogram_64[i * 2] = host_histogram[i].GetQuantisedGrad(); - host_histogram_64[i * 2 + 1] = host_histogram[i].GetQuantisedHess(); - } - // aggregate histograms in float - auto rc = collective::Allreduce(ctx, &host_histogram_64, collective::Op::kSum); - SafeColl(rc); - // convert back to GradientPairInt64 - // only copy to Rank 0, clear other ranks to simulate the plugin scenario - for (auto i = 0; i < host_histogram.size(); i++) { - GradientPairInt64 hist_item(host_histogram_64[i * 2], host_histogram_64[i * 2 + 1]); - GradientPairInt64 hist_item_empty(0, 0); + for (auto i = 0; i < histogram.size(); i++) { + double grad = 0.0; + double hess = 0.0; + for (auto rank = 0; rank < world_size; ++rank) { + auto idx = rank * histogram.size() + i; + grad += hist_aggr[idx * 2]; + hess += hist_aggr[idx * 2 + 1]; + } + GradientPairPrecise hist_item(grad, hess); + GradientPairPrecise hist_item_empty(0.0, 0.0); if (collective::GetRank() != 0) { - hist_item = hist_item_empty; + host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); } else { - host_histogram[i] = hist_item; + host_histogram[i] = rounding.ToFixedPoint(hist_item); } } + // copy the aggregated histogram back to GPU memory // at this point, the histogram contains full information from all parties dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), - histogram.size() * sizeof(GradientPairPrecise), + histogram.size() * sizeof(GradientPairInt64), cudaMemcpyHostToDevice)); - } } } // namespace xgboost::tree From aa5b51b43308e953d7519c7173e3f23a1b99a8ec Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 12:09:21 -0400 Subject: [PATCH 19/26] fix code linting and test scripts --- src/tree/gpu_hist/evaluate_splits.cu | 6 +++-- src/tree/gpu_hist/histogram.cu | 21 ++++++++++------- tests/cpp/tree/gpu_hist/test_histogram.cu | 28 +++++++++++------------ tests/cpp/tree/test_gpu_hist.cu 
| 4 ++-- 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 3b5c1b76fa1c..e536fc0ac90b 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -406,7 +406,8 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector splits_out_storage(d_inputs.size()); auto out_splits = dh::ToSpan(splits_out_storage); - bool is_passive_party = is_column_split_ && collective::IsEncrypted() && collective::GetRank() != 0; + bool is_passive_party = is_column_split_ && collective::IsEncrypted() + && collective::GetRank() != 0; bool is_active_party = !is_passive_party; // Under secure vertical setting, only the active party is able to evaluate the split // based on global histogram. Other parties will receive the final best split information @@ -421,7 +422,8 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector all_candidate_storage(out_splits.size() * world_size); + dh::TemporaryArray all_candidate_storage( + out_splits.size() * world_size); auto all_candidates = dh::ToSpan(all_candidate_storage); auto current_rank = all_candidates.subspan(collective::GetRank() * out_splits.size(), out_splits.size()); diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index d87f5cc077ac..def1b3016af3 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -388,11 +388,12 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, common::Span ridx, common::Span histogram, GradientQuantiser rounding, MetaInfo const& info) { - - auto IsSecureVertical = !info.IsRowSplit() && collective::IsDistributed() && collective::IsEncrypted(); + auto IsSecureVertical = !info.IsRowSplit() && collective::IsDistributed() + && collective::IsEncrypted(); if (!IsSecureVertical) { // Regular training, build histogram locally - this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); + this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, + gpair, ridx, histogram, rounding); } else { // Encrypted vertical, build histogram using federated plugin auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); @@ -400,11 +401,12 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, auto plugin = fed.EncryptionPlugin(); // Transmit matrix to plugin - if(!is_aggr_context_initialized_){ + if (!is_aggr_context_initialized_) { // Get cutptrs std::vector h_cuts_ptr(matrix.feature_segments.size()); dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); - common::Span cutptrs = common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); + common::Span cutptrs = + common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); // Get bin_idx matrix auto kRows = matrix.n_rows; @@ -414,7 +416,8 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, thrust::device_vector matrix_d(kRows * kCols); dh::LaunchN(kRows * kCols, ReadMatrixFunction(matrix, kCols, matrix_d.data().get())); thrust::copy(matrix_d.begin(), matrix_d.end(), h_bin_idx.begin()); - common::Span bin_idx = common::Span(h_bin_idx.data(), h_bin_idx.size()); + common::Span bin_idx = + common::Span(h_bin_idx.data(), h_bin_idx.size()); // Initialize plugin context plugin->Reset(h_cuts_ptr, h_bin_idx); @@ -443,12 +446,14 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, HostDeviceVector hist_entries; std::vector recv_segments; collective::SafeColl( - 
collective::AllgatherV(ctx, linalg::MakeVec(hist_data), &recv_segments, &hist_entries)); + collective::AllgatherV(ctx, linalg::MakeVec(hist_data), + &recv_segments, &hist_entries)); // Call the plugin here to get the resulting histogram. Histogram from all workers are // gathered to the label owner. common::Span hist_aggr = - plugin->SyncEncryptedHistVert(common::RestoreType(hist_entries.HostSpan())); + plugin->SyncEncryptedHistVert( + common::RestoreType(hist_entries.HostSpan())); // Post process the AllGathered data auto world_size = collective::GetWorldSize(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index c9320f616983..f73fff58f05d 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -80,9 +80,9 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - d_histogram, quantiser); + d_histogram, quantiser, MetaInfo()); std::vector histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(histogram_h.data(), d_histogram.data(), @@ -95,9 +95,9 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - d_new_histogram, quantiser); + d_new_histogram, quantiser, MetaInfo()); std::vector new_histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(new_histogram_h.data(), d_new_histogram.data(), @@ -119,9 +119,9 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) dh::device_vector baseline(num_bins); DeviceHistogramBuilder builder; builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - dh::ToSpan(baseline), quantiser); + dh::ToSpan(baseline), quantiser, MetaInfo()); std::vector baseline_h(num_bins); dh::safe_cuda(cudaMemcpy(baseline_h.data(), baseline.data().get(), @@ -185,9 +185,9 @@ void TestGPUHistogramCategorical(size_t num_categories) { FeatureGroups single_group(page->Cuts()); DeviceHistogramBuilder builder; builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - dh::ToSpan(cat_hist), quantiser); + dh::ToSpan(cat_hist), quantiser, MetaInfo()); } /** @@ -201,9 +201,9 @@ void TestGPUHistogramCategorical(size_t num_categories) { FeatureGroups single_group(page->Cuts()); DeviceHistogramBuilder builder; builder.Reset(&ctx, 
single_group.DeviceAccessor(ctx.Device()), false); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, - dh::ToSpan(encode_hist), quantiser); + dh::ToSpan(encode_hist), quantiser, MetaInfo()); } std::vector h_cat_hist(cat_hist.size()); @@ -350,9 +350,9 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamDeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, impl->GetDeviceAccessor(ctx.Device()), fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, - d_histogram, quantiser); + d_histogram, quantiser, MetaInfo()); ++k; } ASSERT_EQ(k, n_batches); @@ -373,9 +373,9 @@ class HistogramExternalMemoryTest : public ::testing::TestWithParamDeviceAccessor(ctx.Device()), force_global); - builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page.GetDeviceAccessor(ctx.Device()), fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, - d_histogram, quantiser); + d_histogram, quantiser, MetaInfo()); } std::vector h_single(single_hist.size()); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index edd129353bdf..454d05d14df1 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -79,10 +79,10 @@ void TestBuildHist(bool use_shared_memory_histograms) { DeviceHistogramBuilder builder; builder.Reset(&ctx, maker.feature_groups->DeviceAccessor(ctx.Device()), !use_shared_memory_histograms); - builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + builder.BuildHistogram(&ctx, page->GetDeviceAccessor(ctx.Device()), maker.feature_groups->DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), maker.row_partitioner->GetRows(0), maker.hist.GetNodeHistogram(0), - *maker.quantiser); + *maker.quantiser, MetaInfo()); DeviceHistogramStorage<>& d_hist = maker.hist; From 7480ed339752eedc85d2d15b57086e39ea9b61df Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 15:01:11 -0400 Subject: [PATCH 20/26] wrap plugin calls into federated --- src/tree/gpu_hist/histogram.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index def1b3016af3..4aa69949f989 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -395,6 +395,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); } else { + #if defined(XGBOOST_USE_FEDERATED) // Encrypted vertical, build histogram using federated plugin auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); auto const &fed = dynamic_cast(comm); @@ -480,6 +481,9 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), histogram.size() * sizeof(GradientPairInt64), cudaMemcpyHostToDevice)); + #else + LOG(FATAL) << error::NoFederated(); + #endif } } } // namespace xgboost::tree From 4587b2ef6dc8ff7bb8d9dd5d59d372a4e2d9a23e Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 18:55:56 -0400 Subject: [PATCH 21/26] only rank 0 need histogram sync result --- src/tree/gpu_hist/histogram.cu | 49 ++++++++++++++++++---------------- 1 file changed, 26 
insertions(+), 23 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 4aa69949f989..e33b30c8bfac 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -451,36 +451,39 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, &recv_segments, &hist_entries)); // Call the plugin here to get the resulting histogram. Histogram from all workers are - // gathered to the label owner. + // gathered to the label owner. common::Span hist_aggr = plugin->SyncEncryptedHistVert( common::RestoreType(hist_entries.HostSpan())); // Post process the AllGathered data - auto world_size = collective::GetWorldSize(); - std::vector host_histogram(histogram.size()); - for (auto i = 0; i < histogram.size(); i++) { - double grad = 0.0; - double hess = 0.0; - for (auto rank = 0; rank < world_size; ++rank) { - auto idx = rank * histogram.size() + i; - grad += hist_aggr[idx * 2]; - hess += hist_aggr[idx * 2 + 1]; + // This is only needed by Rank 0 + if (collective::GetRank() == 0) { + auto world_size = collective::GetWorldSize(); + std::vector host_histogram(histogram.size()); + for (auto i = 0; i < histogram.size(); i++) { + double grad = 0.0; + double hess = 0.0; + for (auto rank = 0; rank < world_size; ++rank) { + auto idx = rank * histogram.size() + i; + grad += hist_aggr[idx * 2]; + hess += hist_aggr[idx * 2 + 1]; + } + GradientPairPrecise hist_item(grad, hess); + GradientPairPrecise hist_item_empty(0.0, 0.0); + if (collective::GetRank() != 0) { + host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); + } else { + host_histogram[i] = rounding.ToFixedPoint(hist_item); + } } - GradientPairPrecise hist_item(grad, hess); - GradientPairPrecise hist_item_empty(0.0, 0.0); - if (collective::GetRank() != 0) { - host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); - } else { - host_histogram[i] = rounding.ToFixedPoint(hist_item); - } - } - // copy the aggregated histogram back to GPU memory - // at this point, the histogram contains full information from all parties - dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), - histogram.size() * sizeof(GradientPairInt64), - cudaMemcpyHostToDevice)); + // copy the aggregated histogram back to GPU memory + // at this point, the histogram contains full information from all parties + dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), + histogram.size() * sizeof(GradientPairInt64), + cudaMemcpyHostToDevice)); + } #else LOG(FATAL) << error::NoFederated(); #endif From ad213148ef944db5b74d3ed2a03d11e854ee9a78 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 19:01:55 -0400 Subject: [PATCH 22/26] Update histogram.cu --- src/tree/gpu_hist/histogram.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index e33b30c8bfac..68427d21322c 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -451,7 +451,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, &recv_segments, &hist_entries)); // Call the plugin here to get the resulting histogram. Histogram from all workers are - // gathered to the label owner. 
+ // gathered to the label owner common::Span hist_aggr = plugin->SyncEncryptedHistVert( common::RestoreType(hist_entries.HostSpan())); From 397ade38d2906e4c39a88ce8927d6fbaed801d74 Mon Sep 17 00:00:00 2001 From: Zhihong Zhang Date: Thu, 1 Aug 2024 19:06:36 -0400 Subject: [PATCH 23/26] Added check for passive when sync histo for vertical and removed some nested blocks --- src/tree/gpu_hist/histogram.cu | 181 +++++++++++++++++---------------- 1 file changed, 92 insertions(+), 89 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 68427d21322c..93678496cda1 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -394,99 +394,102 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, // Regular training, build histogram locally this->p_impl_->BuildHistogram(ctx->CUDACtx(), matrix, feature_groups, gpair, ridx, histogram, rounding); - } else { + return; + } #if defined(XGBOOST_USE_FEDERATED) - // Encrypted vertical, build histogram using federated plugin - auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); - auto const &fed = dynamic_cast(comm); - auto plugin = fed.EncryptionPlugin(); - - // Transmit matrix to plugin - if (!is_aggr_context_initialized_) { - // Get cutptrs - std::vector h_cuts_ptr(matrix.feature_segments.size()); - dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); - common::Span cutptrs = - common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); - - // Get bin_idx matrix - auto kRows = matrix.n_rows; - auto kCols = matrix.NumFeatures(); - std::vector h_bin_idx(kRows * kCols); - // Access GPU matrix data - thrust::device_vector matrix_d(kRows * kCols); - dh::LaunchN(kRows * kCols, ReadMatrixFunction(matrix, kCols, matrix_d.data().get())); - thrust::copy(matrix_d.begin(), matrix_d.end(), h_bin_idx.begin()); - common::Span bin_idx = - common::Span(h_bin_idx.data(), h_bin_idx.size()); - - // Initialize plugin context - plugin->Reset(h_cuts_ptr, h_bin_idx); - is_aggr_context_initialized_ = true; - } + // Encrypted vertical, build histogram using federated plugin + auto const &comm = collective::GlobalCommGroup()->Ctx(ctx, DeviceOrd::CPU()); + auto const &fed = dynamic_cast(comm); + auto plugin = fed.EncryptionPlugin(); + + // Transmit matrix to plugin + if (!is_aggr_context_initialized_) { + // Get cutptrs + std::vector h_cuts_ptr(matrix.feature_segments.size()); + dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); + common::Span cutptrs = + common::Span(h_cuts_ptr.data(), h_cuts_ptr.size()); + + // Get bin_idx matrix + auto kRows = matrix.n_rows; + auto kCols = matrix.NumFeatures(); + std::vector h_bin_idx(kRows * kCols); + // Access GPU matrix data + thrust::device_vector matrix_d(kRows * kCols); + dh::LaunchN(kRows * kCols, ReadMatrixFunction(matrix, kCols, matrix_d.data().get())); + thrust::copy(matrix_d.begin(), matrix_d.end(), h_bin_idx.begin()); + common::Span bin_idx = + common::Span(h_bin_idx.data(), h_bin_idx.size()); + + // Initialize plugin context + plugin->Reset(h_cuts_ptr, h_bin_idx); + is_aggr_context_initialized_ = true; + } - // get row indices from device - std::vector h_ridx(ridx.size()); - dh::CopyDeviceSpanToVector(&h_ridx, ridx); - // necessary conversions to fit plugin expectations - std::vector h_ridx_64(ridx.size()); - for (int i = 0; i < ridx.size(); i++) { - h_ridx_64[i] = h_ridx[i]; - } - std::vector ptrs(1); - std::vector sizes(1); - std::vector nodes(1); - ptrs[0] = reinterpret_cast(h_ridx_64.data()); - sizes[0] = 
h_ridx_64.size(); - nodes[0] = 0; - - // Transmit row indices to plugin and get encrypted histogram - auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); - - // Perform AllGather - HostDeviceVector hist_entries; - std::vector recv_segments; - collective::SafeColl( - collective::AllgatherV(ctx, linalg::MakeVec(hist_data), - &recv_segments, &hist_entries)); - - // Call the plugin here to get the resulting histogram. Histogram from all workers are - // gathered to the label owner - common::Span hist_aggr = - plugin->SyncEncryptedHistVert( - common::RestoreType(hist_entries.HostSpan())); - - // Post process the AllGathered data - // This is only needed by Rank 0 - if (collective::GetRank() == 0) { - auto world_size = collective::GetWorldSize(); - std::vector host_histogram(histogram.size()); - for (auto i = 0; i < histogram.size(); i++) { - double grad = 0.0; - double hess = 0.0; - for (auto rank = 0; rank < world_size; ++rank) { - auto idx = rank * histogram.size() + i; - grad += hist_aggr[idx * 2]; - hess += hist_aggr[idx * 2 + 1]; - } - GradientPairPrecise hist_item(grad, hess); - GradientPairPrecise hist_item_empty(0.0, 0.0); - if (collective::GetRank() != 0) { - host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); - } else { - host_histogram[i] = rounding.ToFixedPoint(hist_item); - } - } + // get row indices from device + std::vector h_ridx(ridx.size()); + dh::CopyDeviceSpanToVector(&h_ridx, ridx); + // necessary conversions to fit plugin expectations + std::vector h_ridx_64(ridx.size()); + for (int i = 0; i < ridx.size(); i++) { + h_ridx_64[i] = h_ridx[i]; + } + std::vector ptrs(1); + std::vector sizes(1); + std::vector nodes(1); + ptrs[0] = reinterpret_cast(h_ridx_64.data()); + sizes[0] = h_ridx_64.size(); + nodes[0] = 0; + + // Transmit row indices to plugin and get encrypted histogram + auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes); + + // Perform AllGather + HostDeviceVector hist_entries; + std::vector recv_segments; + collective::SafeColl( + collective::AllgatherV(ctx, linalg::MakeVec(hist_data), + &recv_segments, &hist_entries)); + + if (collective::GetRank() != 0) { + // This is only needed for lable owner + return; + } - // copy the aggregated histogram back to GPU memory - // at this point, the histogram contains full information from all parties - dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), - histogram.size() * sizeof(GradientPairInt64), - cudaMemcpyHostToDevice)); + // Call the plugin here to get the resulting histogram. Histogram from all workers are + // gathered to the label owner. 
+ common::Span hist_aggr = + plugin->SyncEncryptedHistVert( + common::RestoreType(hist_entries.HostSpan())); + + // Post process the AllGathered data + auto world_size = collective::GetWorldSize(); + std::vector host_histogram(histogram.size()); + for (auto i = 0; i < histogram.size(); i++) { + double grad = 0.0; + double hess = 0.0; + for (auto rank = 0; rank < world_size; ++rank) { + auto idx = rank * histogram.size() + i; + grad += hist_aggr[idx * 2]; + hess += hist_aggr[idx * 2 + 1]; + } + GradientPairPrecise hist_item(grad, hess); + GradientPairPrecise hist_item_empty(0.0, 0.0); + if (collective::GetRank() != 0) { + host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); + } else { + host_histogram[i] = rounding.ToFixedPoint(hist_item); } - #else - LOG(FATAL) << error::NoFederated(); - #endif } + + // copy the aggregated histogram back to GPU memory + // at this point, the histogram contains full information from all parties + dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(), + histogram.size() * sizeof(GradientPairInt64), + cudaMemcpyHostToDevice)); +#else + LOG(FATAL) << error::NoFederated(); +#endif + } } // namespace xgboost::tree From 61d08217ac70c8802ecebe2d010b42d0306cce29 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Thu, 1 Aug 2024 19:43:42 -0400 Subject: [PATCH 24/26] Code clean --- src/tree/gpu_hist/histogram.cu | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 93678496cda1..3f608f1c4f64 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -452,11 +452,11 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, &recv_segments, &hist_entries)); if (collective::GetRank() != 0) { - // This is only needed for lable owner + // Below is only needed for lable owner return; } - // Call the plugin here to get the resulting histogram. Histogram from all workers are + // Call the plugin to get the resulting histogram. Histogram from all workers are // gathered to the label owner. 
common::Span hist_aggr = plugin->SyncEncryptedHistVert( @@ -474,12 +474,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, hess += hist_aggr[idx * 2 + 1]; } GradientPairPrecise hist_item(grad, hess); - GradientPairPrecise hist_item_empty(0.0, 0.0); - if (collective::GetRank() != 0) { - host_histogram[i] = rounding.ToFixedPoint(hist_item_empty); - } else { - host_histogram[i] = rounding.ToFixedPoint(hist_item); - } + host_histogram[i] = rounding.ToFixedPoint(hist_item); } // copy the aggregated histogram back to GPU memory @@ -490,6 +485,5 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, #else LOG(FATAL) << error::NoFederated(); #endif - } } // namespace xgboost::tree From ba825213f56f78226c47003e25741b30f0b84db2 Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 9 Aug 2024 10:53:25 -0400 Subject: [PATCH 25/26] updates for PR checks --- src/tree/gpu_hist/histogram.cu | 16 ++++++++-------- src/tree/gpu_hist/histogram.cuh | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 3f608f1c4f64..2b52f03a1dbf 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -365,16 +365,16 @@ void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor con struct ReadMatrixFunction { EllpackDeviceAccessor matrix; - int kCols; + int k_cols; bst_float* matrix_data_d; - ReadMatrixFunction(EllpackDeviceAccessor matrix, int kCols, bst_float* matrix_data_d) - : matrix(std::move(matrix)), kCols(kCols), matrix_data_d(matrix_data_d) {} + ReadMatrixFunction(EllpackDeviceAccessor matrix, int k_cols, bst_float* matrix_data_d) + : matrix(std::move(matrix)), k_cols(k_cols), matrix_data_d(matrix_data_d) {} __device__ void operator()(size_t global_idx) { - auto row = global_idx / kCols; - auto col = global_idx % kCols; + auto row = global_idx / k_cols; + auto col = global_idx % k_cols; auto value = matrix.GetBinIndex(row, col); - if (isnan(value)) { + if (isnan(float(value))) { value = -1; } matrix_data_d[global_idx] = value; @@ -403,7 +403,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, auto plugin = fed.EncryptionPlugin(); // Transmit matrix to plugin - if (!is_aggr_context_initialized_) { + if (!is_aggr_context_initialized) { // Get cutptrs std::vector h_cuts_ptr(matrix.feature_segments.size()); dh::CopyDeviceSpanToVector(&h_cuts_ptr, matrix.feature_segments); @@ -423,7 +423,7 @@ void DeviceHistogramBuilder::BuildHistogram(Context const* ctx, // Initialize plugin context plugin->Reset(h_cuts_ptr, h_bin_idx); - is_aggr_context_initialized_ = true; + is_aggr_context_initialized = true; } // get row indices from device diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 08ba9be856fc..aedcacbb92d8 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -176,7 +176,7 @@ class DeviceHistogramBuilder { DeviceHistogramBuilder(); ~DeviceHistogramBuilder(); // Whether to secure aggregation context has been initialized - bool is_aggr_context_initialized_{false}; + bool is_aggr_context_initialized{false}; void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, bool force_global_memory); void BuildHistogram(Context const* ctx, EllpackDeviceAccessor const& matrix, From 45b4e2ebc49342891fe3db02d3f2ae53a08929ed Mon Sep 17 00:00:00 2001 From: Ziyue Xu Date: Fri, 9 Aug 2024 11:03:22 -0400 Subject: [PATCH 26/26] updates for PR checks --- src/tree/gpu_hist/histogram.cu | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 2b52f03a1dbf..073faa6f5203 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -374,7 +374,7 @@ struct ReadMatrixFunction { auto row = global_idx / k_cols; auto col = global_idx % k_cols; auto value = matrix.GetBinIndex(row, col); - if (isnan(float(value))) { + if (isnan(static_cast<float>(value))) { value = -1; } matrix_data_d[global_idx] = value;
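
Taken together, [PATCH 18/26] through [PATCH 26/26] converge on a fairly small control flow for the secure vertical path in DeviceHistogramBuilder::BuildHistogram. The sketch below condenses that flow for readers following the series; it is illustrative rather than a drop-in replacement for the actual hunks: the wrapper name and parameter list are invented for the example, template arguments stripped from the patch text (HostDeviceVector<std::int8_t>, common::Span<double>, RestoreType<double>) are filled in from context, and the plugin Reset/bin-index upload, the XGBOOST_USE_FEDERATED guard, and the row-index marshalling are omitted. It assumes the same headers the patches add (allgather.h, federated_hist.h, device_helpers.cuh).

// Sketch only: condensed from the hunks above; assumes the federated plugin
// interface used in this series, not a definitive implementation.
template <typename Plugin>
void SecureVerticalHistSketch(Context const* ctx, Plugin* plugin,
                              common::Span<std::uint64_t const*> ptrs,
                              common::Span<std::size_t const> sizes,
                              common::Span<bst_node_t const> nodes,
                              common::Span<GradientPairInt64> histogram,
                              GradientQuantiser const& rounding) {
  // 1. Each party builds its encrypted histogram slice through the plugin.
  auto hist_data = plugin->BuildEncryptedHistVert(ptrs, sizes, nodes);

  // 2. Encrypted buffers from all workers are gathered with AllgatherV.
  HostDeviceVector<std::int8_t> hist_entries;
  std::vector<std::int64_t> recv_segments;
  collective::SafeColl(
      collective::AllgatherV(ctx, linalg::MakeVec(hist_data), &recv_segments, &hist_entries));

  // 3. Only the label owner (rank 0) decrypts and post-processes the result.
  if (collective::GetRank() != 0) {
    return;
  }
  common::Span<double> hist_aggr =
      plugin->SyncEncryptedHistVert(common::RestoreType<double>(hist_entries.HostSpan()));

  // 4. Accumulate the per-rank (grad, hess) pairs, re-quantise, and copy the
  //    full histogram back to GPU memory.
  auto world_size = collective::GetWorldSize();
  std::vector<GradientPairInt64> host_histogram(histogram.size());
  for (std::size_t i = 0; i < histogram.size(); ++i) {
    double grad = 0.0, hess = 0.0;
    for (std::int32_t rank = 0; rank < world_size; ++rank) {
      auto idx = rank * histogram.size() + i;
      grad += hist_aggr[idx * 2];
      hess += hist_aggr[idx * 2 + 1];
    }
    host_histogram[i] = rounding.ToFixedPoint(GradientPairPrecise{grad, hess});
  }
  dh::safe_cuda(cudaMemcpyAsync(histogram.data(), host_histogram.data(),
                                histogram.size() * sizeof(GradientPairInt64),
                                cudaMemcpyHostToDevice));
}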