From c9fc9e345fd7a4728a40c4c6f01429d9e566e99c Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 3 May 2023 15:47:45 -0700
Subject: [PATCH 1/2] Support column split in partition builder

---
 src/common/partition_builder.h       |  49 +++++++++---
 src/tree/common_row_partitioner.h    |  15 ++--
 tests/cpp/tree/test_quantile_hist.cc | 111 +++++++++++++++++++++++++++
 3 files changed, 160 insertions(+), 15 deletions(-)

diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h
index e5e6971e5c33..b5c169641ad9 100644
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -183,19 +183,34 @@ class PartitionBuilder {
     SetNRightElems(node_in_set, range.begin(), n_right);
   }

+  template <bool any_missing, typename ColumnType, typename Predicate>
+  void MaskKernel(ColumnType* p_column, common::Span<const size_t> row_indices, size_t base_rowid,
+                  BitVector* decision_bits, BitVector* missing_bits, Predicate&& pred) {
+    auto& column = *p_column;
+    for (auto row_id : row_indices) {
+      const int32_t bin_id = column[row_id - base_rowid];
+      if (any_missing && bin_id == ColumnType::kMissingId) {
+        missing_bits->Set(row_id - base_rowid);
+      } else if (pred(row_id, bin_id)) {
+        decision_bits->Set(row_id - base_rowid);
+      }
+    }
+  }
+
   /**
    * @brief When data is split by column, we don't have all the features locally on the current
    * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
    * to go right, or if the feature value used for the split is missing.
    */
-  template <typename ExpandEntry>
+  template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
   void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
-                const common::Range1d range, GHistIndexMatrix const& gmat,
+                const common::Range1d range, bst_bin_t split_cond, GHistIndexMatrix const& gmat,
                 const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
                 BitVector* decision_bits, BitVector* missing_bits) {
     common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
     std::size_t nid = nodes[node_in_set].nid;
     bst_feature_t fid = tree[nid].SplitIndex();
+    bool default_left = tree.DefaultLeft(nid);
     bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
     auto node_cats = tree.NodeCats(nid);
     auto const& cut_values = gmat.cut.Values();
@@ -218,7 +233,27 @@ class PartitionBuilder {
         }
       }
     } else {
-      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+      auto pred_hist = [&](auto ridx, auto bin_id) {
+        if (any_cat && is_cat) {
+          auto gidx = gmat.GetGindex(ridx, fid);
+          CHECK_GT(gidx, -1);
+          return Decision(node_cats, cut_values[gidx]);
+        } else {
+          return bin_id <= split_cond;
+        }
+      };
+
+      if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
+        auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
+        MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
+                                pred_hist);
+      } else {
+        CHECK_EQ(any_missing, true);
+        auto column =
+            column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
+        MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
+                                pred_hist);
+      }
     }
   }

@@ -238,7 +273,7 @@
     std::size_t nid = nodes[node_in_set].nid;
     bool default_left = tree[nid].DefaultLeft();

-    auto pred_approx = [&](auto ridx) {
+    auto pred = [&](auto ridx) {
       bool go_left = default_left;
       bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
       if (!is_missing) {
@@ -248,11 +283,7 @@
     };

     std::pair<size_t, size_t> child_nodes_sizes;
-    if (!column_matrix.IsInitialized()) {
-      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
-    } else {
-      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
-    }
+    child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred);

     const size_t n_left = child_nodes_sizes.first;
     const size_t n_right = child_nodes_sizes.second;
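For intuition, the masking pass added in partition_builder.h boils down to the sketch below. It is a simplified stand-in rather than the XGBoost API: std::vector<std::uint8_t> replaces BitVector, a kMissing sentinel replaces ColumnType::kMissingId, and only the numerical branch of pred_hist (bin_id <= split_cond) is shown.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    constexpr std::int32_t kMissing = -1;  // stand-in for ColumnType::kMissingId

    // Only the worker that owns the split feature records bits; per row it stores
    // either the split decision or the fact that the feature value is missing.
    void MaskRowsSketch(const std::vector<std::int32_t>& bin_ids, std::int32_t split_cond,
                        std::vector<std::uint8_t>* decision, std::vector<std::uint8_t>* missing) {
      for (std::size_t row = 0; row < bin_ids.size(); ++row) {
        if (bin_ids[row] == kMissing) {
          (*missing)[row] = 1;                 // feature value absent for this row
        } else if (bin_ids[row] <= split_cond) {
          (*decision)[row] = 1;                // numerical split: bin index <= split condition
        }
      }
    }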
diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h
index ba69d8921fe7..ef12d0ccc05a 100644
--- a/src/tree/common_row_partitioner.h
+++ b/src/tree/common_row_partitioner.h
@@ -38,19 +38,21 @@ class ColumnSplitHelper {
     missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
   }

-  template <typename ExpandEntry>
+  template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
   void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
                  GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
-                 std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
+                 std::vector<ExpandEntry> const& nodes,
+                 std::vector<int32_t> const& split_conditions, RegTree const* p_tree) {
     // When data is split by column, we don't have all the feature values in the local worker, so
     // we first collect all the decisions and whether the feature is missing into bit vectors.
     std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
     std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
     common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
       const int32_t nid = nodes[node_in_set].nid;
-      partition_builder_->MaskRows(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
-                                   (*row_set_collection_)[nid].begin, &decision_bits_,
-                                   &missing_bits_);
+      bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
+      partition_builder_->MaskRows<BinIdxType, any_missing, any_cat>(
+          node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
+          (*row_set_collection_)[nid].begin, &decision_bits_, &missing_bits_);
     });

     // Then aggregate the bit vectors across all the workers.
@@ -217,7 +219,8 @@ class CommonRowPartitioner {
     // 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
     // Store results in intermediate buffers from partition_builder_
     if (is_col_split_) {
-      column_split_helper_.Partition(space, ctx->Threads(), gmat, column_matrix, nodes, p_tree);
+      column_split_helper_.Partition<BinIdxType, any_missing, any_cat>(
+          space, ctx->Threads(), gmat, column_matrix, nodes, split_conditions, p_tree);
     } else {
       common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
         size_t begin = r.begin();
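The comment "Then aggregate the bit vectors across all the workers" is the other half of the protocol: each worker contributes a partial decision/missing bit vector (only the worker holding the split feature sets any bits), the vectors are OR-combined across workers, and every row is then routed the way PartitionByMask's predicate does. The sketch below illustrates that flow under these assumptions; plain std::uint32_t words stand in for the BitVector storage, and the actual collective call is outside the hunks shown here.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // OR-combine the bit-vector storage words produced by every worker; workers that
    // do not hold the split feature contribute all-zero words, so the OR recovers the
    // complete decision/missing information on each worker.
    void CombineBitWords(const std::vector<std::vector<std::uint32_t>>& per_worker,
                         std::vector<std::uint32_t>* combined) {
      for (const auto& words : per_worker) {
        for (std::size_t i = 0; i < words.size(); ++i) {
          (*combined)[i] |= words[i];
        }
      }
    }

    // Mirrors the `pred` lambda in PartitionByMask: a present value follows the
    // recorded decision, a missing value follows the node's default direction.
    bool GoLeft(bool decision_bit, bool missing_bit, bool default_left) {
      return missing_bit ? default_left : decision_bit;
    }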
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index e5ce75585c30..9b9278021efa 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -19,6 +19,8 @@
 #include "xgboost/data.h"

 namespace xgboost::tree {
+
+namespace {
 template <typename ExpandEntry>
 void TestPartitioner(bst_target_t n_targets) {
   std::size_t n_samples = 1024, base_rowid = 0;
@@ -86,8 +88,117 @@ void TestPartitioner(bst_target_t n_targets) {
     }
   }
 }
+}  // anonymous namespace

 TEST(QuantileHist, Partitioner) { TestPartitioner<CPUExpandEntry>(1); }

 TEST(QuantileHist, MultiPartitioner) { TestPartitioner<MultiExpandEntry>(3); }
+
+namespace {
+
+template <typename ExpandEntry>
+void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples,
+                                  bst_feature_t n_features, size_t base_rowid,
+                                  std::shared_ptr<DMatrix> Xy, float min_value, float mid_value,
+                                  CommonRowPartitioner const& expected_mid_partitioner) {
+  auto dmat =
+      std::unique_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+
+  Context ctx;
+  ctx.InitAllowUnknown(Args{});
+
+  std::vector<ExpandEntry> candidates{{0, 0}};
+  candidates.front().split.loss_chg = 0.4;
+  auto cuts = common::SketchOnDMatrix(&ctx, dmat.get(), 64);
+
+  for (auto const& page : Xy->GetBatches<SparsePage>()) {
+    GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
+    bst_feature_t const split_ind = 0;
+    common::ColumnMatrix column_indices;
+    column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
+    {
+      RegTree tree{n_targets, n_features};
+      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
+      if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
+        GetSplit(&tree, min_value, &candidates);
+      } else {
+        GetMultiSplitForTest(&tree, min_value, &candidates);
+      }
+      partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree);
+      ASSERT_EQ(partitioner.Size(), 3);
+      ASSERT_EQ(partitioner[1].Size(), 0);
+      ASSERT_EQ(partitioner[2].Size(), n_samples);
+    }
+    {
+      RegTree tree{n_targets, n_features};
+      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
+      if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
+        GetSplit(&tree, mid_value, &candidates);
+      } else {
+        GetMultiSplitForTest(&tree, mid_value, &candidates);
+      }
+      auto left_nidx = tree.LeftChild(RegTree::kRoot);
+      partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree);
+
+      auto elem = partitioner[left_nidx];
+      ASSERT_LT(elem.Size(), n_samples);
+      ASSERT_GT(elem.Size(), 1);
+      auto expected_elem = expected_mid_partitioner[left_nidx];
+      ASSERT_EQ(elem.Size(), expected_elem.Size());
+      for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
+        ASSERT_EQ(*it, *eit);
+      }
+
+      auto right_nidx = tree.RightChild(RegTree::kRoot);
+      elem = partitioner[right_nidx];
+      expected_elem = expected_mid_partitioner[right_nidx];
+      ASSERT_EQ(elem.Size(), expected_elem.Size());
+      for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
+        ASSERT_EQ(*it, *eit);
+      }
+    }
+  }
+}
+
+template <typename ExpandEntry>
+void TestColumnSplitPartitioner(bst_target_t n_targets) {
+  std::size_t n_samples = 1024, base_rowid = 0;
+  bst_feature_t n_features = 16;
+  auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
+  std::vector<ExpandEntry> candidates{{0, 0}};
+  candidates.front().split.loss_chg = 0.4;
+
+  Context ctx;
+  ctx.InitAllowUnknown(Args{});
+  auto cuts = common::SketchOnDMatrix(&ctx, Xy.get(), 64);
+
+  float min_value, mid_value;
+  CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false};
+  for (auto const& page : Xy->GetBatches<SparsePage>()) {
+    GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
+    bst_feature_t const split_ind = 0;
+    common::ColumnMatrix column_indices;
+    column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
+    min_value = gmat.cut.MinValues()[split_ind];
+
+    auto ptr = gmat.cut.Ptrs()[split_ind + 1];
+    mid_value = gmat.cut.Values().at(ptr / 2);
+    RegTree tree{n_targets, n_features};
+    if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
+      GetSplit(&tree, mid_value, &candidates);
+    } else {
+      GetMultiSplitForTest(&tree, mid_value, &candidates);
+    }
+    mid_partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree);
+  }
+
+  auto constexpr kWorkers = 4;
+  RunWithInMemoryCommunicator(kWorkers, VerifyColumnSplitPartitioner<ExpandEntry>, n_targets,
+                              n_samples, n_features, base_rowid, Xy, min_value, mid_value, mid_partitioner);
+}
+}  // anonymous namespace
+
+TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner<CPUExpandEntry>(1); }
+
+TEST(QuantileHist, MultiPartitionerColSplit) { TestColumnSplitPartitioner<MultiExpandEntry>(3); }
 }  // namespace xgboost::tree
From aa2f37a15629b669e8c28177e94ce3b63880b027 Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 3 May 2023 15:58:30 -0700
Subject: [PATCH 2/2] minor cleanup

---
 src/common/partition_builder.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h
index b5c169641ad9..cdc8aa193124 100644
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -187,8 +187,8 @@ class PartitionBuilder {
   void MaskKernel(ColumnType* p_column, common::Span<const size_t> row_indices, size_t base_rowid,
                   BitVector* decision_bits, BitVector* missing_bits, Predicate&& pred) {
     auto& column = *p_column;
-    for (auto row_id : row_indices) {
-      const int32_t bin_id = column[row_id - base_rowid];
+    for (auto const row_id : row_indices) {
+      auto const bin_id = column[row_id - base_rowid];
       if (any_missing && bin_id == ColumnType::kMissingId) {
         missing_bits->Set(row_id - base_rowid);
       } else if (pred(row_id, bin_id)) {
@@ -210,7 +210,6 @@ class PartitionBuilder {
     common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
     std::size_t nid = nodes[node_in_set].nid;
     bst_feature_t fid = tree[nid].SplitIndex();
-    bool default_left = tree.DefaultLeft(nid);
     bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
     auto node_cats = tree.NodeCats(nid);
     auto const& cut_values = gmat.cut.Values();
@@ -219,7 +218,7 @@ class PartitionBuilder {
       for (auto row_id : rid_span) {
         auto gidx = gmat.GetGindex(row_id, fid);
         if (gidx > -1) {
-          bool go_left = false;
+          bool go_left;
           if (is_cat) {
             go_left = Decision(node_cats, cut_values[gidx]);
           } else {