Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support hist in the partition builder under column split #9120

Merged
merged 4 commits into from
May 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 40 additions & 10 deletions src/common/partition_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,14 +183,28 @@ class PartitionBuilder {
SetNRightElems(node_in_set, range.begin(), n_right);
}

/**
 * @brief Scan the given rows of a single column and record the outcome in the bit vectors.
 *
 * For every row in `row_indices` the bin id is read from the column at the offset
 * `row_id - base_rowid`. When `any_missing` is enabled and the bin id equals
 * `ColumnType::kMissingId`, the matching bit of `missing_bits` is set; otherwise the
 * matching bit of `decision_bits` is set whenever `pred(row_id, bin_id)` holds.
 *
 * @tparam any_missing   Whether the missing-value check is performed at all.
 * @tparam ColumnType    Column accessor indexable by row offset, exposing `kMissingId`.
 * @tparam Predicate     Callable taking (row id, bin id) and returning something bool-like.
 *
 * @param p_column       Column to read bin ids from.
 * @param row_indices    Row ids to visit, in the given order.
 * @param base_rowid     Value subtracted from each row id to get the column/bit offset.
 * @param decision_bits  Receives one bit per row for which the predicate holds.
 * @param missing_bits   Receives one bit per row whose bin id is the missing marker.
 * @param pred           Predicate evaluated only for rows that are not flagged missing.
 */
template <bool any_missing, typename ColumnType, typename Predicate>
void MaskKernel(ColumnType* p_column, common::Span<const size_t> row_indices, size_t base_rowid,
                BitVector* decision_bits, BitVector* missing_bits, Predicate&& pred) {
  for (auto const ridx : row_indices) {
    // The column and both bit vectors share the same offset.
    auto const offset = ridx - base_rowid;
    auto const bin = (*p_column)[offset];

    bool const is_missing = any_missing && bin == ColumnType::kMissingId;
    if (is_missing) {
      missing_bits->Set(offset);
      continue;
    }
    if (pred(ridx, bin)) {
      decision_bits->Set(offset);
    }
  }
}

/**
* @brief When data is split by column, we don't have all the features locally on the current
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made
* to go right, or if the feature value used for the split is missing.
*/
template <typename ExpandEntry>
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::Range1d range, bst_bin_t split_cond, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
Expand All @@ -204,7 +218,7 @@ class PartitionBuilder {
for (auto row_id : rid_span) {
auto gidx = gmat.GetGindex(row_id, fid);
if (gidx > -1) {
bool go_left = false;
bool go_left;
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
Expand All @@ -218,7 +232,27 @@ class PartitionBuilder {
}
}
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
auto pred_hist = [&](auto ridx, auto bin_id) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logically, is there any difference from

auto pred_hist = [&](auto ridx, auto bin_id) {
?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, they are the same, but since it's a closure, it can't be easily reused. Maybe we should extract it into a standalone function?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The partitioner is becoming less and less manageable now. I don't have any good suggestions on how to refactor it at the moment and can approve the PR as-is. But would be great if there's a way to simplify/modularize it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be helpful to separate out the column-split logic into its own class?

Copy link
Member

@trivialfis trivialfis May 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe next time. Currently, we have a total of 8 combinations for tree_method, categorical, and column-split; along with these, we also need to take multi-output into account. After these combinations, we have type dispatching for histogram bins and dense/sparse/dense-with-missing. So ... how many cases are there?

I'm trying to think of a way to make the code easier to understand and write a test suite specifically for partition builder.

if (any_cat && is_cat) {
auto gidx = gmat.GetGindex(ridx, fid);
CHECK_GT(gidx, -1);
return Decision(node_cats, cut_values[gidx]);
} else {
return bin_id <= split_cond;
}
};

if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
pred_hist);
} else {
CHECK_EQ(any_missing, true);
auto column =
column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
pred_hist);
}
}
}

Expand All @@ -238,7 +272,7 @@ class PartitionBuilder {
std::size_t nid = nodes[node_in_set].nid;
bool default_left = tree[nid].DefaultLeft();

auto pred_approx = [&](auto ridx) {
auto pred = [&](auto ridx) {
bool go_left = default_left;
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
if (!is_missing) {
Expand All @@ -248,11 +282,7 @@ class PartitionBuilder {
};

std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
}
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred);

const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;
Expand Down
15 changes: 9 additions & 6 deletions src/tree/common_row_partitioner.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,19 +38,21 @@ class ColumnSplitHelper {
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
}

template <typename ExpandEntry>
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes,
std::vector<int32_t> const& split_conditions, RegTree const* p_tree) {
// When data is split by column, we don't have all the feature values in the local worker, so
// we first collect all the decisions and whether the feature is missing into bit vectors.
std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
const int32_t nid = nodes[node_in_set].nid;
partition_builder_->MaskRows(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
(*row_set_collection_)[nid].begin, &decision_bits_,
&missing_bits_);
bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
partition_builder_->MaskRows<BinIdxType, any_missing, any_cat>(
node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
(*row_set_collection_)[nid].begin, &decision_bits_, &missing_bits_);
});

// Then aggregate the bit vectors across all the workers.
Expand Down Expand Up @@ -217,7 +219,8 @@ class CommonRowPartitioner {
// 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
// Store results in intermediate buffers from partition_builder_
if (is_col_split_) {
column_split_helper_.Partition(space, ctx->Threads(), gmat, column_matrix, nodes, p_tree);
column_split_helper_.Partition<BinIdxType, any_missing, any_cat>(
space, ctx->Threads(), gmat, column_matrix, nodes, split_conditions, p_tree);
} else {
common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
size_t begin = r.begin();
Expand Down
111 changes: 111 additions & 0 deletions tests/cpp/tree/test_quantile_hist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include "xgboost/data.h"

namespace xgboost::tree {

namespace {
template <typename ExpandEntry>
void TestPartitioner(bst_target_t n_targets) {
std::size_t n_samples = 1024, base_rowid = 0;
Expand Down Expand Up @@ -86,8 +88,117 @@ void TestPartitioner(bst_target_t n_targets) {
}
}
}
} // anonymous namespace

// Runs TestPartitioner with CPUExpandEntry candidates and n_targets = 1.
TEST(QuantileHist, Partitioner) { TestPartitioner<CPUExpandEntry>(1); }

// Runs TestPartitioner with MultiExpandEntry candidates and n_targets = 3.
TEST(QuantileHist, MultiPartitioner) { TestPartitioner<MultiExpandEntry>(3); }

namespace {

/**
 * @brief Worker-side body of the column-split partitioner test.
 *
 * Slices `Xy` by column using the collective world size and rank, sketches cuts on that
 * slice, and then runs two checks per batch:
 *   1. After a split at `min_value`, node 1 must be empty and node 2 must hold all
 *      `n_samples` rows.
 *   2. After a split at `mid_value`, the left and right children must contain exactly the
 *      same row ids, in the same order, as `expected_mid_partitioner`.
 *
 * @tparam ExpandEntry  CPUExpandEntry selects GetSplit; any other type selects
 *                      GetMultiSplitForTest.
 *
 * @param n_targets                 Number of targets used to construct the tree.
 * @param n_samples                 Number of rows handed to the partitioner.
 * @param n_features                Number of features used to construct the tree.
 * @param base_rowid                Base row id handed to the partitioner.
 * @param Xy                        Full data set; each worker takes its own column slice.
 * @param min_value                 Split value for the "everything on one side" check.
 * @param mid_value                 Split value for the comparison against the reference.
 * @param expected_mid_partitioner  Reference partitioner already updated with `mid_value`.
 */
template <typename ExpandEntry>
void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples,
                                  bst_feature_t n_features, size_t base_rowid,
                                  std::shared_ptr<DMatrix> Xy, float min_value, float mid_value,
                                  CommonRowPartitioner const& expected_mid_partitioner) {
  auto dmat =
      std::unique_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};

  Context ctx;
  ctx.InitAllowUnknown(Args{});

  std::vector<ExpandEntry> candidates{{0, 0}};
  candidates.front().split.loss_chg = 0.4;
  auto cuts = common::SketchOnDMatrix(&ctx, dmat.get(), 64);

  // NOTE(review): the batches are taken from the full `Xy` while the cuts come from the
  // column slice `dmat` -- confirm this is intended rather than `dmat->GetBatches`.
  for (auto const& page : Xy->GetBatches<SparsePage>()) {
    GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
    common::ColumnMatrix column_indices;
    column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
    {
      // Split at the minimum value: the left child stays empty.
      RegTree tree{n_targets, n_features};
      // Last constructor argument is presumably the column-split flag -- TODO confirm.
      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
      if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
        GetSplit(&tree, min_value, &candidates);
      } else {
        GetMultiSplitForTest(&tree, min_value, &candidates);
      }
      partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
      ASSERT_EQ(partitioner.Size(), 3);
      ASSERT_EQ(partitioner[1].Size(), 0);
      ASSERT_EQ(partitioner[2].Size(), n_samples);
    }
    {
      // Split at the middle value: both children must match the reference partitioner.
      RegTree tree{n_targets, n_features};
      CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
      if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
        GetSplit(&tree, mid_value, &candidates);
      } else {
        GetMultiSplitForTest(&tree, mid_value, &candidates);
      }
      auto left_nidx = tree.LeftChild(RegTree::kRoot);
      partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);

      auto elem = partitioner[left_nidx];
      ASSERT_LT(elem.Size(), n_samples);
      ASSERT_GT(elem.Size(), 1);
      auto expected_elem = expected_mid_partitioner[left_nidx];
      // Sizes are checked first so the lockstep walk below stays within both ranges.
      ASSERT_EQ(elem.Size(), expected_elem.Size());
      for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
        ASSERT_EQ(*it, *eit);
      }

      auto right_nidx = tree.RightChild(RegTree::kRoot);
      elem = partitioner[right_nidx];
      expected_elem = expected_mid_partitioner[right_nidx];
      ASSERT_EQ(elem.Size(), expected_elem.Size());
      for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
        ASSERT_EQ(*it, *eit);
      }
    }
  }
}

/**
 * @brief Driver for the column-split partitioner test.
 *
 * Generates a random data set of 1024 rows and 16 features, builds a reference
 * partitioner by splitting feature 0 at a middle cut value, and then runs
 * VerifyColumnSplitPartitioner on 4 in-memory workers, which compare their results
 * against that reference.
 *
 * @tparam ExpandEntry  CPUExpandEntry selects GetSplit; any other type selects
 *                      GetMultiSplitForTest.
 * @param n_targets     Number of targets used to construct the tree.
 */
template <typename ExpandEntry>
void TestColumnSplitPartitioner(bst_target_t n_targets) {
  std::size_t n_samples = 1024, base_rowid = 0;
  bst_feature_t n_features = 16;
  auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
  std::vector<ExpandEntry> candidates{{0, 0}};
  candidates.front().split.loss_chg = 0.4;

  Context ctx;
  ctx.InitAllowUnknown(Args{});
  auto cuts = common::SketchOnDMatrix(&ctx, Xy.get(), 64);

  // Value-initialized so the reads below are well defined even if the batch loop
  // never executes; the loop overwrites both for every batch it sees.
  float min_value{0.0f};
  float mid_value{0.0f};
  // Reference partitioner; the last constructor argument differs from the one used by the
  // workers (false here, true there) -- presumably the column-split flag, TODO confirm.
  CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false};
  for (auto const& page : Xy->GetBatches<SparsePage>()) {
    GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
    bst_feature_t const split_ind = 0;
    common::ColumnMatrix column_indices;
    column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
    min_value = gmat.cut.MinValues()[split_ind];

    // Use the cut value at half of the pointer that follows the split feature.
    auto ptr = gmat.cut.Ptrs()[split_ind + 1];
    mid_value = gmat.cut.Values().at(ptr / 2);
    RegTree tree{n_targets, n_features};
    if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
      GetSplit(&tree, mid_value, &candidates);
    } else {
      GetMultiSplitForTest(&tree, mid_value, &candidates);
    }
    mid_partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
  }

  auto constexpr kWorkers = 4;
  RunWithInMemoryCommunicator(kWorkers, VerifyColumnSplitPartitioner<ExpandEntry>, n_targets,
                              n_samples, n_features, base_rowid, Xy, min_value, mid_value,
                              mid_partitioner);
}
} // anonymous namespace

// Runs TestColumnSplitPartitioner with CPUExpandEntry candidates and n_targets = 1.
TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner<CPUExpandEntry>(1); }

// Runs TestColumnSplitPartitioner with MultiExpandEntry candidates and n_targets = 3.
TEST(QuantileHist, MultiPartitionerColSplit) { TestColumnSplitPartitioner<MultiExpandEntry>(3); }
} // namespace xgboost::tree