From 5c506acb39bcb75cf7098085fc3d9d643abd7f8d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 29 Feb 2024 15:57:13 +0800 Subject: [PATCH 1/2] Disable column sample by node for the exact tree method. The exact tree method grows by layers of nodes. --- doc/parameter.rst | 2 +- src/tree/updater_colmaker.cc | 18 +++++++++--------- tests/python/test_updaters.py | 18 ++++++++++++++++-- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index 7898bb363549..e5cb13abfe7a 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -118,7 +118,7 @@ Parameters for Tree Booster - All ``colsample_by*`` parameters have a range of (0, 1], the default value of 1, and specify the fraction of columns to be subsampled. - ``colsample_bytree`` is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed. - ``colsample_bylevel`` is the subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree. - - ``colsample_bynode`` is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. + - ``colsample_bynode`` is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. This is not supported by the exact tree method. - ``colsample_by*`` parameters work cumulatively. 
For instance, the combination ``{'colsample_bytree':0.5, 'colsample_bylevel':0.5, 'colsample_bynode':0.5}`` with 64 features will leave 8 features to choose from at diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index ef166fae5132..45018da17adc 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -106,6 +106,9 @@ class ColMaker: public TreeUpdater { if (dmat->Info().HasCategorical()) { LOG(FATAL) << error::NoCategorical("Updater `grow_colmaker` or `exact` tree method"); } + if (param->colsample_bynode - 1.0 != 0.0) { + LOG(FATAL) << "column sample by node is not yet supported by the exact tree method"; + } this->LazyGetColumnDensity(dmat); // rescale learning rate according to size of trees interaction_constraints_.Configure(*param, dmat->Info().num_row_); @@ -440,9 +443,8 @@ class ColMaker: public TreeUpdater { } // update the solution candidate - virtual void UpdateSolution(const SortedCSCPage &batch, - const std::vector &feat_set, - const std::vector &gpair, DMatrix *) { + void UpdateSolution(SortedCSCPage const &batch, const std::vector &feat_set, + const std::vector &gpair) { // start enumeration const auto num_features = feat_set.size(); CHECK(this->ctx_); @@ -466,17 +468,15 @@ class ColMaker: public TreeUpdater { } }); } + // find splits at current level, do split per level - inline void FindSplit(int depth, - const std::vector &qexpand, - const std::vector &gpair, - DMatrix *p_fmat, - RegTree *p_tree) { + void FindSplit(bst_node_t depth, const std::vector &qexpand, + std::vector const &gpair, DMatrix *p_fmat, RegTree *p_tree) { auto evaluator = tree_evaluator_.GetEvaluator(); auto feat_set = column_sampler_->GetFeatureSet(depth); for (const auto &batch : p_fmat->GetBatches(ctx_)) { - this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat); + this->UpdateSolution(batch, feat_set->HostVector(), gpair); } // after this each thread's stemp will get the best candidates, aggregate results 
this->SyncBestSolution(qexpand); diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index e7641348d98e..8ec1fdd9d395 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -35,10 +35,24 @@ class TestTreeMethod: def test_exact(self, param, num_rounds, dataset): if dataset.name.endswith("-l1"): return - param['tree_method'] = 'exact' + param["tree_method"] = "exact" param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), num_rounds) - assert tm.non_increasing(result['train'][dataset.metric]) + assert tm.non_increasing(result["train"][dataset.metric]) + + def test_exact_sample_by_node_error(self) -> None: + X, y, w = tm.make_regression(128, 12, False) + with pytest.raises(ValueError, match="column sample by node"): + xgb.train( + {"tree_method": "exact", "colsample_bynode": 0.999}, + xgb.DMatrix(X, y, weight=w), + ) + + xgb.train( + {"tree_method": "exact", "colsample_bynode": 1.0}, + xgb.DMatrix(X, y, weight=w), + num_boost_round=2, + ) @given( exact_parameter_strategy, From 77ac76fef73aed4f6f7e349c8c8d77dea771bae4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 29 Feb 2024 16:21:18 +0800 Subject: [PATCH 2/2] disable feature weight test. --- R-package/tests/testthat/test_feature_weights.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/tests/testthat/test_feature_weights.R b/R-package/tests/testthat/test_feature_weights.R index 4ed78c9b6cfe..54fec67cfcf5 100644 --- a/R-package/tests/testthat/test_feature_weights.R +++ b/R-package/tests/testthat/test_feature_weights.R @@ -25,7 +25,7 @@ test_that("training with feature weights works", { expect_lt(importance[1, Frequency], importance[9, Frequency]) } - for (tm in c("hist", "approx", "exact")) { + for (tm in c("hist", "approx")) { test(tm) } })