From 76102284d1997ef8b876cb40ae7c3e44b6ffbb66 Mon Sep 17 00:00:00 2001 From: remcob-gr <39093316+remcob-gr@users.noreply.github.com> Date: Thu, 4 Apr 2019 03:35:11 +0100 Subject: [PATCH] Add Cost Effective Gradient Boosting (#2014) * Add configuration parameters for CEGB. * Add skeleton CEGB tree learner Like the original CEGB version, this inherits from SerialTreeLearner. Currently, it changes nothing from the original. * Track features used in CEGB tree learner. * Pull CEGB tradeoff and coupled feature penalty from config. * Implement finding best splits for CEGB This is heavily based on the serial version, but just adds using the coupled penalties. * Set proper defaults for cegb parameters. * Ensure sanity checks don't switch off CEGB. * Implement per-data-point feature penalties in CEGB. * Implement split penalty and remove unused parameters. * Merge changes from CEGB tree learner into serial tree learner * Represent features_used_in_data by a bitset, to reduce the memory overhead of CEGB, and add sanity checks for the lengths of the penalty vectors. * Fix bug where CEGB would incorrectly penalise a previously used feature The tree learner did not update the gains of previously computed leaf splits when splitting a leaf elsewhere in the tree. This caused it to prefer new features due to incorrectly penalising splitting on previously used features. * Document CEGB parameters and add them to the appropriate section. * Remove leftover reference to cegb tree learner. * Remove outdated diff. * Fix warnings * Fix minor issues identified by @StrikerRUS. * Add docs section on CEGB, including citation. * Fix link. * Fix CI failure. * Add some unit tests * Fix pylint issues. * Fix remaining pylint issue --- docs/Advanced-Topics.rst | 13 +++++ docs/Parameters.rst | 20 +++++++ include/LightGBM/config.h | 20 +++++++ include/LightGBM/utils/common.h | 16 ++++++ src/io/config_auto.cpp | 22 ++++++++ src/treelearner/serial_tree_learner.cpp | 70 +++++++++++++++++++++++++ src/treelearner/serial_tree_learner.h | 8 +++ tests/python_package_test/test_basic.py | 65 +++++++++++++++++++++++ 8 files changed, 234 insertions(+) diff --git a/docs/Advanced-Topics.rst b/docs/Advanced-Topics.rst index 33fbcecc34a0..85d0e3432120 100644 --- a/docs/Advanced-Topics.rst +++ b/docs/Advanced-Topics.rst @@ -41,6 +41,19 @@ LambdaRank - Use ``max_position`` to set the NDCG optimization position. +Cost Efficient Gradient Boosting +-------------------------------- + +`Cost Efficient Gradient Boosting `_ (CEGB) makes it possible to penalise boosting based on the cost of obtaining feature values. +CEGB penalises learning in the following ways: + +- Each time a tree is split, a penalty of ``cegb_penalty_split`` is applied. +- When a feature is used for the first time, ``cegb_penalty_feature_coupled`` is applied. This penalty can be different for each feature and should be specified as one ``double`` per feature. +- When a feature is used for the first time for a data row, ``cegb_penalty_feature_lazy`` is applied. Like ``cegb_penalty_feature_coupled``, this penalty is specified as one ``double`` per feature. + +Each of the penalties above is scaled by ``cegb_tradeoff``. +Using this parameter, it is possible to change the overall strength of the CEGB penalties by changing only one parameter. 
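(Editor's illustration, not part of the patch: a minimal Python sketch of how the ``cegb_*`` parameters described above might be passed to training once this change is in. The parameter names are the ones introduced by this patch; the data and penalty values are made up purely for illustration.)::

    import numpy as np
    import lightgbm as lgb

    X = np.random.random((500, 3))
    y = np.random.random(500)
    train_set = lgb.Dataset(X, label=y)

    params = {
        'objective': 'regression',
        # Global multiplier applied to every CEGB penalty below.
        'cegb_tradeoff': 0.5,
        # Flat cost charged for every split.
        'cegb_penalty_split': 0.01,
        # One cost per feature, charged the first time the feature is used anywhere in the model.
        'cegb_penalty_feature_coupled': [1.0, 5.0, 20.0],
        # One cost per feature, charged per data row the first time that row uses the feature.
        'cegb_penalty_feature_lazy': [0.0, 0.1, 0.5],
    }
    booster = lgb.train(params, train_set, num_boost_round=10)
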
+ Parameters Tuning ----------------- diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 0f5cd3e804f8..d6dad2fca209 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -374,6 +374,26 @@ Learning Control Parameters - used only in ``refit`` task in CLI version or as argument in ``refit`` function in language-specific package +- ``cegb_tradeoff`` :raw-html:`🔗︎`, default = ``1.0``, type = double, constraints: ``cegb_tradeoff >= 0.0`` + + - cost-effective gradient boosting multiplier for all penalties + +- ``cegb_penalty_split`` :raw-html:`🔗︎`, default = ``0.0``, type = double, constraints: ``cegb_penalty_split >= 0.0`` + + - cost-effective gradient-boosting penalty for splitting a node + +- ``cegb_penalty_feature_lazy`` :raw-html:`🔗︎`, default = ``0,0,...,0``, type = multi-double + + - cost-effective gradient boosting penalty for using a feature + + - applied per data point + +- ``cegb_penalty_feature_coupled`` :raw-html:`🔗︎`, default = ``0,0,...,0``, type = multi-double + + - cost-effective gradient boosting penalty for using a feature + + - applied once per forest + IO Parameters ------------- diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 4b6d76a850e9..d080113fdd50 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -377,6 +377,26 @@ struct Config { // desc = used only in ``refit`` task in CLI version or as argument in ``refit`` function in language-specific package double refit_decay_rate = 0.9; + // check = >=0.0 + // desc = cost-effective gradient boosting multiplier for all penalties + double cegb_tradeoff = 1.0; + + // check = >=0.0 + // desc = cost-effective gradient-boosting penalty for splitting a node + double cegb_penalty_split = 0.0; + + // type = multi-double + // default = 0,0,...,0 + // desc = cost-effective gradient boosting penalty for using a feature + // desc = applied per data point + std::vector cegb_penalty_feature_lazy; + + // type = multi-double + // default = 0,0,...,0 + // desc = cost-effective gradient boosting penalty for using a feature + // desc = applied once per forest + std::vector cegb_penalty_feature_coupled; + #pragma endregion #pragma region IO Parameters diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 41e38f045da7..356348094b81 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -808,6 +808,22 @@ inline static void ObtainMinMaxSum(const T1 *w, int nw, T1 *mi, T1 *ma, T2 *su) } } +inline static std::vector EmptyBitset(int n){ + int size = n / 32; + if(n % 32 != 0) size++; + return std::vector(size); +} + +template +inline static void InsertBitset(std::vector& vec, const T val){ + int i1 = val / 32; + int i2 = val % 32; + if (static_cast(vec.size()) < i1 + 1) { + vec.resize(i1 + 1, 0); + } + vec[i1] |= (1 << i2); +} + template inline static std::vector ConstructBitset(const T* vals, int n) { std::vector ret; diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 5992ecead555..b75b3f5ad737 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -197,6 +197,10 @@ std::unordered_set Config::parameter_set({ "feature_contri", "forcedsplits_filename", "refit_decay_rate", + "cegb_tradeoff", + "cegb_penalty_split", + "cegb_penalty_feature_lazy", + "cegb_penalty_feature_coupled", "verbosity", "max_bin", "min_data_in_bin", @@ -369,6 +373,20 @@ void Config::GetMembersFromString(const std::unordered_map=0.0); CHECK(refit_decay_rate <=1.0); + GetDouble(params, "cegb_tradeoff", &cegb_tradeoff); + 
CHECK(cegb_tradeoff >=0.0); + + GetDouble(params, "cegb_penalty_split", &cegb_penalty_split); + CHECK(cegb_penalty_split >=0.0); + + if (GetString(params, "cegb_penalty_feature_lazy", &tmp_str)) { + cegb_penalty_feature_lazy = Common::StringToArray(tmp_str, ','); + } + + if (GetString(params, "cegb_penalty_feature_coupled", &tmp_str)) { + cegb_penalty_feature_coupled = Common::StringToArray(tmp_str, ','); + } + GetInt(params, "verbosity", &verbosity); GetInt(params, "max_bin", &max_bin); @@ -554,6 +572,10 @@ std::string Config::SaveMembersToString() const { str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n"; str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n"; str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n"; + str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n"; + str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; + str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n"; + str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n"; str_buf << "[verbosity: " << verbosity << "]\n"; str_buf << "[max_bin: " << max_bin << "]\n"; str_buf << "[min_data_in_bin: " << min_data_in_bin << "]\n"; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 6f0955e94d2d..65c63ab28a73 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -64,6 +65,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves); // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); + splits_per_leaf_.resize(config_->num_leaves*train_data_->num_features()); // get ordered bin train_data_->CreateOrderedBins(&ordered_bins_); @@ -98,6 +100,16 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian } } Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_); + feature_used.clear(); + feature_used.resize(train_data->num_features()); + + if(!config_->cegb_penalty_feature_coupled.empty()){ + CHECK(config_->cegb_penalty_feature_coupled.size() == static_cast(train_data_->num_total_features())); + } + if(!config_->cegb_penalty_feature_lazy.empty()){ + CHECK(config_->cegb_penalty_feature_lazy.size() == static_cast(train_data_->num_total_features())); + feature_used_in_data = Common::EmptyBitset(train_data->num_features() * num_data_); + } } void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { @@ -469,6 +481,28 @@ void SerialTreeLearner::ConstructHistograms(const std::vector& is_featur #endif } +double SerialTreeLearner::CalculateOndemandCosts(int feature_index, int leaf_index) { + if (config_->cegb_penalty_feature_lazy.empty()) + return 0.0f; + + double penalty = config_->cegb_penalty_feature_lazy[feature_index]; + + const int inner_fidx = train_data_->InnerFeatureIndex(feature_index); + + double total = 0.0f; + data_size_t cnt_leaf_data = 0; + auto tmp_idx = data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data); + + for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) { + int real_idx = tmp_idx[i_input]; + if (Common::FindInBitset(feature_used_in_data.data(), train_data_->num_data()*train_data_->num_features(), train_data_->num_data() * inner_fidx + real_idx)) + continue; + 
total += penalty; + } + + return total; +} + void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) { #ifdef TIMETAG auto start_time = std::chrono::steady_clock::now(); @@ -496,6 +530,14 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& smaller_leaf_splits_->max_constraint(), &smaller_split); smaller_split.feature = real_fidx; + smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * smaller_leaf_splits_->num_data_in_leaf(); + if(!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]){ + smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx]; + } + if(!config_->cegb_penalty_feature_lazy.empty()){ + smaller_split.gain -= config_->cegb_tradeoff * CalculateOndemandCosts(real_fidx, smaller_leaf_splits_->LeafIndex()); + } + splits_per_leaf_[smaller_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = smaller_split; if (smaller_split > smaller_best[tid]) { smaller_best[tid] = smaller_split; } @@ -519,6 +561,14 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& larger_leaf_splits_->max_constraint(), &larger_split); larger_split.feature = real_fidx; + larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * larger_leaf_splits_->num_data_in_leaf(); + if(!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]){ + larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx]; + } + if(!config_->cegb_penalty_feature_lazy.empty()){ + larger_split.gain -= config_->cegb_tradeoff*CalculateOndemandCosts(real_fidx, larger_leaf_splits_->LeafIndex()); + } + splits_per_leaf_[larger_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = larger_split; if (larger_split > larger_best[tid]) { larger_best[tid] = larger_split; } @@ -703,6 +753,26 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) { const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); + if(!config_->cegb_penalty_feature_coupled.empty() && !feature_used[inner_feature_index]){ + feature_used[inner_feature_index] = true; + for(int i = 0; i < tree->num_leaves(); ++i){ + if(i == best_leaf) continue; + auto split = &splits_per_leaf_[i*train_data_->num_features() + inner_feature_index]; + split->gain += config_->cegb_tradeoff*config_->cegb_penalty_feature_coupled[best_split_info.feature]; + if(*split > best_split_per_leaf_[i]) + best_split_per_leaf_[i] = *split; + } + } + + if(!config_->cegb_penalty_feature_lazy.empty()){ + data_size_t cnt_leaf_data = 0; + auto tmp_idx = data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data); + for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) { + int real_idx = tmp_idx[i_input]; + Common::InsertBitset(feature_used_in_data, train_data_->num_data() * inner_feature_index + real_idx); + } + } + // left = parent *left_leaf = best_leaf; bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 2d1c7a728988..90facbb06a36 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -112,6 +112,9 @@ class 
SerialTreeLearner: public TreeLearner { * \return The number of data in the leaf_idx leaf */ inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const; + + double CalculateOndemandCosts(int feature_index, int leaf_index); + /*! \brief number of data */ data_size_t num_data_; /*! \brief number of features */ @@ -137,6 +140,8 @@ class SerialTreeLearner: public TreeLearner { /*! \brief store best split points for all leaves */ std::vector best_split_per_leaf_; + /*! \brief store best split per feature for all leaves */ + std::vector splits_per_leaf_; /*! \brief stores best thresholds for all feature for smaller leaf */ std::unique_ptr smaller_leaf_splits_; @@ -169,6 +174,9 @@ class SerialTreeLearner: public TreeLearner { int num_threads_; std::vector ordered_bin_indices_; bool is_constant_hessian_; + + std::vector feature_used; + std::vector feature_used_in_data; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index c86a64b41d22..6981385c0dc5 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -216,3 +216,68 @@ def test_add_features_monotone_types(self): self.assertIsNone(actual) else: np.testing.assert_array_equal(actual, expected) + + def test_cegb_affects_behavior(self): + X = np.random.random((1000, 5)) + X[:, [1, 3]] = 0 + y = np.random.random(1000) + names = ['col_%d' % i for i in range(5)] + ds = lgb.Dataset(X, feature_name=names).construct() + ds.set_label(y) + base = lgb.Booster(train_set=ds) + for k in range(10): + base.update() + with tempfile.NamedTemporaryFile() as f: + basename = f.name + base.save_model(basename) + with open(basename, 'rt') as f: + basetxt = f.read() + # Set extremely harsh penalties, so CEGB will block most splits. + cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]}, + {'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]}, + {'cegb_penalty_split': 1}] + for case in cases: + booster = lgb.Booster(train_set=ds, params=case) + for k in range(10): + booster.update() + with tempfile.NamedTemporaryFile() as f: + casename = f.name + booster.save_model(casename) + with open(casename, 'rt') as f: + casetxt = f.read() + self.assertNotEqual(basetxt, casetxt) + + def test_cegb_scaling_equalities(self): + X = np.random.random((1000, 5)) + X[:, [1, 3]] = 0 + y = np.random.random(1000) + names = ['col_%d' % i for i in range(5)] + ds = lgb.Dataset(X, feature_name=names).construct() + ds.set_label(y) + # Compare pairs of penalties, to ensure scaling works as intended + pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]}, + {'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}), + ({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]}, + {'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}), + ({'cegb_penalty_split': 1}, + {'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})] + for (p1, p2) in pairs: + booster1 = lgb.Booster(train_set=ds, params=p1) + booster2 = lgb.Booster(train_set=ds, params=p2) + for k in range(10): + booster1.update() + booster2.update() + with tempfile.NamedTemporaryFile() as f: + p1name = f.name + # Reset booster1's parameters to p2, so the parameter section of the file matches. 
+ booster1.reset_parameter(p2) + booster1.save_model(p1name) + with open(p1name, 'rt') as f: + p1txt = f.read() + with tempfile.NamedTemporaryFile() as f: + p2name = f.name + booster2.save_model(p2name) + self.maxDiff = None + with open(p2name, 'rt') as f: + p2txt = f.read() + self.assertEqual(p1txt, p2txt)
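
(Editor's note: as a rough summary of the tree-learner changes above, the penalised split gain can be thought of as sketched below. This is an illustrative Python rendering only; the function and variable names are invented here and do not correspond to the actual C++ identifiers in serial_tree_learner.cpp.)

    def penalised_gain(raw_gain, n_data_in_leaf, feature, rows_in_leaf,
                       feature_used_in_forest, row_has_used_feature,
                       tradeoff, penalty_split, penalty_coupled, penalty_lazy):
        # Mirrors the gain adjustments made in FindBestSplitsFromHistograms.
        gain = raw_gain
        # Split penalty: a flat cost per split, scaled by the rows reaching the leaf.
        gain -= tradeoff * penalty_split * n_data_in_leaf
        # Coupled penalty: charged only if the feature has never been used in the forest.
        if penalty_coupled and not feature_used_in_forest[feature]:
            gain -= tradeoff * penalty_coupled[feature]
        # Lazy penalty: charged once per row in the leaf that has not used the feature before.
        if penalty_lazy:
            unseen = sum(1 for r in rows_in_leaf if not row_has_used_feature[feature][r])
            gain -= tradeoff * penalty_lazy[feature] * unseen
        return gain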