From 76102284d1997ef8b876cb40ae7c3e44b6ffbb66 Mon Sep 17 00:00:00 2001 From: remcob-gr <39093316+remcob-gr@users.noreply.github.com> Date: Thu, 4 Apr 2019 03:35:11 +0100 Subject: [PATCH] Add Cost Effective Gradient Boosting (#2014) * Add configuration parameters for CEGB. * Add skeleton CEGB tree learner Like the original CEGB version, this inherits from SerialTreeLearner. Currently, it changes nothing from the original. * Track features used in CEGB tree learner. * Pull CEGB tradeoff and coupled feature penalty from config. * Implement finding best splits for CEGB This is heavily based on the serial version, but just adds using the coupled penalties. * Set proper defaults for cegb parameters. * Ensure sanity checks don't switch off CEGB. * Implement per-data-point feature penalties in CEGB. * Implement split penalty and remove unused parameters. * Merge changes from CEGB tree learner into serial tree learner * Represent features_used_in_data by a bitset, to reduce the memory overhead of CEGB, and add sanity checks for the lengths of the penalty vectors. * Fix bug where CEGB would incorrectly penalise a previously used feature The tree learner did not update the gains of previously computed leaf splits when splitting a leaf elsewhere in the tree. This caused it to prefer new features due to incorrectly penalising splitting on previously used features. * Document CEGB parameters and add them to the appropriate section. * Remove leftover reference to cegb tree learner. * Remove outdated diff. * Fix warnings * Fix minor issues identified by @StrikerRUS. * Add docs section on CEGB, including citation. * Fix link. * Fix CI failure. * Add some unit tests * Fix pylint issues. * Fix remaining pylint issue --- docs/Advanced-Topics.rst | 13 +++++ docs/Parameters.rst | 20 +++++++ include/LightGBM/config.h | 20 +++++++ include/LightGBM/utils/common.h | 16 ++++++ src/io/config_auto.cpp | 22 ++++++++ src/treelearner/serial_tree_learner.cpp | 70 +++++++++++++++++++++++++ src/treelearner/serial_tree_learner.h | 8 +++ tests/python_package_test/test_basic.py | 65 +++++++++++++++++++++++ 8 files changed, 234 insertions(+) diff --git a/docs/Advanced-Topics.rst b/docs/Advanced-Topics.rst index 33fbcecc34a0..85d0e3432120 100644 --- a/docs/Advanced-Topics.rst +++ b/docs/Advanced-Topics.rst @@ -41,6 +41,19 @@ LambdaRank - Use ``max_position`` to set the NDCG optimization position. +Cost Efficient Gradient Boosting +-------------------------------- + +`Cost Efficient Gradient Boosting `_ (CEGB) makes it possible to penalise boosting based on the cost of obtaining feature values. +CEGB penalises learning in the following ways: + +- Each time a tree is split, a penalty of ``cegb_penalty_split`` is applied. +- When a feature is used for the first time, ``cegb_penalty_feature_coupled`` is applied. This penalty can be different for each feature and should be specified as one ``double`` per feature. +- When a feature is used for the first time for a data row, ``cegb_penalty_feature_lazy`` is applied. Like ``cegb_penalty_feature_coupled``, this penalty is specified as one ``double`` per feature. + +Each of the penalties above is scaled by ``cegb_tradeoff``. +Using this parameter, it is possible to change the overall strength of the CEGB penalties by changing only one parameter. 
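(Editor's illustration, not part of the patch: a minimal Python sketch of how the ``cegb_*`` parameters described above might be passed to training once this change is in. The parameter names are the ones introduced by this patch; the data and penalty values are made up purely for illustration.)::

    import numpy as np
    import lightgbm as lgb

    X = np.random.random((500, 3))
    y = np.random.random(500)
    train_set = lgb.Dataset(X, label=y)

    params = {
        'objective': 'regression',
        # Global multiplier applied to every CEGB penalty below.
        'cegb_tradeoff': 0.5,
        # Flat cost charged for every split.
        'cegb_penalty_split': 0.01,
        # One cost per feature, charged the first time the feature is used anywhere in the model.
        'cegb_penalty_feature_coupled': [1.0, 5.0, 20.0],
        # One cost per feature, charged per data row the first time that row uses the feature.
        'cegb_penalty_feature_lazy': [0.0, 0.1, 0.5],
    }
    booster = lgb.train(params, train_set, num_boost_round=10)
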
+ Parameters Tuning ----------------- diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 0f5cd3e804f8..d6dad2fca209 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -374,6 +374,26 @@ Learning Control Parameters - used only in ``refit`` task in CLI version or as argument in ``refit`` function in language-specific package +- ``cegb_tradeoff`` :raw-html:`🔗︎`, default = ``1.0``, type = double, constraints: ``cegb_tradeoff >= 0.0`` + + - cost-effective gradient boosting multiplier for all penalties + +- ``cegb_penalty_split`` :raw-html:`🔗︎`, default = ``0.0``, type = double, constraints: ``cegb_penalty_split >= 0.0`` + + - cost-effective gradient-boosting penalty for splitting a node + +- ``cegb_penalty_feature_lazy`` :raw-html:`🔗︎`, default = ``0,0,...,0``, type = multi-double + + - cost-effective gradient boosting penalty for using a feature + + - applied per data point + +- ``cegb_penalty_feature_coupled`` :raw-html:`🔗︎`, default = ``0,0,...,0``, type = multi-double + + - cost-effective gradient boosting penalty for using a feature + + - applied once per forest + IO Parameters ------------- diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 4b6d76a850e9..d080113fdd50 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -377,6 +377,26 @@ struct Config { // desc = used only in ``refit`` task in CLI version or as argument in ``refit`` function in language-specific package double refit_decay_rate = 0.9; + // check = >=0.0 + // desc = cost-effective gradient boosting multiplier for all penalties + double cegb_tradeoff = 1.0; + + // check = >=0.0 + // desc = cost-effective gradient-boosting penalty for splitting a node + double cegb_penalty_split = 0.0; + + // type = multi-double + // default = 0,0,...,0 + // desc = cost-effective gradient boosting penalty for using a feature + // desc = applied per data point + std::vector cegb_penalty_feature_lazy; + + // type = multi-double + // default = 0,0,...,0 + // desc = cost-effective gradient boosting penalty for using a feature + // desc = applied once per forest + std::vector cegb_penalty_feature_coupled; + #pragma endregion #pragma region IO Parameters diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 41e38f045da7..356348094b81 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -808,6 +808,22 @@ inline static void ObtainMinMaxSum(const T1 *w, int nw, T1 *mi, T1 *ma, T2 *su) } } +inline static std::vector EmptyBitset(int n){ + int size = n / 32; + if(n % 32 != 0) size++; + return std::vector(size); +} + +template +inline static void InsertBitset(std::vector& vec, const T val){ + int i1 = val / 32; + int i2 = val % 32; + if (static_cast(vec.size()) < i1 + 1) { + vec.resize(i1 + 1, 0); + } + vec[i1] |= (1 << i2); +} + template inline static std::vector ConstructBitset(const T* vals, int n) { std::vector ret; diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 5992ecead555..b75b3f5ad737 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -197,6 +197,10 @@ std::unordered_set Config::parameter_set({ "feature_contri", "forcedsplits_filename", "refit_decay_rate", + "cegb_tradeoff", + "cegb_penalty_split", + "cegb_penalty_feature_lazy", + "cegb_penalty_feature_coupled", "verbosity", "max_bin", "min_data_in_bin", @@ -369,6 +373,20 @@ void Config::GetMembersFromString(const std::unordered_map=0.0); CHECK(refit_decay_rate <=1.0); + GetDouble(params, "cegb_tradeoff", &cegb_tradeoff); + 
CHECK(cegb_tradeoff >=0.0); + + GetDouble(params, "cegb_penalty_split", &cegb_penalty_split); + CHECK(cegb_penalty_split >=0.0); + + if (GetString(params, "cegb_penalty_feature_lazy", &tmp_str)) { + cegb_penalty_feature_lazy = Common::StringToArray(tmp_str, ','); + } + + if (GetString(params, "cegb_penalty_feature_coupled", &tmp_str)) { + cegb_penalty_feature_coupled = Common::StringToArray(tmp_str, ','); + } + GetInt(params, "verbosity", &verbosity); GetInt(params, "max_bin", &max_bin); @@ -554,6 +572,10 @@ std::string Config::SaveMembersToString() const { str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n"; str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n"; str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n"; + str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n"; + str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; + str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n"; + str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n"; str_buf << "[verbosity: " << verbosity << "]\n"; str_buf << "[max_bin: " << max_bin << "]\n"; str_buf << "[min_data_in_bin: " << min_data_in_bin << "]\n"; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 6f0955e94d2d..65c63ab28a73 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -64,6 +65,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves); // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); + splits_per_leaf_.resize(config_->num_leaves*train_data_->num_features()); // get ordered bin train_data_->CreateOrderedBins(&ordered_bins_); @@ -98,6 +100,16 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian } } Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_); + feature_used.clear(); + feature_used.resize(train_data->num_features()); + + if(!config_->cegb_penalty_feature_coupled.empty()){ + CHECK(config_->cegb_penalty_feature_coupled.size() == static_cast(train_data_->num_total_features())); + } + if(!config_->cegb_penalty_feature_lazy.empty()){ + CHECK(config_->cegb_penalty_feature_lazy.size() == static_cast(train_data_->num_total_features())); + feature_used_in_data = Common::EmptyBitset(train_data->num_features() * num_data_); + } } void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { @@ -469,6 +481,28 @@ void SerialTreeLearner::ConstructHistograms(const std::vector& is_featur #endif } +double SerialTreeLearner::CalculateOndemandCosts(int feature_index, int leaf_index) { + if (config_->cegb_penalty_feature_lazy.empty()) + return 0.0f; + + double penalty = config_->cegb_penalty_feature_lazy[feature_index]; + + const int inner_fidx = train_data_->InnerFeatureIndex(feature_index); + + double total = 0.0f; + data_size_t cnt_leaf_data = 0; + auto tmp_idx = data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data); + + for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) { + int real_idx = tmp_idx[i_input]; + if (Common::FindInBitset(feature_used_in_data.data(), train_data_->num_data()*train_data_->num_features(), train_data_->num_data() * inner_fidx + real_idx)) + continue; + 
total += penalty; + } + + return total; +} + void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& is_feature_used, bool use_subtract) { #ifdef TIMETAG auto start_time = std::chrono::steady_clock::now(); @@ -496,6 +530,14 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& smaller_leaf_splits_->max_constraint(), &smaller_split); smaller_split.feature = real_fidx; + smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * smaller_leaf_splits_->num_data_in_leaf(); + if(!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]){ + smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx]; + } + if(!config_->cegb_penalty_feature_lazy.empty()){ + smaller_split.gain -= config_->cegb_tradeoff * CalculateOndemandCosts(real_fidx, smaller_leaf_splits_->LeafIndex()); + } + splits_per_leaf_[smaller_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = smaller_split; if (smaller_split > smaller_best[tid]) { smaller_best[tid] = smaller_split; } @@ -519,6 +561,14 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector& larger_leaf_splits_->max_constraint(), &larger_split); larger_split.feature = real_fidx; + larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * larger_leaf_splits_->num_data_in_leaf(); + if(!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]){ + larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx]; + } + if(!config_->cegb_penalty_feature_lazy.empty()){ + larger_split.gain -= config_->cegb_tradeoff*CalculateOndemandCosts(real_fidx, larger_leaf_splits_->LeafIndex()); + } + splits_per_leaf_[larger_leaf_splits_->LeafIndex()*train_data_->num_features() + feature_index] = larger_split; if (larger_split > larger_best[tid]) { larger_best[tid] = larger_split; } @@ -703,6 +753,26 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int* void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) { const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf]; const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); + if(!config_->cegb_penalty_feature_coupled.empty() && !feature_used[inner_feature_index]){ + feature_used[inner_feature_index] = true; + for(int i = 0; i < tree->num_leaves(); ++i){ + if(i == best_leaf) continue; + auto split = &splits_per_leaf_[i*train_data_->num_features() + inner_feature_index]; + split->gain += config_->cegb_tradeoff*config_->cegb_penalty_feature_coupled[best_split_info.feature]; + if(*split > best_split_per_leaf_[i]) + best_split_per_leaf_[i] = *split; + } + } + + if(!config_->cegb_penalty_feature_lazy.empty()){ + data_size_t cnt_leaf_data = 0; + auto tmp_idx = data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data); + for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) { + int real_idx = tmp_idx[i_input]; + Common::InsertBitset(feature_used_in_data, train_data_->num_data() * inner_feature_index + real_idx); + } + } + // left = parent *left_leaf = best_leaf; bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 2d1c7a728988..90facbb06a36 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -112,6 +112,9 @@ class 
SerialTreeLearner: public TreeLearner { * \return The number of data in the leaf_idx leaf */ inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const; + + double CalculateOndemandCosts(int feature_index, int leaf_index); + /*! \brief number of data */ data_size_t num_data_; /*! \brief number of features */ @@ -137,6 +140,8 @@ class SerialTreeLearner: public TreeLearner { /*! \brief store best split points for all leaves */ std::vector best_split_per_leaf_; + /*! \brief store best split per feature for all leaves */ + std::vector splits_per_leaf_; /*! \brief stores best thresholds for all feature for smaller leaf */ std::unique_ptr smaller_leaf_splits_; @@ -169,6 +174,9 @@ class SerialTreeLearner: public TreeLearner { int num_threads_; std::vector ordered_bin_indices_; bool is_constant_hessian_; + + std::vector feature_used; + std::vector feature_used_in_data; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index c86a64b41d22..6981385c0dc5 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -216,3 +216,68 @@ def test_add_features_monotone_types(self): self.assertIsNone(actual) else: np.testing.assert_array_equal(actual, expected) + + def test_cegb_affects_behavior(self): + X = np.random.random((1000, 5)) + X[:, [1, 3]] = 0 + y = np.random.random(1000) + names = ['col_%d' % i for i in range(5)] + ds = lgb.Dataset(X, feature_name=names).construct() + ds.set_label(y) + base = lgb.Booster(train_set=ds) + for k in range(10): + base.update() + with tempfile.NamedTemporaryFile() as f: + basename = f.name + base.save_model(basename) + with open(basename, 'rt') as f: + basetxt = f.read() + # Set extremely harsh penalties, so CEGB will block most splits. + cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]}, + {'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]}, + {'cegb_penalty_split': 1}] + for case in cases: + booster = lgb.Booster(train_set=ds, params=case) + for k in range(10): + booster.update() + with tempfile.NamedTemporaryFile() as f: + casename = f.name + booster.save_model(casename) + with open(casename, 'rt') as f: + casetxt = f.read() + self.assertNotEqual(basetxt, casetxt) + + def test_cegb_scaling_equalities(self): + X = np.random.random((1000, 5)) + X[:, [1, 3]] = 0 + y = np.random.random(1000) + names = ['col_%d' % i for i in range(5)] + ds = lgb.Dataset(X, feature_name=names).construct() + ds.set_label(y) + # Compare pairs of penalties, to ensure scaling works as intended + pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]}, + {'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}), + ({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]}, + {'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}), + ({'cegb_penalty_split': 1}, + {'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})] + for (p1, p2) in pairs: + booster1 = lgb.Booster(train_set=ds, params=p1) + booster2 = lgb.Booster(train_set=ds, params=p2) + for k in range(10): + booster1.update() + booster2.update() + with tempfile.NamedTemporaryFile() as f: + p1name = f.name + # Reset booster1's parameters to p2, so the parameter section of the file matches. 
+ booster1.reset_parameter(p2) + booster1.save_model(p1name) + with open(p1name, 'rt') as f: + p1txt = f.read() + with tempfile.NamedTemporaryFile() as f: + p2name = f.name + booster2.save_model(p2name) + self.maxDiff = None + with open(p2name, 'rt') as f: + p2txt = f.read() + self.assertEqual(p1txt, p2txt)
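
(Editor's note: as a rough summary of the tree-learner changes above, the penalised split gain can be thought of as sketched below. This is an illustrative Python rendering only; the function and variable names are invented here and do not correspond to the actual C++ identifiers in serial_tree_learner.cpp.)

    def penalised_gain(raw_gain, n_data_in_leaf, feature, rows_in_leaf,
                       feature_used_in_forest, row_has_used_feature,
                       tradeoff, penalty_split, penalty_coupled, penalty_lazy):
        # Mirrors the gain adjustments made in FindBestSplitsFromHistograms.
        gain = raw_gain
        # Split penalty: a flat cost per split, scaled by the rows reaching the leaf.
        gain -= tradeoff * penalty_split * n_data_in_leaf
        # Coupled penalty: charged only if the feature has never been used in the forest.
        if penalty_coupled and not feature_used_in_forest[feature]:
            gain -= tradeoff * penalty_coupled[feature]
        # Lazy penalty: charged once per row in the leaf that has not used the feature before.
        if penalty_lazy:
            unseen = sum(1 for r in rows_in_leaf if not row_has_used_feature[feature][r])
            gain -= tradeoff * penalty_lazy[feature] * unseen
        return gain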