From 54bb9c38828b7d4b01a82ef2e86bcea4d97cc4bc Mon Sep 17 00:00:00 2001
From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com>
Date: Tue, 11 Aug 2020 15:10:36 -0400
Subject: [PATCH] [ML] add LGBMClassifier transform support

---
 eland/ml/imported_ml_model.py                 |   9 +-
 eland/ml/transformers/__init__.py             |  10 +-
 eland/ml/transformers/lightgbm.py             | 102 ++++++++++++++----
 .../tests/ml/test_imported_ml_model_pytest.py |  86 ++++++++++++---
 4 files changed, 169 insertions(+), 38 deletions(-)

diff --git a/eland/ml/imported_ml_model.py b/eland/ml/imported_ml_model.py
index 68e6e5ee..deed9621 100644
--- a/eland/ml/imported_ml_model.py
+++ b/eland/ml/imported_ml_model.py
@@ -39,7 +39,7 @@
 except ImportError:
     pass
 
 try:
-    from lightgbm import LGBMRegressor  # type: ignore # noqa: f401
+    from lightgbm import LGBMRegressor, LGBMClassifier  # type: ignore # noqa: f401
 except ImportError:
     pass
@@ -72,6 +72,12 @@ class ImportedMLModel(MLModel):
             - "fair"
             - "quantile"
             - "mape"
+    - lightgbm.LGBMClassifier
+        - Categorical fields are expected to have already been processed
+        - only the following objectives are supported:
+            - "binary"
+            - "multiclass"
+            - "multiclassova"
     - xgboost.XGBClassifier
         - only the following objectives are supported:
             - "binary:logistic"
@@ -144,6 +150,7 @@ def __init__(
             "XGBClassifier",
             "XGBRegressor",
             "LGBMRegressor",
+            "LGBMClassifier",
         ],
         feature_names: List[str],
         classification_labels: Optional[List[str]] = None,
diff --git a/eland/ml/transformers/__init__.py b/eland/ml/transformers/__init__.py
index 3fe0c0cf..8beebba9 100644
--- a/eland/ml/transformers/__init__.py
+++ b/eland/ml/transformers/__init__.py
@@ -86,12 +86,20 @@ def get_model_transformer(model: Any, **kwargs: Any) -> ModelTransformer:
 try:
     from .lightgbm import (
         LGBMRegressor,
+        LGBMClassifier,
         LGBMForestTransformer,
         LGBMRegressorTransformer,
+        LGBMClassifierTransformer,
         _MODEL_TRANSFORMERS as _LIGHTGBM_MODEL_TRANSFORMERS,
     )
 
-    __all__ += ["LGBMRegressor", "LGBMForestTransformer", "LGBMRegressorTransformer"]
+    __all__ += [
+        "LGBMRegressor",
+        "LGBMClassifier",
+        "LGBMForestTransformer",
+        "LGBMRegressorTransformer",
+        "LGBMClassifierTransformer",
+    ]
     _MODEL_TRANSFORMERS.update(_LIGHTGBM_MODEL_TRANSFORMERS)
 except ImportError:
     pass
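Registering LGBMClassifier in `_MODEL_TRANSFORMERS` is what makes the feature reachable: `get_model_transformer` walks that mapping and picks a transformer by model type. A minimal sketch of the dispatch (not part of the patch; the explicit `None` labels and weights mirror what `ImportedMLModel` passes through):

```python
from lightgbm import LGBMClassifier
from sklearn import datasets

from eland.ml.transformers import get_model_transformer

X, y = datasets.make_classification(n_features=5)
clf = LGBMClassifier(objective="binary").fit(X, y)

# Type-based lookup selects LGBMClassifierTransformer; with no explicit
# labels the transformer falls back to [str(x) for x in model.classes_].
transformer = get_model_transformer(
    clf,
    feature_names=["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"],
    classification_labels=None,
    classification_weights=None,
)
print(type(transformer).__name__)  # LGBMClassifierTransformer
```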
diff --git a/eland/ml/transformers/lightgbm.py b/eland/ml/transformers/lightgbm.py
index 509c7e68..b3cead22 100644
--- a/eland/ml/transformers/lightgbm.py
+++ b/eland/ml/transformers/lightgbm.py
@@ -23,7 +23,7 @@
 import_optional_dependency("lightgbm", on_version="warn")
 
-from lightgbm import Booster, LGBMRegressor  # type: ignore
+from lightgbm import Booster, LGBMRegressor, LGBMClassifier  # type: ignore
 
 
 def transform_decider(decider: str) -> str:
@@ -69,10 +69,34 @@ def __init__(
         super().__init__(
             model, feature_names, classification_labels, classification_weights
         )
-        self._node_decision_type = "lte"
         self._objective = model.params["objective"]
 
-    def build_tree(self, tree_json_obj: Dict[str, Any]) -> Tree:
+    def make_inner_node(
+        self,
+        tree_id: int,
+        node_id: int,
+        tree_node_json_obj: Dict[str, Any],
+        left_child: int,
+        right_child: int,
+    ) -> TreeNode:
+        return TreeNode(
+            node_idx=node_id,
+            default_left=tree_node_json_obj["default_left"],
+            split_feature=int(tree_node_json_obj["split_feature"]),
+            threshold=float(tree_node_json_obj["threshold"]),
+            decision_type=transform_decider(tree_node_json_obj["decision_type"]),
+            left_child=left_child,
+            right_child=right_child,
+        )
+
+    def make_leaf_node(
+        self, tree_id: int, node_id: int, tree_node_json_obj: Dict[str, Any]
+    ) -> TreeNode:
+        return TreeNode(
+            node_idx=node_id, leaf_value=[float(tree_node_json_obj["leaf_value"])],
+        )
+
+    def build_tree(self, tree_id: int, tree_json_obj: Dict[str, Any]) -> Tree:
         tree_nodes = list()
         next_id = Counter()
 
@@ -80,25 +104,14 @@ def add_tree_node(tree_node_json_obj: Dict[str, Any], counter: Counter) -> int:
             curr_id = counter.value()
             if "leaf_value" in tree_node_json_obj:
                 tree_nodes.append(
-                    TreeNode(
-                        node_idx=curr_id,
-                        leaf_value=[float(tree_node_json_obj["leaf_value"])],
-                    )
+                    self.make_leaf_node(tree_id, curr_id, tree_node_json_obj)
                 )
                 return curr_id
             left_id = add_tree_node(tree_node_json_obj["left_child"], counter.inc())
             right_id = add_tree_node(tree_node_json_obj["right_child"], counter.inc())
             tree_nodes.append(
-                TreeNode(
-                    node_idx=curr_id,
-                    default_left=tree_node_json_obj["default_left"],
-                    split_feature=tree_node_json_obj["split_feature"],
-                    threshold=float(tree_node_json_obj["threshold"]),
-                    decision_type=transform_decider(
-                        tree_node_json_obj["decision_type"]
-                    ),
-                    left_child=left_id,
-                    right_child=right_id,
+                self.make_inner_node(
+                    tree_id, curr_id, tree_node_json_obj, left_id, right_id
                 )
             )
             return curr_id
@@ -120,7 +133,7 @@ def build_forest(self) -> List[Tree]:
         """
         self.check_model_booster()
         json_dump = self._model.dump_model()
-        return [self.build_tree(t) for t in json_dump["tree_info"]]
+        return [self.build_tree(i, t) for i, t in enumerate(json_dump["tree_info"])]
 
     def build_aggregator_output(self) -> Dict[str, Any]:
         raise NotImplementedError("build_aggregator_output must be implemented")
@@ -190,6 +203,57 @@ def model_type(self) -> str:
         return MLModel.TYPE_REGRESSION
 
 
+class LGBMClassifierTransformer(LGBMForestTransformer):
+    def __init__(
+        self,
+        model: LGBMClassifier,
+        feature_names: List[str],
+        classification_labels: List[str],
+        classification_weights: List[float],
+    ):
+        super().__init__(
+            model.booster_, feature_names, classification_labels, classification_weights
+        )
+        self.n_estimators = int(model.n_estimators)
+        self.n_classes = int(model.n_classes_)
+        if not classification_labels:
+            self._classification_labels = [str(x) for x in model.classes_]
+
+    def make_leaf_node(
+        self, tree_id: int, node_id: int, tree_node_json_obj: Dict[str, Any]
+    ) -> TreeNode:
+        if self._objective == "binary":
+            return super().make_leaf_node(tree_id, node_id, tree_node_json_obj)
+        leaf_val = [0.0] * self.n_classes
+        leaf_val[tree_id % self.n_classes] = float(tree_node_json_obj["leaf_value"])
+        return TreeNode(node_idx=node_id, leaf_value=leaf_val)
+
+    def check_model_booster(self) -> None:
+        if self._model.params["boosting_type"] not in {"gbdt", "rf", "dart", "goss"}:
+            raise ValueError(
+                f"boosting type must exist and be of type 'gbdt', 'rf', 'dart', or 'goss'"
+                f", was {self._model.params['boosting_type']!r}"
+            )
+
+    def determine_target_type(self) -> str:
+        return "classification"
+
+    def build_aggregator_output(self) -> Dict[str, Any]:
+        return {"logistic_regression": {}}
+
+    @property
+    def model_type(self) -> str:
+        return MLModel.TYPE_CLASSIFICATION
+
+    def is_objective_supported(self) -> bool:
+        return self._objective in {
+            "binary",
+            "multiclass",
+            "multiclassova",
+        }
+
+
 _MODEL_TRANSFORMERS: Dict[type, Type[ModelTransformer]] = {
-    LGBMRegressor: LGBMRegressorTransformer
+    LGBMRegressor: LGBMRegressorTransformer,
+    LGBMClassifier: LGBMClassifierTransformer,
 }
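The `tree_id % n_classes` indexing in `make_leaf_node` relies on how LightGBM lays out multiclass ensembles: one tree per class per boosting round, ordered class `0..k-1`, so tree `i` contributes only to class `i % k`. Each leaf therefore becomes a vector that is zero except at its tree's class index; Elasticsearch sums those vectors across all trees, recovering LightGBM's per-class raw scores, and the `logistic_regression` aggregate output turns the sums into probabilities (under the "binary" objective the single-valued leaves are kept and a plain sigmoid applies). A quick way to see the layout assumption (an illustrative sketch, assuming training runs all rounds with no early stopping):

```python
from lightgbm import LGBMClassifier
from sklearn import datasets

X, y = datasets.make_classification(n_features=5, n_classes=3, n_informative=3)
clf = LGBMClassifier(objective="multiclass", n_estimators=10).fit(X, y)

trees = clf.booster_.dump_model()["tree_info"]
assert len(trees) == clf.n_estimators * clf.n_classes_  # one tree per class per round

for i in range(6):
    # the class whose raw score tree i feeds under the round-robin layout
    print(f"tree {i} -> class {i % clf.n_classes_}")
```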
diff --git a/eland/tests/ml/test_imported_ml_model_pytest.py b/eland/tests/ml/test_imported_ml_model_pytest.py
index 43e6bc60..01018d25 100644
--- a/eland/tests/ml/test_imported_ml_model_pytest.py
+++ b/eland/tests/ml/test_imported_ml_model_pytest.py
@@ -39,7 +39,7 @@
 HAS_XGBOOST = False
 
 try:
-    from lightgbm import LGBMRegressor
+    from lightgbm import LGBMRegressor, LGBMClassifier
 
     HAS_LIGHTGBM = True
 except ImportError:
@@ -62,6 +62,10 @@
 )
 
 
+def random_rows(data, size):
+    return data[np.random.randint(data.shape[0], size=size), :].tolist()
+
+
 def check_prediction_equality(es_model, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
@@ -140,8 +144,9 @@ def test_decision_tree_classifier(self, compress_model_definition):
         )
 
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -167,8 +172,9 @@ def test_decision_tree_regressor(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -194,8 +200,9 @@ def test_random_forest_classifier(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -221,8 +228,9 @@ def test_random_forest_regressor(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
        )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -257,8 +265,9 @@ def test_xgb_classifier(self, compress_model_definition, multi_class):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -290,8 +299,9 @@ def test_xgb_classifier_objectives_and_booster(self, objective, booster):
             ES_TEST_CLIENT, model_id, classifier, feature_names, overwrite=True
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -326,8 +336,9 @@ def test_xgb_regressor(self, compress_model_definition, objective, booster):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
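The hunks above all make the same substitution: the hard-coded probe vectors (values like `-10` and `50`, far outside the roughly standard-normal features that `make_classification` and `make_regression` produce) give way to twenty rows sampled from the training set, so the Elasticsearch-side and in-process models are compared on in-distribution points. For reference, a standalone, seeded equivalent of the new helper (the seed is this sketch's addition; the tests themselves do not seed):

```python
import numpy as np
from sklearn import datasets


def random_rows(data, size):
    # sample `size` rows (with replacement) and convert to plain lists
    return data[np.random.randint(data.shape[0], size=size), :].tolist()


np.random.seed(0)  # hypothetical: only to make this sketch reproducible
X, y = datasets.make_classification(n_features=5)
sample = random_rows(X, 20)
assert len(sample) == 20 and len(sample[0]) == 5
```

The final hunk below applies the same substitution to the LightGBM regressor test and then adds the new classifier test.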
@@ -393,8 +404,49 @@ def test_lgbm_regressor(self, compress_model_definition, objective, booster):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )
+
+        # Clean up
+        es_model.delete_model()
+
+    @requires_lightgbm
+    @pytest.mark.parametrize("compress_model_definition", [True, False])
+    @pytest.mark.parametrize("objective", ["binary", "multiclass", "multiclassova"])
+    @pytest.mark.parametrize("booster", ["gbdt", "dart", "goss"])
+    def test_lgbm_classifier_objectives_and_booster(
+        self, compress_model_definition, objective, booster
+    ):
+        # test both multiclass and binary classification
+        if objective.startswith("multi"):
+            training_data = datasets.make_classification(
+                n_features=5, n_classes=3, n_informative=3
+            )
+            classifier = LGBMClassifier(boosting_type=booster, objective=objective)
+        else:
+            training_data = datasets.make_classification(n_features=5)
+            classifier = LGBMClassifier(boosting_type=booster, objective=objective)
+
+        # Train model
+        classifier.fit(training_data[0], training_data[1])
+
+        # Serialise the models to Elasticsearch
+        feature_names = ["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"]
+        model_id = "test_lgbm_classifier"
+
+        es_model = ImportedMLModel(
+            ES_TEST_CLIENT,
+            model_id,
+            classifier,
+            feature_names,
+            overwrite=True,
+            es_compress_model_definition=compress_model_definition,
+        )
+
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
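Outside the test harness, the new path looks like this end to end (a sketch, not part of the patch: `es_client` is a placeholder for a configured `elasticsearch.Elasticsearch` instance, and the model id is arbitrary):

```python
from lightgbm import LGBMClassifier
from sklearn import datasets

from eland.ml import ImportedMLModel

X, y = datasets.make_classification(n_features=5, n_classes=3, n_informative=3)
clf = LGBMClassifier(objective="multiclass").fit(X, y)

es_model = ImportedMLModel(
    es_client,  # placeholder: a configured elasticsearch.Elasticsearch client
    "my-lgbm-classifier",
    clf,
    feature_names=["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"],
    overwrite=True,
)

# Predictions are now served by Elasticsearch inference; they should agree
# with the local model, which is what check_prediction_equality asserts.
print(es_model.predict(X[:5].tolist()))
print(clf.predict(X[:5]))

es_model.delete_model()
```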