From 54bb9c38828b7d4b01a82ef2e86bcea4d97cc4bc Mon Sep 17 00:00:00 2001
From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com>
Date: Tue, 11 Aug 2020 15:10:36 -0400
Subject: [PATCH] [ML] add LGBMClassifier transform support

---
 eland/ml/imported_ml_model.py                 |   9 +-
 eland/ml/transformers/__init__.py             |  10 +-
 eland/ml/transformers/lightgbm.py             | 102 ++++++++++++++----
 .../tests/ml/test_imported_ml_model_pytest.py |  86 ++++++++++++---
 4 files changed, 169 insertions(+), 38 deletions(-)

diff --git a/eland/ml/imported_ml_model.py b/eland/ml/imported_ml_model.py
index 68e6e5ee..deed9621 100644
--- a/eland/ml/imported_ml_model.py
+++ b/eland/ml/imported_ml_model.py
@@ -39,7 +39,7 @@
 except ImportError:
     pass
 
 try:
-    from lightgbm import LGBMRegressor  # type: ignore # noqa: f401
+    from lightgbm import LGBMRegressor, LGBMClassifier  # type: ignore # noqa: f401
 except ImportError:
     pass
@@ -72,6 +72,12 @@ class ImportedMLModel(MLModel):
             - "fair"
             - "quantile"
             - "mape"
+    - lightgbm.LGBMClassifier
+        - Categorical fields are expected to have already been processed
+        - only the following objectives are supported:
+            - "binary"
+            - "multiclass"
+            - "multiclassova"
     - xgboost.XGBClassifier
         - only the following objectives are supported:
             - "binary:logistic"
@@ -144,6 +150,7 @@ def __init__(
             "XGBClassifier",
             "XGBRegressor",
             "LGBMRegressor",
+            "LGBMClassifier",
         ],
         feature_names: List[str],
         classification_labels: Optional[List[str]] = None,
diff --git a/eland/ml/transformers/__init__.py b/eland/ml/transformers/__init__.py
index 3fe0c0cf..8beebba9 100644
--- a/eland/ml/transformers/__init__.py
+++ b/eland/ml/transformers/__init__.py
@@ -86,12 +86,20 @@ def get_model_transformer(model: Any, **kwargs: Any) -> ModelTransformer:
 try:
     from .lightgbm import (
         LGBMRegressor,
+        LGBMClassifier,
         LGBMForestTransformer,
         LGBMRegressorTransformer,
+        LGBMClassifierTransformer,
         _MODEL_TRANSFORMERS as _LIGHTGBM_MODEL_TRANSFORMERS,
     )
 
-    __all__ += ["LGBMRegressor", "LGBMForestTransformer", "LGBMRegressorTransformer"]
+    __all__ += [
+        "LGBMRegressor",
+        "LGBMClassifier",
+        "LGBMForestTransformer",
+        "LGBMRegressorTransformer",
+        "LGBMClassifierTransformer",
+    ]
     _MODEL_TRANSFORMERS.update(_LIGHTGBM_MODEL_TRANSFORMERS)
 except ImportError:
     pass
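Registering LGBMClassifier in `_MODEL_TRANSFORMERS` is what makes the feature reachable: `get_model_transformer` walks that mapping and picks a transformer by model type. A minimal sketch of the dispatch (not part of the patch; the explicit `None` labels and weights mirror what `ImportedMLModel` passes through):

```python
from lightgbm import LGBMClassifier
from sklearn import datasets

from eland.ml.transformers import get_model_transformer

X, y = datasets.make_classification(n_features=5)
clf = LGBMClassifier(objective="binary").fit(X, y)

# Type-based lookup selects LGBMClassifierTransformer; with no explicit
# labels the transformer falls back to [str(x) for x in model.classes_].
transformer = get_model_transformer(
    clf,
    feature_names=["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"],
    classification_labels=None,
    classification_weights=None,
)
print(type(transformer).__name__)  # LGBMClassifierTransformer
```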
diff --git a/eland/ml/transformers/lightgbm.py b/eland/ml/transformers/lightgbm.py
index 509c7e68..b3cead22 100644
--- a/eland/ml/transformers/lightgbm.py
+++ b/eland/ml/transformers/lightgbm.py
@@ -23,7 +23,7 @@
 import_optional_dependency("lightgbm", on_version="warn")
 
-from lightgbm import Booster, LGBMRegressor  # type: ignore
+from lightgbm import Booster, LGBMRegressor, LGBMClassifier  # type: ignore
 
 
 def transform_decider(decider: str) -> str:
@@ -69,10 +69,34 @@ def __init__(
         super().__init__(
             model, feature_names, classification_labels, classification_weights
         )
-        self._node_decision_type = "lte"
         self._objective = model.params["objective"]
 
-    def build_tree(self, tree_json_obj: Dict[str, Any]) -> Tree:
+    def make_inner_node(
+        self,
+        tree_id: int,
+        node_id: int,
+        tree_node_json_obj: Dict[str, Any],
+        left_child: int,
+        right_child: int,
+    ) -> TreeNode:
+        return TreeNode(
+            node_idx=node_id,
+            default_left=tree_node_json_obj["default_left"],
+            split_feature=int(tree_node_json_obj["split_feature"]),
+            threshold=float(tree_node_json_obj["threshold"]),
+            decision_type=transform_decider(tree_node_json_obj["decision_type"]),
+            left_child=left_child,
+            right_child=right_child,
+        )
+
+    def make_leaf_node(
+        self, tree_id: int, node_id: int, tree_node_json_obj: Dict[str, Any]
+    ) -> TreeNode:
+        return TreeNode(
+            node_idx=node_id, leaf_value=[float(tree_node_json_obj["leaf_value"])],
+        )
+
+    def build_tree(self, tree_id: int, tree_json_obj: Dict[str, Any]) -> Tree:
         tree_nodes = list()
         next_id = Counter()
 
@@ -80,25 +104,14 @@ def add_tree_node(tree_node_json_obj: Dict[str, Any], counter: Counter) -> int:
             curr_id = counter.value()
             if "leaf_value" in tree_node_json_obj:
                 tree_nodes.append(
-                    TreeNode(
-                        node_idx=curr_id,
-                        leaf_value=[float(tree_node_json_obj["leaf_value"])],
-                    )
+                    self.make_leaf_node(tree_id, curr_id, tree_node_json_obj)
                 )
                 return curr_id
             left_id = add_tree_node(tree_node_json_obj["left_child"], counter.inc())
             right_id = add_tree_node(tree_node_json_obj["right_child"], counter.inc())
             tree_nodes.append(
-                TreeNode(
-                    node_idx=curr_id,
-                    default_left=tree_node_json_obj["default_left"],
-                    split_feature=tree_node_json_obj["split_feature"],
-                    threshold=float(tree_node_json_obj["threshold"]),
-                    decision_type=transform_decider(
-                        tree_node_json_obj["decision_type"]
-                    ),
-                    left_child=left_id,
-                    right_child=right_id,
+                self.make_inner_node(
+                    tree_id, curr_id, tree_node_json_obj, left_id, right_id
                 )
             )
             return curr_id
@@ -120,7 +133,7 @@ def build_forest(self) -> List[Tree]:
         """
         self.check_model_booster()
         json_dump = self._model.dump_model()
-        return [self.build_tree(t) for t in json_dump["tree_info"]]
+        return [self.build_tree(i, t) for i, t in enumerate(json_dump["tree_info"])]
 
     def build_aggregator_output(self) -> Dict[str, Any]:
         raise NotImplementedError("build_aggregator_output must be implemented")
@@ -190,6 +203,57 @@ def model_type(self) -> str:
         return MLModel.TYPE_REGRESSION
 
 
+class LGBMClassifierTransformer(LGBMForestTransformer):
+    def __init__(
+        self,
+        model: LGBMClassifier,
+        feature_names: List[str],
+        classification_labels: List[str],
+        classification_weights: List[float],
+    ):
+        super().__init__(
+            model.booster_, feature_names, classification_labels, classification_weights
+        )
+        self.n_estimators = int(model.n_estimators)
+        self.n_classes = int(model.n_classes_)
+        if not classification_labels:
+            self._classification_labels = [str(x) for x in model.classes_]
+
+    def make_leaf_node(
+        self, tree_id: int, node_id: int, tree_node_json_obj: Dict[str, Any]
+    ) -> TreeNode:
+        if self._objective == "binary":
+            return super().make_leaf_node(tree_id, node_id, tree_node_json_obj)
+        leaf_val = [0.0] * self.n_classes
+        leaf_val[tree_id % self.n_classes] = float(tree_node_json_obj["leaf_value"])
+        return TreeNode(node_idx=node_id, leaf_value=leaf_val)
+
+    def check_model_booster(self) -> None:
+        if self._model.params["boosting_type"] not in {"gbdt", "rf", "dart", "goss"}:
+            raise ValueError(
+                f"boosting type must exist and be of type 'gbdt', 'rf', 'dart', or 'goss'"
+                f", was {self._model.params['boosting_type']!r}"
+            )
+
+    def determine_target_type(self) -> str:
+        return "classification"
+
+    def build_aggregator_output(self) -> Dict[str, Any]:
+        return {"logistic_regression": {}}
+
+    @property
+    def model_type(self) -> str:
+        return MLModel.TYPE_CLASSIFICATION
+
+    def is_objective_supported(self) -> bool:
+        return self._objective in {
+            "binary",
+            "multiclass",
+            "multiclassova",
+        }
+
+
 _MODEL_TRANSFORMERS: Dict[type, Type[ModelTransformer]] = {
-    LGBMRegressor: LGBMRegressorTransformer
+    LGBMRegressor: LGBMRegressorTransformer,
+    LGBMClassifier: LGBMClassifierTransformer,
 }
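The `tree_id % n_classes` indexing in `make_leaf_node` relies on how LightGBM lays out multiclass ensembles: one tree per class per boosting round, ordered class `0..k-1`, so tree `i` contributes only to class `i % k`. Each leaf therefore becomes a vector that is zero except at its tree's class index; Elasticsearch sums those vectors across all trees, recovering LightGBM's per-class raw scores, and the `logistic_regression` aggregate output turns the sums into probabilities (under the "binary" objective the single-valued leaves are kept and a plain sigmoid applies). A quick way to see the layout assumption (an illustrative sketch, assuming training runs all rounds with no early stopping):

```python
from lightgbm import LGBMClassifier
from sklearn import datasets

X, y = datasets.make_classification(n_features=5, n_classes=3, n_informative=3)
clf = LGBMClassifier(objective="multiclass", n_estimators=10).fit(X, y)

trees = clf.booster_.dump_model()["tree_info"]
assert len(trees) == clf.n_estimators * clf.n_classes_  # one tree per class per round

for i in range(6):
    # the class whose raw score tree i feeds under the round-robin layout
    print(f"tree {i} -> class {i % clf.n_classes_}")
```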
diff --git a/eland/tests/ml/test_imported_ml_model_pytest.py b/eland/tests/ml/test_imported_ml_model_pytest.py
index 43e6bc60..01018d25 100644
--- a/eland/tests/ml/test_imported_ml_model_pytest.py
+++ b/eland/tests/ml/test_imported_ml_model_pytest.py
@@ -39,7 +39,7 @@
 HAS_XGBOOST = False
 
 try:
-    from lightgbm import LGBMRegressor
+    from lightgbm import LGBMRegressor, LGBMClassifier
 
     HAS_LIGHTGBM = True
 except ImportError:
@@ -62,6 +62,10 @@
 )
 
 
+def random_rows(data, size):
+    return data[np.random.randint(data.shape[0], size=size), :].tolist()
+
+
 def check_prediction_equality(es_model, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
@@ -140,8 +144,9 @@ def test_decision_tree_classifier(self, compress_model_definition):
         )
 
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -167,8 +172,9 @@ def test_decision_tree_regressor(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -194,8 +200,9 @@ def test_random_forest_classifier(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -221,8 +228,9 @@ def test_random_forest_regressor(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
        )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -257,8 +265,9 @@ def test_xgb_classifier(self, compress_model_definition, multi_class):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -290,8 +299,9 @@ def test_xgb_classifier_objectives_and_booster(self, objective, booster):
             ES_TEST_CLIENT, model_id, classifier, feature_names, overwrite=True
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
@@ -326,8 +336,9 @@ def test_xgb_regressor(self, compress_model_definition, objective, booster):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
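The hunks above all make the same substitution: the hard-coded probe vectors (values like `-10` and `50`, far outside the roughly standard-normal features that `make_classification` and `make_regression` produce) give way to twenty rows sampled from the training set, so the Elasticsearch-side and in-process models are compared on in-distribution points. For reference, a standalone, seeded equivalent of the new helper (the seed is this sketch's addition; the tests themselves do not seed):

```python
import numpy as np
from sklearn import datasets


def random_rows(data, size):
    # sample `size` rows (with replacement) and convert to plain lists
    return data[np.random.randint(data.shape[0], size=size), :].tolist()


np.random.seed(0)  # hypothetical: only to make this sketch reproducible
X, y = datasets.make_classification(n_features=5)
sample = random_rows(X, 20)
assert len(sample) == 20 and len(sample[0]) == 5
```

The final hunk below applies the same substitution to the LightGBM regressor test and then adds the new classifier test.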
@@ -393,8 +404,49 @@ def test_lgbm_regressor(self, compress_model_definition, objective, booster):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )
+
+        # Clean up
+        es_model.delete_model()
+
+    @requires_lightgbm
+    @pytest.mark.parametrize("compress_model_definition", [True, False])
+    @pytest.mark.parametrize("objective", ["binary", "multiclass", "multiclassova"])
+    @pytest.mark.parametrize("booster", ["gbdt", "dart", "goss"])
+    def test_lgbm_classifier_objectives_and_booster(
+        self, compress_model_definition, objective, booster
+    ):
+        # test both multiclass and binary classification
+        if objective.startswith("multi"):
+            training_data = datasets.make_classification(
+                n_features=5, n_classes=3, n_informative=3
+            )
+            classifier = LGBMClassifier(boosting_type=booster, objective=objective)
+        else:
+            training_data = datasets.make_classification(n_features=5)
+            classifier = LGBMClassifier(boosting_type=booster, objective=objective)
+
+        # Train model
+        classifier.fit(training_data[0], training_data[1])
+
+        # Serialise the models to Elasticsearch
+        feature_names = ["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"]
+        model_id = "test_lgbm_classifier"
+
+        es_model = ImportedMLModel(
+            ES_TEST_CLIENT,
+            model_id,
+            classifier,
+            feature_names,
+            overwrite=True,
+            es_compress_model_definition=compress_model_definition,
+        )
+
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
 
         # Clean up
         es_model.delete_model()
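Outside the test harness, the new path looks like this end to end (a sketch, not part of the patch: `es_client` is a placeholder for a configured `elasticsearch.Elasticsearch` instance, and the model id is arbitrary):

```python
from lightgbm import LGBMClassifier
from sklearn import datasets

from eland.ml import ImportedMLModel

X, y = datasets.make_classification(n_features=5, n_classes=3, n_informative=3)
clf = LGBMClassifier(objective="multiclass").fit(X, y)

es_model = ImportedMLModel(
    es_client,  # placeholder: a configured elasticsearch.Elasticsearch client
    "my-lgbm-classifier",
    clf,
    feature_names=["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"],
    overwrite=True,
)

# Predictions are now served by Elasticsearch inference; they should agree
# with the local model, which is what check_prediction_equality asserts.
print(es_model.predict(X[:5].tolist()))
print(clf.predict(X[:5]))

es_model.delete_model()
```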