[ML] add LGBMClassifier transform support #252

Merged · 1 commit · Aug 12, 2020
9 changes: 8 additions & 1 deletion eland/ml/imported_ml_model.py
@@ -39,7 +39,7 @@
 except ImportError:
     pass
 try:
-    from lightgbm import LGBMRegressor  # type: ignore # noqa: f401
+    from lightgbm import LGBMRegressor, LGBMClassifier  # type: ignore # noqa: f401
 except ImportError:
     pass

@@ -72,6 +72,12 @@ class ImportedMLModel(MLModel):
- "fair"
- "quantile"
- "mape"
- lightgbm.LGBMClassifier
- Categorical fields are expected to already be processed
- Only the following objectives are supported
- "binary"
- "multiclass"
- "multiclassova"
- xgboost.XGBClassifier
- only the following objectives are supported:
- "binary:logistic"
@@ -144,6 +150,7 @@ def __init__(
"XGBClassifier",
"XGBRegressor",
"LGBMRegressor",
"LGBMClassifier",
],
feature_names: List[str],
classification_labels: Optional[List[str]] = None,
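Taken together, these changes let a fitted LGBMClassifier be imported like the other supported model types. Below is a minimal usage sketch, assuming a local Elasticsearch cluster; the connection URL and model_id are placeholders, and the flow mirrors the test code later in this PR.

# Sketch: train an LGBMClassifier locally, then import it for inference in Elasticsearch.
from elasticsearch import Elasticsearch
from lightgbm import LGBMClassifier
from sklearn import datasets

from eland.ml import ImportedMLModel

# Random training data, as in this PR's tests (5 features, 3 classes).
X, y = datasets.make_classification(n_features=5, n_classes=3, n_informative=3)
classifier = LGBMClassifier(objective="multiclass")  # "binary" and "multiclassova" also supported
classifier.fit(X, y)

es_client = Elasticsearch("http://localhost:9200")  # placeholder connection
es_model = ImportedMLModel(
    es_client,
    "my-lgbm-classifier",  # placeholder model_id
    classifier,
    feature_names=["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"],
    overwrite=True,
)

print(es_model.predict(X[:5].tolist()))  # inference now runs inside Elasticsearch
es_model.delete_model()  # clean up, as the tests do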
10 changes: 9 additions & 1 deletion eland/ml/transformers/__init__.py
@@ -86,12 +86,20 @@ def get_model_transformer(model: Any, **kwargs: Any) -> ModelTransformer:
 try:
     from .lightgbm import (
         LGBMRegressor,
+        LGBMClassifier,
         LGBMForestTransformer,
         LGBMRegressorTransformer,
+        LGBMClassifierTransformer,
         _MODEL_TRANSFORMERS as _LIGHTGBM_MODEL_TRANSFORMERS,
     )

-    __all__ += ["LGBMRegressor", "LGBMForestTransformer", "LGBMRegressorTransformer"]
+    __all__ += [
+        "LGBMRegressor",
+        "LGBMClassifier",
+        "LGBMForestTransformer",
+        "LGBMRegressorTransformer",
+        "LGBMClassifierTransformer",
+    ]
     _MODEL_TRANSFORMERS.update(_LIGHTGBM_MODEL_TRANSFORMERS)
 except ImportError:
     pass
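For orientation, the registry these optional imports populate is consumed by get_model_transformer, which dispatches on the model's class. A rough sketch of the pattern (not the verbatim eland source):

from typing import Any, Dict, Type

# Each available backend (sklearn, xgboost, lightgbm) merges its transformers
# into this registry, keyed by the third-party model class.
_MODEL_TRANSFORMERS: Dict[type, Type["ModelTransformer"]] = {}


def get_model_transformer(model: Any, **kwargs: Any) -> "ModelTransformer":
    # Walk the registry and hand the model to the first matching transformer.
    for model_type, transformer in _MODEL_TRANSFORMERS.items():
        if isinstance(model, model_type):
            return transformer(model, **kwargs)
    raise NotImplementedError(
        f"Importing ML models of type {type(model)} is not currently supported"
    )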
102 changes: 83 additions & 19 deletions eland/ml/transformers/lightgbm.py
@@ -23,7 +23,7 @@

import_optional_dependency("lightgbm", on_version="warn")

-from lightgbm import Booster, LGBMRegressor  # type: ignore
+from lightgbm import Booster, LGBMRegressor, LGBMClassifier  # type: ignore


def transform_decider(decider: str) -> str:
@@ -69,36 +69,49 @@ def __init__(
         super().__init__(
             model, feature_names, classification_labels, classification_weights
         )
-        self._node_decision_type = "lte"
         self._objective = model.params["objective"]

-    def build_tree(self, tree_json_obj: Dict[str, Any]) -> Tree:
+    def make_inner_node(
+        self,
+        tree_id: int,
+        node_id: int,
+        tree_node_json_obj: Dict[str, Any],
+        left_child: int,
+        right_child: int,
+    ) -> TreeNode:
+        return TreeNode(
+            node_idx=node_id,
+            default_left=tree_node_json_obj["default_left"],
+            split_feature=int(tree_node_json_obj["split_feature"]),
+            threshold=float(tree_node_json_obj["threshold"]),
+            decision_type=transform_decider(tree_node_json_obj["decision_type"]),
+            left_child=left_child,
+            right_child=right_child,
+        )
+
+    def make_leaf_node(
+        self, tree_id: int, node_id: int, tree_node_json_obj: Dict[str, Any]
+    ) -> TreeNode:
+        return TreeNode(
+            node_idx=node_id, leaf_value=[float(tree_node_json_obj["leaf_value"])],
+        )
+
+    def build_tree(self, tree_id: int, tree_json_obj: Dict[str, Any]) -> Tree:
         tree_nodes = list()
         next_id = Counter()

         def add_tree_node(tree_node_json_obj: Dict[str, Any], counter: Counter) -> int:
             curr_id = counter.value()
             if "leaf_value" in tree_node_json_obj:
                 tree_nodes.append(
-                    TreeNode(
-                        node_idx=curr_id,
-                        leaf_value=[float(tree_node_json_obj["leaf_value"])],
-                    )
+                    self.make_leaf_node(tree_id, curr_id, tree_node_json_obj)
                 )
                 return curr_id
             left_id = add_tree_node(tree_node_json_obj["left_child"], counter.inc())
             right_id = add_tree_node(tree_node_json_obj["right_child"], counter.inc())
             tree_nodes.append(
-                TreeNode(
-                    node_idx=curr_id,
-                    default_left=tree_node_json_obj["default_left"],
-                    split_feature=tree_node_json_obj["split_feature"],
-                    threshold=float(tree_node_json_obj["threshold"]),
-                    decision_type=transform_decider(
-                        tree_node_json_obj["decision_type"]
-                    ),
-                    left_child=left_id,
-                    right_child=right_id,
+                self.make_inner_node(
+                    tree_id, curr_id, tree_node_json_obj, left_id, right_id
                 )
             )
             return curr_id
@@ -120,7 +133,7 @@ def build_forest(self) -> List[Tree]:
"""
self.check_model_booster()
json_dump = self._model.dump_model()
return [self.build_tree(t) for t in json_dump["tree_info"]]
return [self.build_tree(i, t) for i, t in enumerate(json_dump["tree_info"])]

     def build_aggregator_output(self) -> Dict[str, Any]:
         raise NotImplementedError("build_aggregator_output must be implemented")
@@ -190,6 +203,57 @@ def model_type(self) -> str:
         return MLModel.TYPE_REGRESSION


+class LGBMClassifierTransformer(LGBMForestTransformer):
+    def __init__(
+        self,
+        model: LGBMClassifier,
+        feature_names: List[str],
+        classification_labels: List[str],
+        classification_weights: List[float],
+    ):
+        super().__init__(
+            model.booster_, feature_names, classification_labels, classification_weights
+        )
+        self.n_estimators = int(model.n_estimators)
+        self.n_classes = int(model.n_classes_)
+        if not classification_labels:
+            self._classification_labels = [str(x) for x in model.classes_]
+
+    def make_leaf_node(
+        self, tree_id: int, node_id: int, tree_node_json_obj: Dict[str, Any]
+    ) -> TreeNode:
+        if self._objective == "binary":
+            return super().make_leaf_node(tree_id, node_id, tree_node_json_obj)
+        leaf_val = [0.0] * self.n_classes
+        leaf_val[tree_id % self.n_classes] = float(tree_node_json_obj["leaf_value"])
+        return TreeNode(node_idx=node_id, leaf_value=leaf_val)
+
+    def check_model_booster(self) -> None:
+        if self._model.params["boosting_type"] not in {"gbdt", "rf", "dart", "goss"}:
+            raise ValueError(
+                f"boosting type must exist and be of type 'gbdt', 'rf', 'dart', or 'goss'"
+                f", was {self._model.params['boosting_type']!r}"
+            )
+
+    def determine_target_type(self) -> str:
+        return "classification"
+
+    def build_aggregator_output(self) -> Dict[str, Any]:
+        return {"logistic_regression": {}}
+
+    @property
+    def model_type(self) -> str:
+        return MLModel.TYPE_CLASSIFICATION
+
+    def is_objective_supported(self) -> bool:
+        return self._objective in {
+            "binary",
+            "multiclass",
+            "multiclassova",
+        }
+
+
 _MODEL_TRANSFORMERS: Dict[type, Type[ModelTransformer]] = {
-    LGBMRegressor: LGBMRegressorTransformer
+    LGBMRegressor: LGBMRegressorTransformer,
+    LGBMClassifier: LGBMClassifierTransformer,
 }
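The multiclass branch of make_leaf_node relies on how LightGBM lays out its ensemble: with a multiclass objective it trains one tree per class per boosting round, in round-robin order, so tree_id % n_classes identifies the class a given tree scores. Each scalar leaf value is expanded into a vector of length n_classes with the score in that class's slot, and the logistic_regression aggregator output then sums those vectors across trees. A tiny illustration of the expansion, assuming that round-robin ordering:

# How the leaf vectors line up across trees for n_classes = 3 (an
# illustration of make_leaf_node above, not library code).
n_classes = 3
leaf_value = 0.25  # scalar score read from one tree's leaf

for tree_id in range(6):  # two boosting rounds over three classes
    leaf_vec = [0.0] * n_classes
    leaf_vec[tree_id % n_classes] = leaf_value
    print(tree_id, leaf_vec)
# tree 0 -> [0.25, 0.0, 0.0], tree 1 -> [0.0, 0.25, 0.0],
# tree 2 -> [0.0, 0.0, 0.25], tree 3 -> [0.25, 0.0, 0.0], ...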
86 changes: 69 additions & 17 deletions eland/tests/ml/test_imported_ml_model_pytest.py
@@ -39,7 +39,7 @@
     HAS_XGBOOST = False

 try:
-    from lightgbm import LGBMRegressor
+    from lightgbm import LGBMRegressor, LGBMClassifier

     HAS_LIGHTGBM = True
 except ImportError:
@@ -62,6 +62,10 @@
)


+def random_rows(data, size):
+    return data[np.random.randint(data.shape[0], size=size), :].tolist()

[Author review comment on random_rows] this provides MUCH better testing. The training data is random, and selecting a random subset to infer against provides much better coverage than a static two rows.


 def check_prediction_equality(es_model, py_model, test_data):
     # Get some test results
     test_results = py_model.predict(np.asarray(test_data))
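One detail of random_rows worth noting: np.random.randint draws row indices with replacement, so a sample can contain duplicate rows, which is harmless for this prediction-equality check. A quick standalone demo with a toy array:

import numpy as np

# random_rows as defined above: indices are drawn with replacement.
data = np.arange(12).reshape(4, 3)  # 4 rows of 3 features
sample = data[np.random.randint(data.shape[0], size=2), :].tolist()
print(sample)  # e.g. [[6, 7, 8], [6, 7, 8]] - duplicates are possible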
@@ -140,8 +144,9 @@ def test_decision_tree_classifier(self, compress_model_definition):
         )

         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )

         # Clean up
         es_model.delete_model()
@@ -167,8 +172,9 @@ def test_decision_tree_regressor(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )

         # Clean up
         es_model.delete_model()
@@ -194,8 +200,9 @@ def test_random_forest_classifier(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )

         # Clean up
         es_model.delete_model()
@@ -221,8 +228,9 @@ def test_random_forest_regressor(self, compress_model_definition):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )

         # Clean up
         es_model.delete_model()
@@ -257,8 +265,9 @@ def test_xgb_classifier(self, compress_model_definition, multi_class):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )

         # Clean up
         es_model.delete_model()
@@ -290,8 +299,9 @@ def test_xgb_classifier_objectives_and_booster(self, objective, booster):
             ES_TEST_CLIENT, model_id, classifier, feature_names, overwrite=True
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, classifier, test_data)
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )

         # Clean up
         es_model.delete_model()
@@ -326,8 +336,9 @@ def test_xgb_regressor(self, compress_model_definition, objective, booster):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )

         # Clean up
         es_model.delete_model()
@@ -393,8 +404,49 @@ def test_lgbm_regressor(self, compress_model_definition, objective, booster):
             es_compress_model_definition=compress_model_definition,
         )
         # Get some test results
-        test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
-        check_prediction_equality(es_model, regressor, test_data)
+        check_prediction_equality(
+            es_model, regressor, random_rows(training_data[0], 20)
+        )

         # Clean up
         es_model.delete_model()

+    @requires_lightgbm
+    @pytest.mark.parametrize("compress_model_definition", [True, False])
+    @pytest.mark.parametrize("objective", ["binary", "multiclass", "multiclassova"])
+    @pytest.mark.parametrize("booster", ["gbdt", "dart", "goss"])
+    def test_lgbm_classifier_objectives_and_booster(
+        self, compress_model_definition, objective, booster
+    ):
+        # test both multiclass and binary classification
+        if objective.startswith("multi"):
+            training_data = datasets.make_classification(
+                n_features=5, n_classes=3, n_informative=3
+            )
+            classifier = LGBMClassifier(boosting_type=booster, objective=objective)
+        else:
+            training_data = datasets.make_classification(n_features=5)
+            classifier = LGBMClassifier(boosting_type=booster, objective=objective)
+
+        # Train model
+        classifier.fit(training_data[0], training_data[1])
+
+        # Serialise the model to Elasticsearch
+        feature_names = ["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"]
+        model_id = "test_lgbm_classifier"
+
+        es_model = ImportedMLModel(
+            ES_TEST_CLIENT,
+            model_id,
+            classifier,
+            feature_names,
+            overwrite=True,
+            es_compress_model_definition=compress_model_definition,
+        )
+
+        check_prediction_equality(
+            es_model, classifier, random_rows(training_data[0], 20)
+        )
+
+        # Clean up
+        es_model.delete_model()