Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for xgboost v1 #200

Merged
merged 1 commit into from
Apr 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .ci/test-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
ELASTICSEARCH_VERSION:
- 8.0.0-SNAPSHOT
- 7.x-SNAPSHOT
- 7.7-SNAPSHOT
- 7.6-SNAPSHOT

TEST_SUITE:
Expand Down
9 changes: 1 addition & 8 deletions docs/requirements-docs.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,2 @@
elasticsearch==7.7.0a2
pandas>=1
matplotlib
pytest>=5.2.1
-r ../requirements-dev.txt
git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master
numpydoc>=0.9.0
nbsphinx
scikit-learn
xgboost==0.90
60 changes: 30 additions & 30 deletions eland/ml/_model_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,41 +6,41 @@
import gzip
import json
from abc import ABC
from typing import List
from typing import List, Dict, Any, Optional


def add_if_exists(d: dict, k: str, v) -> dict:
def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
if v is not None:
d[k] = v
return d


class ModelSerializer(ABC):
def __init__(
self,
feature_names: List[str],
target_type: str = None,
classification_labels: List[str] = None,
target_type: Optional[str] = None,
classification_labels: Optional[List[str]] = None,
):
self._target_type = target_type
self._feature_names = feature_names
self._classification_labels = classification_labels

def to_dict(self):
d = dict()
def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {}
add_if_exists(d, "target_type", self._target_type)
add_if_exists(d, "feature_names", self._feature_names)
add_if_exists(d, "classification_labels", self._classification_labels)
return d

@property
def feature_names(self):
def feature_names(self) -> List[str]:
return self._feature_names

def serialize_model(self) -> Dict[str, Any]:
# Wrap this serializer's to_dict() output under the single top-level
# "trained_model" key. serialize_and_compress_model() JSON-encodes this
# same structure before gzip-compressing and base64-encoding it.
return {"trained_model": self.to_dict()}

def serialize_and_compress_model(self) -> str:
json_string = json.dumps(
{"trained_model": self.to_dict()}, separators=(",", ":")
)
json_string = json.dumps(self.serialize_model(), separators=(",", ":"))
return base64.b64encode(gzip.compress(json_string.encode("utf-8"))).decode(
"ascii"
)
Expand All @@ -50,13 +50,13 @@ class TreeNode:
def __init__(
self,
node_idx: int,
default_left: bool = None,
decision_type: str = None,
left_child: int = None,
right_child: int = None,
split_feature: int = None,
threshold: float = None,
leaf_value: float = None,
default_left: Optional[bool] = None,
decision_type: Optional[str] = None,
left_child: Optional[int] = None,
right_child: Optional[int] = None,
split_feature: Optional[int] = None,
threshold: Optional[float] = None,
leaf_value: Optional[float] = None,
):
self._node_idx = node_idx
self._decision_type = decision_type
Expand All @@ -67,8 +67,8 @@ def __init__(
self._leaf_value = leaf_value
self._default_left = default_left

def to_dict(self):
d = dict()
def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {}
add_if_exists(d, "node_index", self._node_idx)
add_if_exists(d, "decision_type", self._decision_type)
if self._leaf_value is None:
Expand All @@ -85,9 +85,9 @@ class Tree(ModelSerializer):
def __init__(
self,
feature_names: List[str],
target_type: str = None,
tree_structure: List[TreeNode] = [],
classification_labels: List[str] = None,
target_type: Optional[str] = None,
tree_structure: Optional[List[TreeNode]] = None,
classification_labels: Optional[List[str]] = None,
):
super().__init__(
feature_names=feature_names,
Expand All @@ -96,9 +96,9 @@ def __init__(
)
if target_type == "regression" and classification_labels:
raise ValueError("regression does not support classification_labels")
self._tree_structure = tree_structure
self._tree_structure = tree_structure or []

def to_dict(self):
def to_dict(self) -> Dict[str, Any]:
d = super().to_dict()
add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
return {"tree": d}
Expand All @@ -109,10 +109,10 @@ def __init__(
self,
feature_names: List[str],
trained_models: List[ModelSerializer],
output_aggregator: dict,
target_type: str = None,
classification_labels: List[str] = None,
classification_weights: List[float] = None,
output_aggregator: Dict[str, Any],
target_type: Optional[str] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
super().__init__(
feature_names=feature_names,
Expand All @@ -123,7 +123,7 @@ def __init__(
self._classification_weights = classification_weights
self._output_aggregator = output_aggregator

def to_dict(self):
def to_dict(self) -> Dict[str, Any]:
d = super().to_dict()
trained_models = None
if self._trained_models:
Expand Down
54 changes: 32 additions & 22 deletions eland/ml/_model_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
# See the LICENSE file in the project root for more information

from typing import List, Union
from typing import List, Union, Optional

import numpy as np

Expand All @@ -23,8 +23,8 @@ def __init__(
self,
model,
feature_names: List[str],
classification_labels: List[str] = None,
classification_weights: List[float] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
self._feature_names = feature_names
self._model = model
Expand Down Expand Up @@ -56,8 +56,8 @@ def __init__(
self,
model,
feature_names: List[str],
classification_labels: List[str] = None,
classification_weights: List[float] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
"""
Base class for SKLearn transformations
Expand Down Expand Up @@ -120,7 +120,7 @@ def __init__(
self,
model: Union[DecisionTreeRegressor, DecisionTreeClassifier],
feature_names: List[str],
classification_labels: List[str] = None,
classification_labels: Optional[List[str]] = None,
):
"""
Transforms a Decision Tree model (Regressor|Classifier) into a ES Supported Tree format
Expand Down Expand Up @@ -148,7 +148,7 @@ def transform(self) -> Tree:
check_is_fitted(self._model, ["classes_"])
if tree_classes is None:
tree_classes = [str(c) for c in self._model.classes_]
nodes = list()
nodes = []
tree_state = self._model.tree_.__getstate__()
for i in range(len(tree_state["nodes"])):
nodes.append(
Expand All @@ -169,8 +169,8 @@ def __init__(
self,
model: Union[RandomForestClassifier, RandomForestRegressor],
feature_names: List[str],
classification_labels: List[str] = None,
classification_weights: List[float] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
super().__init__(
model, feature_names, classification_labels, classification_weights
Expand Down Expand Up @@ -235,7 +235,7 @@ def __init__(
self,
model: RandomForestClassifier,
feature_names: List[str],
classification_labels: List[str] = None,
classification_labels: Optional[List[str]] = None,
):
super().__init__(model, feature_names, classification_labels)

Expand All @@ -259,8 +259,8 @@ def __init__(
feature_names: List[str],
base_score: float = 0.5,
objective: str = "reg:squarederror",
classification_labels: List[str] = None,
classification_weights: List[float] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
super().__init__(
model, feature_names, classification_labels, classification_weights
Expand Down Expand Up @@ -330,25 +330,24 @@ def build_forest(self) -> List[Tree]:

:return: A list of Tree objects
"""
if self._model.booster not in {"dart", "gbtree"}:
raise ValueError("booster must exist and be of type dart or gbtree")
self.check_model_booster()

tree_table = self._model.trees_to_dataframe()
transformed_trees = list()
transformed_trees = []
curr_tree = None
tree_nodes = list()
tree_nodes = []
for _, row in tree_table.iterrows():
if row["Tree"] != curr_tree:
if len(tree_nodes) > 0:
transformed_trees.append(self.build_tree(tree_nodes))
curr_tree = row["Tree"]
tree_nodes = list()
tree_nodes = []
tree_nodes.append(self.build_tree_node(row, curr_tree))
# add last tree
if len(tree_nodes) > 0:
transformed_trees.append(self.build_tree(tree_nodes))
# We add this stump as XGBoost adds the base_score to the regression outputs
if self._objective.startswith("reg"):
if self._objective.partition(":")[0] == "reg":
transformed_trees.append(self.build_base_score_stump())
return transformed_trees

Expand All @@ -361,9 +360,16 @@ def determine_target_type(self) -> str:
def is_objective_supported(self) -> bool:
# This implementation reports that no objective is supported.
# transform() raises ValueError("Unsupported objective ...") whenever
# this returns False, so concrete transformers presumably override it
# (the overrides are not visible in this view — confirm).
return False

def check_model_booster(self):
# Validate the wrapped model's booster type; called at the start of
# both build_forest() and transform().
# Raises ValueError unless booster is 'dart', 'gbtree' or None.
# xgboost v1 made booster default to 'None' meaning 'gbtree'
if self._model.booster not in {"dart", "gbtree", None}:
# NOTE(review): the message says the booster "must exist", yet None is
# accepted by the check above — consider rewording the message.
raise ValueError(
f"booster must exist and be of type 'dart' or "
f"'gbtree', was {self._model.booster!r}"
)

def transform(self) -> Ensemble:
if self._model.booster not in {"dart", "gbtree"}:
raise ValueError("booster must exist and be of type dart or gbtree")
self.check_model_booster()

if not self.is_objective_supported():
raise ValueError(f"Unsupported objective '{self._objective}'")
Expand All @@ -381,8 +387,12 @@ def transform(self) -> Ensemble:

class XGBoostRegressorTransformer(XGBoostForestTransformer):
def __init__(self, model: XGBRegressor, feature_names: List[str]):
# XGBRegressor.base_score may be None when it was not set explicitly; fall back to 0.5 in that case.
base_score = model.base_score
if base_score is None:
base_score = 0.5
super().__init__(
model.get_booster(), feature_names, model.base_score, model.objective
model.get_booster(), feature_names, base_score, model.objective
)

def determine_target_type(self) -> str:
Expand All @@ -405,7 +415,7 @@ def __init__(
self,
model: XGBClassifier,
feature_names: List[str],
classification_labels: List[str] = None,
classification_labels: Optional[List[str]] = None,
):
super().__init__(
model.get_booster(),
Expand Down
Loading