Skip to content

Commit

Permalink
Add support for xgboost v1
Browse files Browse the repository at this point in the history
  • Loading branch information
sethmlarson authored Apr 29, 2020
1 parent df2a21f commit 3d81def
Show file tree
Hide file tree
Showing 10 changed files with 133 additions and 86 deletions.
1 change: 1 addition & 0 deletions .ci/test-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
ELASTICSEARCH_VERSION:
- 8.0.0-SNAPSHOT
- 7.x-SNAPSHOT
- 7.7-SNAPSHOT
- 7.6-SNAPSHOT

TEST_SUITE:
Expand Down
9 changes: 1 addition & 8 deletions docs/requirements-docs.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,2 @@
elasticsearch==7.7.0a2
pandas>=1
matplotlib
pytest>=5.2.1
-r ../requirements-dev.txt
git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master
numpydoc>=0.9.0
nbsphinx
scikit-learn
xgboost==0.90
60 changes: 30 additions & 30 deletions eland/ml/_model_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,41 +6,41 @@
import gzip
import json
from abc import ABC
from typing import List
from typing import List, Dict, Any, Optional


def add_if_exists(d: dict, k: str, v) -> dict:
def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
if v is not None:
d[k] = v
return d


class ModelSerializer(ABC):
def __init__(
self,
feature_names: List[str],
target_type: str = None,
classification_labels: List[str] = None,
target_type: Optional[str] = None,
classification_labels: Optional[List[str]] = None,
):
self._target_type = target_type
self._feature_names = feature_names
self._classification_labels = classification_labels

def to_dict(self):
d = dict()
def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {}
add_if_exists(d, "target_type", self._target_type)
add_if_exists(d, "feature_names", self._feature_names)
add_if_exists(d, "classification_labels", self._classification_labels)
return d

@property
def feature_names(self):
def feature_names(self) -> List[str]:
return self._feature_names

def serialize_model(self) -> Dict[str, Any]:
return {"trained_model": self.to_dict()}

def serialize_and_compress_model(self) -> str:
json_string = json.dumps(
{"trained_model": self.to_dict()}, separators=(",", ":")
)
json_string = json.dumps(self.serialize_model(), separators=(",", ":"))
return base64.b64encode(gzip.compress(json_string.encode("utf-8"))).decode(
"ascii"
)
Expand All @@ -50,13 +50,13 @@ class TreeNode:
def __init__(
self,
node_idx: int,
default_left: bool = None,
decision_type: str = None,
left_child: int = None,
right_child: int = None,
split_feature: int = None,
threshold: float = None,
leaf_value: float = None,
default_left: Optional[bool] = None,
decision_type: Optional[str] = None,
left_child: Optional[int] = None,
right_child: Optional[int] = None,
split_feature: Optional[int] = None,
threshold: Optional[float] = None,
leaf_value: Optional[float] = None,
):
self._node_idx = node_idx
self._decision_type = decision_type
Expand All @@ -67,8 +67,8 @@ def __init__(
self._leaf_value = leaf_value
self._default_left = default_left

def to_dict(self):
d = dict()
def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {}
add_if_exists(d, "node_index", self._node_idx)
add_if_exists(d, "decision_type", self._decision_type)
if self._leaf_value is None:
Expand All @@ -85,9 +85,9 @@ class Tree(ModelSerializer):
def __init__(
self,
feature_names: List[str],
target_type: str = None,
tree_structure: List[TreeNode] = [],
classification_labels: List[str] = None,
target_type: Optional[str] = None,
tree_structure: Optional[List[TreeNode]] = None,
classification_labels: Optional[List[str]] = None,
):
super().__init__(
feature_names=feature_names,
Expand All @@ -96,9 +96,9 @@ def __init__(
)
if target_type == "regression" and classification_labels:
raise ValueError("regression does not support classification_labels")
self._tree_structure = tree_structure
self._tree_structure = tree_structure or []

def to_dict(self):
def to_dict(self) -> Dict[str, Any]:
d = super().to_dict()
add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
return {"tree": d}
Expand All @@ -109,10 +109,10 @@ def __init__(
self,
feature_names: List[str],
trained_models: List[ModelSerializer],
output_aggregator: dict,
target_type: str = None,
classification_labels: List[str] = None,
classification_weights: List[float] = None,
output_aggregator: Dict[str, Any],
target_type: Optional[str] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
super().__init__(
feature_names=feature_names,
Expand All @@ -123,7 +123,7 @@ def __init__(
self._classification_weights = classification_weights
self._output_aggregator = output_aggregator

def to_dict(self):
def to_dict(self) -> Dict[str, Any]:
d = super().to_dict()
trained_models = None
if self._trained_models:
Expand Down
54 changes: 32 additions & 22 deletions eland/ml/_model_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
# See the LICENSE file in the project root for more information

from typing import List, Union
from typing import List, Union, Optional

import numpy as np

Expand All @@ -23,8 +23,8 @@ def __init__(
self,
model,
feature_names: List[str],
classification_labels: List[str] = None,
classification_weights: List[float] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
self._feature_names = feature_names
self._model = model
Expand Down Expand Up @@ -56,8 +56,8 @@ def __init__(
self,
model,
feature_names: List[str],
classification_labels: List[str] = None,
classification_weights: List[float] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
"""
Base class for SKLearn transformations
Expand Down Expand Up @@ -120,7 +120,7 @@ def __init__(
self,
model: Union[DecisionTreeRegressor, DecisionTreeClassifier],
feature_names: List[str],
classification_labels: List[str] = None,
classification_labels: Optional[List[str]] = None,
):
"""
Transforms a Decision Tree model (Regressor|Classifier) into a ES Supported Tree format
Expand Down Expand Up @@ -148,7 +148,7 @@ def transform(self) -> Tree:
check_is_fitted(self._model, ["classes_"])
if tree_classes is None:
tree_classes = [str(c) for c in self._model.classes_]
nodes = list()
nodes = []
tree_state = self._model.tree_.__getstate__()
for i in range(len(tree_state["nodes"])):
nodes.append(
Expand All @@ -169,8 +169,8 @@ def __init__(
self,
model: Union[RandomForestClassifier, RandomForestRegressor],
feature_names: List[str],
classification_labels: List[str] = None,
classification_weights: List[float] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
super().__init__(
model, feature_names, classification_labels, classification_weights
Expand Down Expand Up @@ -235,7 +235,7 @@ def __init__(
self,
model: RandomForestClassifier,
feature_names: List[str],
classification_labels: List[str] = None,
classification_labels: Optional[List[str]] = None,
):
super().__init__(model, feature_names, classification_labels)

Expand All @@ -259,8 +259,8 @@ def __init__(
feature_names: List[str],
base_score: float = 0.5,
objective: str = "reg:squarederror",
classification_labels: List[str] = None,
classification_weights: List[float] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
super().__init__(
model, feature_names, classification_labels, classification_weights
Expand Down Expand Up @@ -330,25 +330,24 @@ def build_forest(self) -> List[Tree]:
:return: A list of Tree objects
"""
if self._model.booster not in {"dart", "gbtree"}:
raise ValueError("booster must exist and be of type dart or gbtree")
self.check_model_booster()

tree_table = self._model.trees_to_dataframe()
transformed_trees = list()
transformed_trees = []
curr_tree = None
tree_nodes = list()
tree_nodes = []
for _, row in tree_table.iterrows():
if row["Tree"] != curr_tree:
if len(tree_nodes) > 0:
transformed_trees.append(self.build_tree(tree_nodes))
curr_tree = row["Tree"]
tree_nodes = list()
tree_nodes = []
tree_nodes.append(self.build_tree_node(row, curr_tree))
# add last tree
if len(tree_nodes) > 0:
transformed_trees.append(self.build_tree(tree_nodes))
# We add this stump as XGBoost adds the base_score to the regression outputs
if self._objective.startswith("reg"):
if self._objective.partition(":")[0] == "reg":
transformed_trees.append(self.build_base_score_stump())
return transformed_trees

Expand All @@ -361,9 +360,16 @@ def determine_target_type(self) -> str:
def is_objective_supported(self) -> bool:
return False

def check_model_booster(self):
# xgboost v1 made booster default to 'None' meaning 'gbtree'
if self._model.booster not in {"dart", "gbtree", None}:
raise ValueError(
f"booster must exist and be of type 'dart' or "
f"'gbtree', was {self._model.booster!r}"
)

def transform(self) -> Ensemble:
if self._model.booster not in {"dart", "gbtree"}:
raise ValueError("booster must exist and be of type dart or gbtree")
self.check_model_booster()

if not self.is_objective_supported():
raise ValueError(f"Unsupported objective '{self._objective}'")
Expand All @@ -381,8 +387,12 @@ def transform(self) -> Ensemble:

class XGBoostRegressorTransformer(XGBoostForestTransformer):
def __init__(self, model: XGBRegressor, feature_names: List[str]):
# XGBRegressor.base_score defaults to 0.5.
base_score = model.base_score
if base_score is None:
base_score = 0.5
super().__init__(
model.get_booster(), feature_names, model.base_score, model.objective
model.get_booster(), feature_names, base_score, model.objective
)

def determine_target_type(self) -> str:
Expand All @@ -405,7 +415,7 @@ def __init__(
self,
model: XGBClassifier,
feature_names: List[str],
classification_labels: List[str] = None,
classification_labels: Optional[List[str]] = None,
):
super().__init__(
model.get_booster(),
Expand Down
Loading

0 comments on commit 3d81def

Please sign in to comment.