Skip to content

Commit

Permalink
Add support for xgboost v1
Browse files Browse the repository at this point in the history
  • Loading branch information
sethmlarson committed Apr 29, 2020
1 parent df2a21f commit 853ad22
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 54 deletions.
1 change: 1 addition & 0 deletions .ci/test-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
ELASTICSEARCH_VERSION:
- 8.0.0-SNAPSHOT
- 7.x-SNAPSHOT
- 7.7-SNAPSHOT
- 7.6-SNAPSHOT

TEST_SUITE:
Expand Down
9 changes: 1 addition & 8 deletions docs/requirements-docs.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,2 @@
elasticsearch==7.7.0a2
pandas>=1
matplotlib
pytest>=5.2.1
-r ../requirements-dev.txt
git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master
numpydoc>=0.9.0
nbsphinx
scikit-learn
xgboost==0.90
60 changes: 30 additions & 30 deletions eland/ml/_model_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,41 +6,41 @@
import gzip
import json
from abc import ABC
from typing import List
from typing import List, Dict, Any, Optional


def add_if_exists(d: dict, k: str, v) -> dict:
def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
if v is not None:
d[k] = v
return d


class ModelSerializer(ABC):
def __init__(
self,
feature_names: List[str],
target_type: str = None,
classification_labels: List[str] = None,
target_type: Optional[str] = None,
classification_labels: Optional[List[str]] = None,
):
self._target_type = target_type
self._feature_names = feature_names
self._classification_labels = classification_labels

def to_dict(self):
d = dict()
def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {}
add_if_exists(d, "target_type", self._target_type)
add_if_exists(d, "feature_names", self._feature_names)
add_if_exists(d, "classification_labels", self._classification_labels)
return d

@property
def feature_names(self):
def feature_names(self) -> List[str]:
return self._feature_names

def serialize_model(self) -> Dict[str, Any]:
return {"trained_model": self.to_dict()}

def serialize_and_compress_model(self) -> str:
json_string = json.dumps(
{"trained_model": self.to_dict()}, separators=(",", ":")
)
json_string = json.dumps(self.serialize_model(), separators=(",", ":"))
return base64.b64encode(gzip.compress(json_string.encode("utf-8"))).decode(
"ascii"
)
Expand All @@ -50,13 +50,13 @@ class TreeNode:
def __init__(
self,
node_idx: int,
default_left: bool = None,
decision_type: str = None,
left_child: int = None,
right_child: int = None,
split_feature: int = None,
threshold: float = None,
leaf_value: float = None,
default_left: Optional[bool] = None,
decision_type: Optional[str] = None,
left_child: Optional[int] = None,
right_child: Optional[int] = None,
split_feature: Optional[int] = None,
threshold: Optional[float] = None,
leaf_value: Optional[float] = None,
):
self._node_idx = node_idx
self._decision_type = decision_type
Expand All @@ -67,8 +67,8 @@ def __init__(
self._leaf_value = leaf_value
self._default_left = default_left

def to_dict(self):
d = dict()
def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {}
add_if_exists(d, "node_index", self._node_idx)
add_if_exists(d, "decision_type", self._decision_type)
if self._leaf_value is None:
Expand All @@ -85,9 +85,9 @@ class Tree(ModelSerializer):
def __init__(
self,
feature_names: List[str],
target_type: str = None,
tree_structure: List[TreeNode] = [],
classification_labels: List[str] = None,
target_type: Optional[str] = None,
tree_structure: Optional[List[TreeNode]] = None,
classification_labels: Optional[List[str]] = None,
):
super().__init__(
feature_names=feature_names,
Expand All @@ -96,9 +96,9 @@ def __init__(
)
if target_type == "regression" and classification_labels:
raise ValueError("regression does not support classification_labels")
self._tree_structure = tree_structure
self._tree_structure = tree_structure or []

def to_dict(self):
def to_dict(self) -> Dict[str, Any]:
d = super().to_dict()
add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
return {"tree": d}
Expand All @@ -109,10 +109,10 @@ def __init__(
self,
feature_names: List[str],
trained_models: List[ModelSerializer],
output_aggregator: dict,
target_type: str = None,
classification_labels: List[str] = None,
classification_weights: List[float] = None,
output_aggregator: Dict[str, Any],
target_type: Optional[str] = None,
classification_labels: Optional[List[str]] = None,
classification_weights: Optional[List[float]] = None,
):
super().__init__(
feature_names=feature_names,
Expand All @@ -123,7 +123,7 @@ def __init__(
self._classification_weights = classification_weights
self._output_aggregator = output_aggregator

def to_dict(self):
def to_dict(self) -> Dict[str, Any]:
d = super().to_dict()
trained_models = None
if self._trained_models:
Expand Down
30 changes: 20 additions & 10 deletions eland/ml/_model_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def transform(self) -> Tree:
check_is_fitted(self._model, ["classes_"])
if tree_classes is None:
tree_classes = [str(c) for c in self._model.classes_]
nodes = list()
nodes = []
tree_state = self._model.tree_.__getstate__()
for i in range(len(tree_state["nodes"])):
nodes.append(
Expand Down Expand Up @@ -330,25 +330,24 @@ def build_forest(self) -> List[Tree]:
:return: A list of Tree objects
"""
if self._model.booster not in {"dart", "gbtree"}:
raise ValueError("booster must exist and be of type dart or gbtree")
self.check_model_booster()

tree_table = self._model.trees_to_dataframe()
transformed_trees = list()
transformed_trees = []
curr_tree = None
tree_nodes = list()
tree_nodes = []
for _, row in tree_table.iterrows():
if row["Tree"] != curr_tree:
if len(tree_nodes) > 0:
transformed_trees.append(self.build_tree(tree_nodes))
curr_tree = row["Tree"]
tree_nodes = list()
tree_nodes = []
tree_nodes.append(self.build_tree_node(row, curr_tree))
# add last tree
if len(tree_nodes) > 0:
transformed_trees.append(self.build_tree(tree_nodes))
# We add this stump as XGBoost adds the base_score to the regression outputs
if self._objective.startswith("reg"):
if self._objective.partition(":")[0] == "reg":
transformed_trees.append(self.build_base_score_stump())
return transformed_trees

Expand All @@ -361,9 +360,16 @@ def determine_target_type(self) -> str:
def is_objective_supported(self) -> bool:
return False

def check_model_booster(self):
# xgboost v1 made booster default to 'None' meaning 'gbtree'
if self._model.booster not in {"dart", "gbtree", None}:
raise ValueError(
f"booster must exist and be of type 'dart' or "
f"'gbtree', was {self._model.booster!r}"
)

def transform(self) -> Ensemble:
if self._model.booster not in {"dart", "gbtree"}:
raise ValueError("booster must exist and be of type dart or gbtree")
self.check_model_booster()

if not self.is_objective_supported():
raise ValueError(f"Unsupported objective '{self._objective}'")
Expand All @@ -381,8 +387,12 @@ def transform(self) -> Ensemble:

class XGBoostRegressorTransformer(XGBoostForestTransformer):
def __init__(self, model: XGBRegressor, feature_names: List[str]):
# XGBRegressor.base_score defaults to 0.5.
base_score = model.base_score
if base_score is None:
base_score = 0.5
super().__init__(
model.get_booster(), feature_names, model.base_score, model.objective
model.get_booster(), feature_names, base_score, model.objective
)

def determine_target_type(self) -> str:
Expand Down
13 changes: 9 additions & 4 deletions eland/ml/imported_ml_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def predict(self, X):
>>> # Get some test results
>>> regressor.predict(np.array(test_data))
array([0.23733574, 1.1897984 ], dtype=float32)
array([0.06062475, 0.9990102 ], dtype=float32)
>>> # Serialise the model to Elasticsearch
>>> feature_names = ["f0", "f1", "f2", "f3", "f4", "f5"]
Expand All @@ -199,7 +199,7 @@ def predict(self, X):
>>> # Get some test results from Elasticsearch model
>>> es_model.predict(test_data)
array([0.2373357, 1.1897984], dtype=float32)
array([0.0606248 , 0.99901026], dtype=float32)
>>> # Delete model from Elasticsearch
>>> es_model.delete_model()
Expand All @@ -216,11 +216,16 @@ def predict(self, X):

else: # single feature vector1
doc = dict()
doc["_source"] = dict(zip(self._feature_names, i))
doc["_source"] = dict(zip(self._feature_names, X))
docs.append(doc)
else:
raise NotImplementedError(f"Prediction for type {type(X)}, not supported")

# field_mappings -> field_map in ES 7.7
field_map_name = (
"field_map" if es_version(self._client) >= (7, 7) else "field_mappings"
)

results = self._client.ingest.simulate(
body={
"pipeline": {
Expand All @@ -229,7 +234,7 @@ def predict(self, X):
"inference": {
"model_id": self._model_id,
"inference_config": {self._model_type: {}},
"field_mappings": {},
field_map_name: {},
}
}
]
Expand Down
29 changes: 28 additions & 1 deletion eland/tests/ml/test_imported_ml_model_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def test_random_forest_regressor(self):
def test_xgb_classifier(self):
# Train model
training_data = datasets.make_classification(n_features=5)
classifier = XGBClassifier()
classifier = XGBClassifier(booster="gbtree")
classifier.fit(training_data[0], training_data[1])

# Get some test results
Expand Down Expand Up @@ -150,9 +150,36 @@ def test_xgb_regressor(self):
es_model = ImportedMLModel(
ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True
)

es_results = es_model.predict(test_data)

np.testing.assert_almost_equal(test_results, es_results, decimal=2)

# Clean up
es_model.delete_model()

def test_predict_single_feature_vector(self):
# Train model
training_data = datasets.make_regression(n_features=1)
regressor = XGBRegressor()
regressor.fit(training_data[0], training_data[1])

# Get some test results
test_data = [[0.1]]
test_results = regressor.predict(np.asarray(test_data))

# Serialise the models to Elasticsearch
feature_names = ["f0"]
model_id = "test_xgb_regressor"

es_model = ImportedMLModel(
ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True
)

# Single feature
es_results = es_model.predict(test_data[0])

np.testing.assert_almost_equal(test_results, es_results, decimal=2)

# Clean up
es_model.delete_model()
3 changes: 3 additions & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"eland/index.py",
"eland/query.py",
"eland/tasks.py",
"eland/ml/_model_serializer.py",
}


Expand All @@ -50,6 +51,8 @@ def lint(session):
# TODO: When all files are typed we can change this to .run("mypy", "--strict", "eland/")
session.log("mypy --strict eland/")
for typed_file in TYPED_FILES:
if not os.path.isfile(typed_file):
session.error(f"The file {typed_file!r} couldn't be found")
popen = subprocess.Popen(
f"mypy --strict {typed_file}",
shell=True,
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ pytest>=5.2.1
nbval
numpydoc>=0.9.0
scikit-learn>=0.22.1
xgboost==0.90
xgboost>=1
nox
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,8 @@
packages=find_packages(include=["eland", "eland.*"]),
install_requires=["elasticsearch==7.7.0a2", "pandas>=1", "matplotlib", "numpy"],
python_requires=">=3.6",
extras_require={
"xgboost": ["xgboost>=0.90,<2"],
"scikit-learn": ["scikit-learn>=0.22.1,<1"],
},
)

0 comments on commit 853ad22

Please sign in to comment.