From 853ad22cd40d56fe7fcb3948c4a75c6ac669211b Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Wed, 29 Apr 2020 09:41:44 -0500 Subject: [PATCH] Add support for xgboost v1 --- .ci/test-matrix.yml | 1 + docs/requirements-docs.txt | 9 +-- eland/ml/_model_serializer.py | 60 +++++++++---------- eland/ml/_model_transformers.py | 30 ++++++---- eland/ml/imported_ml_model.py | 13 ++-- .../tests/ml/test_imported_ml_model_pytest.py | 29 ++++++++- noxfile.py | 3 + requirements-dev.txt | 2 +- setup.py | 4 ++ 9 files changed, 97 insertions(+), 54 deletions(-) diff --git a/.ci/test-matrix.yml b/.ci/test-matrix.yml index b50a16cc..de241cfd 100755 --- a/.ci/test-matrix.yml +++ b/.ci/test-matrix.yml @@ -3,6 +3,7 @@ ELASTICSEARCH_VERSION: - 8.0.0-SNAPSHOT - 7.x-SNAPSHOT + - 7.7-SNAPSHOT - 7.6-SNAPSHOT TEST_SUITE: diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 22dd2600..605befe9 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,9 +1,2 @@ -elasticsearch==7.7.0a2 -pandas>=1 -matplotlib -pytest>=5.2.1 +-r ../requirements-dev.txt git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master -numpydoc>=0.9.0 -nbsphinx -scikit-learn -xgboost==0.90 diff --git a/eland/ml/_model_serializer.py b/eland/ml/_model_serializer.py index 3997efa8..94ea694e 100644 --- a/eland/ml/_model_serializer.py +++ b/eland/ml/_model_serializer.py @@ -6,41 +6,41 @@ import gzip import json from abc import ABC -from typing import List +from typing import List, Dict, Any, Optional -def add_if_exists(d: dict, k: str, v) -> dict: +def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None: if v is not None: d[k] = v - return d class ModelSerializer(ABC): def __init__( self, feature_names: List[str], - target_type: str = None, - classification_labels: List[str] = None, + target_type: Optional[str] = None, + classification_labels: Optional[List[str]] = None, ): self._target_type = target_type self._feature_names = feature_names self._classification_labels = classification_labels - def to_dict(self): - d = dict() + def to_dict(self) -> Dict[str, Any]: + d: Dict[str, Any] = {} add_if_exists(d, "target_type", self._target_type) add_if_exists(d, "feature_names", self._feature_names) add_if_exists(d, "classification_labels", self._classification_labels) return d @property - def feature_names(self): + def feature_names(self) -> List[str]: return self._feature_names + def serialize_model(self) -> Dict[str, Any]: + return {"trained_model": self.to_dict()} + def serialize_and_compress_model(self) -> str: - json_string = json.dumps( - {"trained_model": self.to_dict()}, separators=(",", ":") - ) + json_string = json.dumps(self.serialize_model(), separators=(",", ":")) return base64.b64encode(gzip.compress(json_string.encode("utf-8"))).decode( "ascii" ) @@ -50,13 +50,13 @@ class TreeNode: def __init__( self, node_idx: int, - default_left: bool = None, - decision_type: str = None, - left_child: int = None, - right_child: int = None, - split_feature: int = None, - threshold: float = None, - leaf_value: float = None, + default_left: Optional[bool] = None, + decision_type: Optional[str] = None, + left_child: Optional[int] = None, + right_child: Optional[int] = None, + split_feature: Optional[int] = None, + threshold: Optional[float] = None, + leaf_value: Optional[float] = None, ): self._node_idx = node_idx self._decision_type = decision_type @@ -67,8 +67,8 @@ def __init__( self._leaf_value = leaf_value self._default_left = default_left - def to_dict(self): - d = dict() + def to_dict(self) -> Dict[str, Any]: + d: Dict[str, Any] = {} add_if_exists(d, "node_index", self._node_idx) add_if_exists(d, "decision_type", self._decision_type) if self._leaf_value is None: @@ -85,9 +85,9 @@ class Tree(ModelSerializer): def __init__( self, feature_names: List[str], - target_type: str = None, - tree_structure: List[TreeNode] = [], - classification_labels: List[str] = None, + target_type: Optional[str] = None, + tree_structure: Optional[List[TreeNode]] = None, + classification_labels: Optional[List[str]] = None, ): super().__init__( feature_names=feature_names, @@ -96,9 +96,9 @@ def __init__( ) if target_type == "regression" and classification_labels: raise ValueError("regression does not support classification_labels") - self._tree_structure = tree_structure + self._tree_structure = tree_structure or [] - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: d = super().to_dict() add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure]) return {"tree": d} @@ -109,10 +109,10 @@ def __init__( self, feature_names: List[str], trained_models: List[ModelSerializer], - output_aggregator: dict, - target_type: str = None, - classification_labels: List[str] = None, - classification_weights: List[float] = None, + output_aggregator: Dict[str, Any], + target_type: Optional[str] = None, + classification_labels: Optional[List[str]] = None, + classification_weights: Optional[List[float]] = None, ): super().__init__( feature_names=feature_names, @@ -123,7 +123,7 @@ def __init__( self._classification_weights = classification_weights self._output_aggregator = output_aggregator - def to_dict(self): + def to_dict(self) -> Dict[str, Any]: d = super().to_dict() trained_models = None if self._trained_models: diff --git a/eland/ml/_model_transformers.py b/eland/ml/_model_transformers.py index 2ec30bc4..12b99453 100644 --- a/eland/ml/_model_transformers.py +++ b/eland/ml/_model_transformers.py @@ -148,7 +148,7 @@ def transform(self) -> Tree: check_is_fitted(self._model, ["classes_"]) if tree_classes is None: tree_classes = [str(c) for c in self._model.classes_] - nodes = list() + nodes = [] tree_state = self._model.tree_.__getstate__() for i in range(len(tree_state["nodes"])): nodes.append( @@ -330,25 +330,24 @@ def build_forest(self) -> List[Tree]: :return: A list of Tree objects """ - if self._model.booster not in {"dart", "gbtree"}: - raise ValueError("booster must exist and be of type dart or gbtree") + self.check_model_booster() tree_table = self._model.trees_to_dataframe() - transformed_trees = list() + transformed_trees = [] curr_tree = None - tree_nodes = list() + tree_nodes = [] for _, row in tree_table.iterrows(): if row["Tree"] != curr_tree: if len(tree_nodes) > 0: transformed_trees.append(self.build_tree(tree_nodes)) curr_tree = row["Tree"] - tree_nodes = list() + tree_nodes = [] tree_nodes.append(self.build_tree_node(row, curr_tree)) # add last tree if len(tree_nodes) > 0: transformed_trees.append(self.build_tree(tree_nodes)) # We add this stump as XGBoost adds the base_score to the regression outputs - if self._objective.startswith("reg"): + if self._objective.partition(":")[0] == "reg": transformed_trees.append(self.build_base_score_stump()) return transformed_trees @@ -361,9 +360,16 @@ def determine_target_type(self) -> str: def is_objective_supported(self) -> bool: return False + def check_model_booster(self): + # xgboost v1 made booster default to 'None' meaning 'gbtree' + if self._model.booster not in {"dart", "gbtree", None}: + raise ValueError( + f"booster must exist and be of type 'dart' or " + f"'gbtree', was {self._model.booster!r}" + ) + def transform(self) -> Ensemble: - if self._model.booster not in {"dart", "gbtree"}: - raise ValueError("booster must exist and be of type dart or gbtree") + self.check_model_booster() if not self.is_objective_supported(): raise ValueError(f"Unsupported objective '{self._objective}'") @@ -381,8 +387,12 @@ def transform(self) -> Ensemble: class XGBoostRegressorTransformer(XGBoostForestTransformer): def __init__(self, model: XGBRegressor, feature_names: List[str]): + # XGBRegressor.base_score defaults to 0.5. + base_score = model.base_score + if base_score is None: + base_score = 0.5 super().__init__( - model.get_booster(), feature_names, model.base_score, model.objective + model.get_booster(), feature_names, base_score, model.objective ) def determine_target_type(self) -> str: diff --git a/eland/ml/imported_ml_model.py b/eland/ml/imported_ml_model.py index e76e5b12..dc95eda7 100644 --- a/eland/ml/imported_ml_model.py +++ b/eland/ml/imported_ml_model.py @@ -190,7 +190,7 @@ def predict(self, X): >>> # Get some test results >>> regressor.predict(np.array(test_data)) - array([0.23733574, 1.1897984 ], dtype=float32) + array([0.06062475, 0.9990102 ], dtype=float32) >>> # Serialise the model to Elasticsearch >>> feature_names = ["f0", "f1", "f2", "f3", "f4", "f5"] @@ -199,7 +199,7 @@ def predict(self, X): >>> # Get some test results from Elasticsearch model >>> es_model.predict(test_data) - array([0.2373357, 1.1897984], dtype=float32) + array([0.0606248 , 0.99901026], dtype=float32) >>> # Delete model from Elasticsearch >>> es_model.delete_model() @@ -216,11 +216,16 @@ def predict(self, X): else: # single feature vector1 doc = dict() - doc["_source"] = dict(zip(self._feature_names, i)) + doc["_source"] = dict(zip(self._feature_names, X)) docs.append(doc) else: raise NotImplementedError(f"Prediction for type {type(X)}, not supported") + # field_mappings -> field_map in ES 7.7 + field_map_name = ( + "field_map" if es_version(self._client) >= (7, 7) else "field_mappings" + ) + results = self._client.ingest.simulate( body={ "pipeline": { @@ -229,7 +234,7 @@ def predict(self, X): "inference": { "model_id": self._model_id, "inference_config": {self._model_type: {}}, - "field_mappings": {}, + field_map_name: {}, } } ] diff --git a/eland/tests/ml/test_imported_ml_model_pytest.py b/eland/tests/ml/test_imported_ml_model_pytest.py index 746891db..62acbc5b 100644 --- a/eland/tests/ml/test_imported_ml_model_pytest.py +++ b/eland/tests/ml/test_imported_ml_model_pytest.py @@ -112,7 +112,7 @@ def test_random_forest_regressor(self): def test_xgb_classifier(self): # Train model training_data = datasets.make_classification(n_features=5) - classifier = XGBClassifier() + classifier = XGBClassifier(booster="gbtree") classifier.fit(training_data[0], training_data[1]) # Get some test results @@ -150,9 +150,36 @@ def test_xgb_regressor(self): es_model = ImportedMLModel( ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True ) + es_results = es_model.predict(test_data) np.testing.assert_almost_equal(test_results, es_results, decimal=2) # Clean up es_model.delete_model() + + def test_predict_single_feature_vector(self): + # Train model + training_data = datasets.make_regression(n_features=1) + regressor = XGBRegressor() + regressor.fit(training_data[0], training_data[1]) + + # Get some test results + test_data = [[0.1]] + test_results = regressor.predict(np.asarray(test_data)) + + # Serialise the models to Elasticsearch + feature_names = ["f0"] + model_id = "test_xgb_regressor" + + es_model = ImportedMLModel( + ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True + ) + + # Single feature + es_results = es_model.predict(test_data[0]) + + np.testing.assert_almost_equal(test_results, es_results, decimal=2) + + # Clean up + es_model.delete_model() diff --git a/noxfile.py b/noxfile.py index 200d6478..4c039c4f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -29,6 +29,7 @@ "eland/index.py", "eland/query.py", "eland/tasks.py", + "eland/ml/_model_serializer.py", } @@ -50,6 +51,8 @@ def lint(session): # TODO: When all files are typed we can change this to .run("mypy", "--strict", "eland/") session.log("mypy --strict eland/") for typed_file in TYPED_FILES: + if not os.path.isfile(typed_file): + session.error(f"The file {typed_file!r} couldn't be found") popen = subprocess.Popen( f"mypy --strict {typed_file}", shell=True, diff --git a/requirements-dev.txt b/requirements-dev.txt index 0d246ab4..fb45e77e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,5 +5,5 @@ pytest>=5.2.1 nbval numpydoc>=0.9.0 scikit-learn>=0.22.1 -xgboost==0.90 +xgboost>=1 nox diff --git a/setup.py b/setup.py index 85d364a5..368fba8e 100644 --- a/setup.py +++ b/setup.py @@ -177,4 +177,8 @@ packages=find_packages(include=["eland", "eland.*"]), install_requires=["elasticsearch==7.7.0a2", "pandas>=1", "matplotlib", "numpy"], python_requires=">=3.6", + extras_require={ + "xgboost": ["xgboost>=0.90,<2"], + "scikit-learn": ["scikit-learn>=0.22.1,<1"], + }, )