Add support for xgboost v1

elastic · Apr 29, 2020 · 3d81def · 3d81def
1 parent df2a21f
commit 3d81def
Show file tree

Hide file tree

Showing 10 changed files with 133 additions and 86 deletions.
diff --git a/.ci/test-matrix.yml b/.ci/test-matrix.yml
@@ -3,6 +3,7 @@
 ELASTICSEARCH_VERSION:
   - 8.0.0-SNAPSHOT
   - 7.x-SNAPSHOT
+  - 7.7-SNAPSHOT
   - 7.6-SNAPSHOT
 
 TEST_SUITE:

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
@@ -1,9 +1,2 @@
-elasticsearch==7.7.0a2
-pandas>=1
-matplotlib
-pytest>=5.2.1
+-r ../requirements-dev.txt
 git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master
-numpydoc>=0.9.0
-nbsphinx
-scikit-learn
-xgboost==0.90
diff --git a/eland/ml/_model_serializer.py b/eland/ml/_model_serializer.py
@@ -6,41 +6,41 @@
 import gzip
 import json
 from abc import ABC
-from typing import List
+from typing import List, Dict, Any, Optional
 
 
-def add_if_exists(d: dict, k: str, v) -> dict:
+def add_if_exists(d: Dict[str, Any], k: str, v: Any) -> None:
     if v is not None:
         d[k] = v
-    return d
 
 
 class ModelSerializer(ABC):
     def __init__(
         self,
         feature_names: List[str],
-        target_type: str = None,
-        classification_labels: List[str] = None,
+        target_type: Optional[str] = None,
+        classification_labels: Optional[List[str]] = None,
     ):
         self._target_type = target_type
         self._feature_names = feature_names
         self._classification_labels = classification_labels
 
-    def to_dict(self):
-        d = dict()
+    def to_dict(self) -> Dict[str, Any]:
+        d: Dict[str, Any] = {}
         add_if_exists(d, "target_type", self._target_type)
         add_if_exists(d, "feature_names", self._feature_names)
         add_if_exists(d, "classification_labels", self._classification_labels)
         return d
 
     @property
-    def feature_names(self):
+    def feature_names(self) -> List[str]:
         return self._feature_names
 
+    def serialize_model(self) -> Dict[str, Any]:
+        return {"trained_model": self.to_dict()}
+
     def serialize_and_compress_model(self) -> str:
-        json_string = json.dumps(
-            {"trained_model": self.to_dict()}, separators=(",", ":")
-        )
+        json_string = json.dumps(self.serialize_model(), separators=(",", ":"))
         return base64.b64encode(gzip.compress(json_string.encode("utf-8"))).decode(
             "ascii"
         )
@@ -50,13 +50,13 @@ class TreeNode:
     def __init__(
         self,
         node_idx: int,
-        default_left: bool = None,
-        decision_type: str = None,
-        left_child: int = None,
-        right_child: int = None,
-        split_feature: int = None,
-        threshold: float = None,
-        leaf_value: float = None,
+        default_left: Optional[bool] = None,
+        decision_type: Optional[str] = None,
+        left_child: Optional[int] = None,
+        right_child: Optional[int] = None,
+        split_feature: Optional[int] = None,
+        threshold: Optional[float] = None,
+        leaf_value: Optional[float] = None,
     ):
         self._node_idx = node_idx
         self._decision_type = decision_type
@@ -67,8 +67,8 @@ def __init__(
         self._leaf_value = leaf_value
         self._default_left = default_left
 
-    def to_dict(self):
-        d = dict()
+    def to_dict(self) -> Dict[str, Any]:
+        d: Dict[str, Any] = {}
         add_if_exists(d, "node_index", self._node_idx)
         add_if_exists(d, "decision_type", self._decision_type)
         if self._leaf_value is None:
@@ -85,9 +85,9 @@ class Tree(ModelSerializer):
     def __init__(
         self,
         feature_names: List[str],
-        target_type: str = None,
-        tree_structure: List[TreeNode] = [],
-        classification_labels: List[str] = None,
+        target_type: Optional[str] = None,
+        tree_structure: Optional[List[TreeNode]] = None,
+        classification_labels: Optional[List[str]] = None,
     ):
         super().__init__(
             feature_names=feature_names,
@@ -96,9 +96,9 @@ def __init__(
         )
         if target_type == "regression" and classification_labels:
             raise ValueError("regression does not support classification_labels")
-        self._tree_structure = tree_structure
+        self._tree_structure = tree_structure or []
 
-    def to_dict(self):
+    def to_dict(self) -> Dict[str, Any]:
         d = super().to_dict()
         add_if_exists(d, "tree_structure", [t.to_dict() for t in self._tree_structure])
         return {"tree": d}
@@ -109,10 +109,10 @@ def __init__(
         self,
         feature_names: List[str],
         trained_models: List[ModelSerializer],
-        output_aggregator: dict,
-        target_type: str = None,
-        classification_labels: List[str] = None,
-        classification_weights: List[float] = None,
+        output_aggregator: Dict[str, Any],
+        target_type: Optional[str] = None,
+        classification_labels: Optional[List[str]] = None,
+        classification_weights: Optional[List[float]] = None,
     ):
         super().__init__(
             feature_names=feature_names,
@@ -123,7 +123,7 @@ def __init__(
         self._classification_weights = classification_weights
         self._output_aggregator = output_aggregator
 
-    def to_dict(self):
+    def to_dict(self) -> Dict[str, Any]:
         d = super().to_dict()
         trained_models = None
         if self._trained_models:

diff --git a/eland/ml/_model_transformers.py b/eland/ml/_model_transformers.py
@@ -2,7 +2,7 @@
 # Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
 # See the LICENSE file in the project root for more information
 
-from typing import List, Union
+from typing import List, Union, Optional
 
 import numpy as np
 
@@ -23,8 +23,8 @@ def __init__(
         self,
         model,
         feature_names: List[str],
-        classification_labels: List[str] = None,
-        classification_weights: List[float] = None,
+        classification_labels: Optional[List[str]] = None,
+        classification_weights: Optional[List[float]] = None,
     ):
         self._feature_names = feature_names
         self._model = model
@@ -56,8 +56,8 @@ def __init__(
         self,
         model,
         feature_names: List[str],
-        classification_labels: List[str] = None,
-        classification_weights: List[float] = None,
+        classification_labels: Optional[List[str]] = None,
+        classification_weights: Optional[List[float]] = None,
     ):
         """
         Base class for SKLearn transformations
@@ -120,7 +120,7 @@ def __init__(
         self,
         model: Union[DecisionTreeRegressor, DecisionTreeClassifier],
         feature_names: List[str],
-        classification_labels: List[str] = None,
+        classification_labels: Optional[List[str]] = None,
     ):
         """
         Transforms a Decision Tree model (Regressor|Classifier) into a ES Supported Tree format
@@ -148,7 +148,7 @@ def transform(self) -> Tree:
             check_is_fitted(self._model, ["classes_"])
             if tree_classes is None:
                 tree_classes = [str(c) for c in self._model.classes_]
-        nodes = list()
+        nodes = []
         tree_state = self._model.tree_.__getstate__()
         for i in range(len(tree_state["nodes"])):
             nodes.append(
@@ -169,8 +169,8 @@ def __init__(
         self,
         model: Union[RandomForestClassifier, RandomForestRegressor],
         feature_names: List[str],
-        classification_labels: List[str] = None,
-        classification_weights: List[float] = None,
+        classification_labels: Optional[List[str]] = None,
+        classification_weights: Optional[List[float]] = None,
     ):
         super().__init__(
             model, feature_names, classification_labels, classification_weights
@@ -235,7 +235,7 @@ def __init__(
         self,
         model: RandomForestClassifier,
         feature_names: List[str],
-        classification_labels: List[str] = None,
+        classification_labels: Optional[List[str]] = None,
     ):
         super().__init__(model, feature_names, classification_labels)
 
@@ -259,8 +259,8 @@ def __init__(
         feature_names: List[str],
         base_score: float = 0.5,
         objective: str = "reg:squarederror",
-        classification_labels: List[str] = None,
-        classification_weights: List[float] = None,
+        classification_labels: Optional[List[str]] = None,
+        classification_weights: Optional[List[float]] = None,
     ):
         super().__init__(
             model, feature_names, classification_labels, classification_weights
@@ -330,25 +330,24 @@ def build_forest(self) -> List[Tree]:
 
         :return: A list of Tree objects
         """
-        if self._model.booster not in {"dart", "gbtree"}:
-            raise ValueError("booster must exist and be of type dart or gbtree")
+        self.check_model_booster()
 
         tree_table = self._model.trees_to_dataframe()
-        transformed_trees = list()
+        transformed_trees = []
         curr_tree = None
-        tree_nodes = list()
+        tree_nodes = []
         for _, row in tree_table.iterrows():
             if row["Tree"] != curr_tree:
                 if len(tree_nodes) > 0:
                     transformed_trees.append(self.build_tree(tree_nodes))
                 curr_tree = row["Tree"]
-                tree_nodes = list()
+                tree_nodes = []
             tree_nodes.append(self.build_tree_node(row, curr_tree))
             # add last tree
         if len(tree_nodes) > 0:
             transformed_trees.append(self.build_tree(tree_nodes))
         # We add this stump as XGBoost adds the base_score to the regression outputs
-        if self._objective.startswith("reg"):
+        if self._objective.partition(":")[0] == "reg":
             transformed_trees.append(self.build_base_score_stump())
         return transformed_trees
 
@@ -361,9 +360,16 @@ def determine_target_type(self) -> str:
     def is_objective_supported(self) -> bool:
         return False
 
+    def check_model_booster(self):
+        # xgboost v1 changed the default booster to None, which means 'gbtree'
+        if self._model.booster not in {"dart", "gbtree", None}:
+            raise ValueError(
+                f"booster must exist and be of type 'dart' or "
+                f"'gbtree', was {self._model.booster!r}"
+            )
+
     def transform(self) -> Ensemble:
-        if self._model.booster not in {"dart", "gbtree"}:
-            raise ValueError("booster must exist and be of type dart or gbtree")
+        self.check_model_booster()
 
         if not self.is_objective_supported():
             raise ValueError(f"Unsupported objective '{self._objective}'")
@@ -381,8 +387,12 @@ def transform(self) -> Ensemble:
 
 class XGBoostRegressorTransformer(XGBoostForestTransformer):
     def __init__(self, model: XGBRegressor, feature_names: List[str]):
+        # When base_score is None (unset), use the XGBRegressor default of 0.5.
+        base_score = model.base_score
+        if base_score is None:
+            base_score = 0.5
         super().__init__(
-            model.get_booster(), feature_names, model.base_score, model.objective
+            model.get_booster(), feature_names, base_score, model.objective
         )
 
     def determine_target_type(self) -> str:
@@ -405,7 +415,7 @@ def __init__(
         self,
         model: XGBClassifier,
         feature_names: List[str],
-        classification_labels: List[str] = None,
+        classification_labels: Optional[List[str]] = None,
     ):
         super().__init__(
             model.get_booster(),