From f5ff90cd87c0ba77dec21ea528f1b5683b1b8ff0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 8 Jan 2021 10:01:16 +0800
Subject: [PATCH] Support `_estimator_type`. (#6582)

* Use `_estimator_type`.

  For more info, see:
  https://scikit-learn.org/stable/developers/develop.html#estimator-types

* Model trained from dask can be loaded by single node skl interface.
---
 python-package/xgboost/sklearn.py | 29 +++++++++++++++++++------
 tests/python/test_with_dask.py    | 36 +++++++++++++++++++++++++++++++
 tests/python/test_with_sklearn.py | 23 ++++++++++++++++++++
 3 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 7359ec1242f6..71f593b719fb 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -16,6 +16,12 @@
                      XGBClassifierBase, XGBRegressorBase, XGBoostLabelEncoder)
 
 
+class XGBRankerMixIn:  # pylint: disable=too-few-public-methods
+    """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn base
+    classes."""
+    _estimator_type = "ranker"
+
+
 def _objective_decorator(func):
     """Decorate an objective function
@@ -407,6 +413,14 @@ def get_num_boosting_rounds(self):
         """Gets the number of xgboost boosting rounds."""
         return self.n_estimators
 
+    def _get_type(self) -> str:
+        if not hasattr(self, '_estimator_type'):
+            raise TypeError(
+                "`_estimator_type` undefined. "
+                "Please use appropriate mixin to define estimator type."
+            )
+        return self._estimator_type  # pylint: disable=no-member
+
     def save_model(self, fname: str):
         """Save the model to a file.
@@ -442,7 +456,7 @@ def save_model(self, fname: str):
                     meta[k] = v
                 except TypeError:
                     warnings.warn(str(k) + ' is not saved in Scikit-Learn meta.')
-        meta['type'] = type(self).__name__
+        meta['_estimator_type'] = self._get_type()
         meta_str = json.dumps(meta)
         self.get_booster().set_attr(scikit_learn=meta_str)
         self.get_booster().save_model(fname)
@@ -484,11 +498,12 @@ def load_model(self, fname):
             if k == 'use_label_encoder':
                 self.use_label_encoder = bool(v)
                 continue
-            if k == 'type' and type(self).__name__ != v:
-                msg = 'Current model type: {}, '.format(type(self).__name__) + \
-                      'type of model in file: {}'.format(v)
-                raise TypeError(msg)
-            if k == 'type':
+            if k == "_estimator_type":
+                if self._get_type() != v:
+                    raise TypeError(
+                        "Loading an estimator with different type. "
+                        f"Expecting: {self._get_type()}, got: {v}"
+                    )
                 continue
             states[k] = v
         self.__dict__.update(states)
@@ -1211,7 +1226,7 @@ def get_num_boosting_rounds(self):
         then your group array should be ``[3, 4]``.
 ''')
-class XGBRanker(XGBModel):
+class XGBRanker(XGBModel, XGBRankerMixIn):
     # pylint: disable=missing-docstring,too-many-arguments,invalid-name
     @_deprecate_positional_args
     def __init__(self, *, objective='rank:pairwise', **kwargs):
diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py
index d3c2f988a3c3..180a6440024a 100644
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -10,6 +10,7 @@
 import asyncio
 import tempfile
 from sklearn.datasets import make_classification
+import sklearn
 import os
 import subprocess
 from hypothesis import given, settings, note
@@ -261,6 +262,9 @@ def test_dask_regressor() -> None:
         with Client(cluster) as client:
             X, y, w = generate_array(with_weights=True)
             regressor = xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
+            assert regressor._estimator_type == "regressor"
+            assert sklearn.base.is_regressor(regressor)
+
             regressor.set_params(tree_method='hist')
             regressor.client = client
             regressor.fit(X, y, sample_weight=w, eval_set=[(X, y)])
@@ -285,6 +289,9 @@ def test_dask_classifier() -> None:
             y = (y * 10).astype(np.int32)
             classifier = xgb.dask.DaskXGBClassifier(
                 verbosity=1, n_estimators=2, eval_metric='merror')
+            assert classifier._estimator_type == "classifier"
+            assert sklearn.base.is_classifier(classifier)
+
             classifier.client = client
             classifier.fit(X, y, sample_weight=w, eval_set=[(X, y)])
             prediction = classifier.predict(X)
@@ -946,6 +953,35 @@ def worker_fn(worker_addr: str, data_ref: Dict) -> None:
 
         # Subtract the on disk resource from each worker
         assert cnt - n_workers == n_partitions
 
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_sklearn_io(self, client: 'Client') -> None:
+        from sklearn.datasets import load_digits
+        X_, y_ = load_digits(return_X_y=True)
+        X, y = da.from_array(X_), da.from_array(y_)
+        cls = xgb.dask.DaskXGBClassifier(n_estimators=10)
+        cls.client = client
+        cls.fit(X, y)
+        predt_0 = cls.predict(X)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = os.path.join(tmpdir, 'cls.json')
+            cls.save_model(path)
+
+            cls = xgb.dask.DaskXGBClassifier()
+            cls.load_model(path)
+            assert cls.n_classes_ == 10
+            predt_1 = cls.predict(X)
+
+            np.testing.assert_allclose(predt_0.compute(), predt_1.compute())
+
+            # Use single node to load
+            cls = xgb.XGBClassifier()
+            cls.load_model(path)
+            assert cls.n_classes_ == 10
+            predt_2 = cls.predict(X_)
+
+            np.testing.assert_allclose(predt_0.compute(), predt_2)
+
 
 class TestDaskCallbacks:
     @pytest.mark.skipif(**tm.no_sklearn())
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 5d105b5a0c48..99a1a5702f7d 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1099,3 +1099,26 @@ def test_boost_from_prediction_approx():
 @pytest.mark.skipif(**tm.no_sklearn())
 def test_boost_from_prediction_exact():
     run_boost_from_prediction('exact')
+
+
+def test_estimator_type():
+    assert xgb.XGBClassifier._estimator_type == "classifier"
+    assert xgb.XGBRFClassifier._estimator_type == "classifier"
+    assert xgb.XGBRegressor._estimator_type == "regressor"
+    assert xgb.XGBRFRegressor._estimator_type == "regressor"
+    assert xgb.XGBRanker._estimator_type == "ranker"
+
+    from sklearn.datasets import load_digits
+
+    X, y = load_digits(n_class=2, return_X_y=True)
+    cls = xgb.XGBClassifier(n_estimators=2).fit(X, y)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = os.path.join(tmpdir, "cls.json")
+        cls.save_model(path)
+
+        reg = xgb.XGBRegressor()
+        with pytest.raises(TypeError):
+            reg.load_model(path)
+
+        cls = xgb.XGBClassifier()
+        cls.load_model(path)  # no error
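
With this change the scikit-learn wrappers expose the standard `_estimator_type`
attribute, which generic scikit-learn helpers dispatch on. Below is a minimal sketch of
what that enables, assuming an xgboost build that includes this patch; the dataset and
hyperparameters are illustrative, not taken from the patch.

    import sklearn.base
    import xgboost as xgb
    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_val_score

    # A small synthetic binary classification problem.
    X, y = make_classification(n_samples=256, n_features=8, random_state=0)

    clf = xgb.XGBClassifier(n_estimators=4)
    reg = xgb.XGBRegressor(n_estimators=4)

    # is_classifier()/is_regressor() simply check _estimator_type, which the
    # mixins define as "classifier", "regressor", or (via XGBRankerMixIn) "ranker".
    assert sklearn.base.is_classifier(clf)
    assert sklearn.base.is_regressor(reg)
    assert xgb.XGBRanker._estimator_type == "ranker"

    # cross_val_score uses is_classifier() to choose a StratifiedKFold splitter
    # for classifiers, so XGBoost estimators now get the right default CV.
    print(cross_val_score(clf, X, y, cv=3))

Because save_model() now records `_estimator_type` in the booster's scikit-learn
attributes, load_model() can reject a mismatched wrapper (loading a classifier file into
an XGBRegressor raises TypeError), while a model trained with xgb.dask.DaskXGBClassifier
loads cleanly into a single-node xgb.XGBClassifier, as the new tests assert.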