From f5ff90cd87c0ba77dec21ea528f1b5683b1b8ff0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 8 Jan 2021 10:01:16 +0800
Subject: [PATCH] Support `_estimator_type`. (#6582)

* Use `_estimator_type`.

  For more info, see:
  https://scikit-learn.org/stable/developers/develop.html#estimator-types

* Model trained from dask can be loaded by single node skl interface.
---
 python-package/xgboost/sklearn.py | 29 +++++++++++++++++++------
 tests/python/test_with_dask.py    | 36 +++++++++++++++++++++++++++++++
 tests/python/test_with_sklearn.py | 23 ++++++++++++++++++++
 3 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 7359ec1242f6..71f593b719fb 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -16,6 +16,12 @@
                      XGBClassifierBase, XGBRegressorBase, XGBoostLabelEncoder)
 
 
+class XGBRankerMixIn:  # pylint: disable=too-few-public-methods
+    """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn base
+    classes."""
+    _estimator_type = "ranker"
+
+
 def _objective_decorator(func):
     """Decorate an objective function
@@ -407,6 +413,14 @@ def get_num_boosting_rounds(self):
         """Gets the number of xgboost boosting rounds."""
         return self.n_estimators
 
+    def _get_type(self) -> str:
+        if not hasattr(self, '_estimator_type'):
+            raise TypeError(
+                "`_estimator_type` undefined. "
+                "Please use appropriate mixin to define estimator type."
+            )
+        return self._estimator_type  # pylint: disable=no-member
+
     def save_model(self, fname: str):
         """Save the model to a file.
@@ -442,7 +456,7 @@ def save_model(self, fname: str):
                     meta[k] = v
                 except TypeError:
                     warnings.warn(str(k) + ' is not saved in Scikit-Learn meta.')
-        meta['type'] = type(self).__name__
+        meta['_estimator_type'] = self._get_type()
         meta_str = json.dumps(meta)
         self.get_booster().set_attr(scikit_learn=meta_str)
         self.get_booster().save_model(fname)
@@ -484,11 +498,12 @@ def load_model(self, fname):
             if k == 'use_label_encoder':
                 self.use_label_encoder = bool(v)
                 continue
-            if k == 'type' and type(self).__name__ != v:
-                msg = 'Current model type: {}, '.format(type(self).__name__) + \
-                      'type of model in file: {}'.format(v)
-                raise TypeError(msg)
-            if k == 'type':
+            if k == "_estimator_type":
+                if self._get_type() != v:
+                    raise TypeError(
+                        "Loading an estimator with different type. "
+                        f"Expecting: {self._get_type()}, got: {v}"
+                    )
                 continue
             states[k] = v
         self.__dict__.update(states)
@@ -1211,7 +1226,7 @@ def get_num_boosting_rounds(self):
         then your group array should be ``[3, 4]``.
 ''')
-class XGBRanker(XGBModel):
+class XGBRanker(XGBModel, XGBRankerMixIn):
     # pylint: disable=missing-docstring,too-many-arguments,invalid-name
     @_deprecate_positional_args
     def __init__(self, *, objective='rank:pairwise', **kwargs):
diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py
index d3c2f988a3c3..180a6440024a 100644
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -10,6 +10,7 @@
 import asyncio
 import tempfile
 from sklearn.datasets import make_classification
+import sklearn
 import os
 import subprocess
 from hypothesis import given, settings, note
@@ -261,6 +262,9 @@ def test_dask_regressor() -> None:
         with Client(cluster) as client:
             X, y, w = generate_array(with_weights=True)
             regressor = xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
+            assert regressor._estimator_type == "regressor"
+            assert sklearn.base.is_regressor(regressor)
+
             regressor.set_params(tree_method='hist')
             regressor.client = client
             regressor.fit(X, y, sample_weight=w, eval_set=[(X, y)])
@@ -285,6 +289,9 @@ def test_dask_classifier() -> None:
             y = (y * 10).astype(np.int32)
             classifier = xgb.dask.DaskXGBClassifier(
                 verbosity=1, n_estimators=2, eval_metric='merror')
+            assert classifier._estimator_type == "classifier"
+            assert sklearn.base.is_classifier(classifier)
+
             classifier.client = client
             classifier.fit(X, y, sample_weight=w, eval_set=[(X, y)])
             prediction = classifier.predict(X)
@@ -946,6 +953,35 @@ def worker_fn(worker_addr: str, data_ref: Dict) -> None:
 
         # Subtract the on disk resource from each worker
         assert cnt - n_workers == n_partitions
 
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_sklearn_io(self, client: 'Client') -> None:
+        from sklearn.datasets import load_digits
+        X_, y_ = load_digits(return_X_y=True)
+        X, y = da.from_array(X_), da.from_array(y_)
+        cls = xgb.dask.DaskXGBClassifier(n_estimators=10)
+        cls.client = client
+        cls.fit(X, y)
+        predt_0 = cls.predict(X)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = os.path.join(tmpdir, 'cls.json')
+            cls.save_model(path)
+
+            cls = xgb.dask.DaskXGBClassifier()
+            cls.load_model(path)
+            assert cls.n_classes_ == 10
+            predt_1 = cls.predict(X)
+
+            np.testing.assert_allclose(predt_0.compute(), predt_1.compute())
+
+            # Use single node to load
+            cls = xgb.XGBClassifier()
+            cls.load_model(path)
+            assert cls.n_classes_ == 10
+            predt_2 = cls.predict(X_)
+
+            np.testing.assert_allclose(predt_0.compute(), predt_2)
+
 
 class TestDaskCallbacks:
     @pytest.mark.skipif(**tm.no_sklearn())
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 5d105b5a0c48..99a1a5702f7d 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1099,3 +1099,26 @@ def test_boost_from_prediction_approx():
 @pytest.mark.skipif(**tm.no_sklearn())
 def test_boost_from_prediction_exact():
     run_boost_from_prediction('exact')
+
+
+def test_estimator_type():
+    assert xgb.XGBClassifier._estimator_type == "classifier"
+    assert xgb.XGBRFClassifier._estimator_type == "classifier"
+    assert xgb.XGBRegressor._estimator_type == "regressor"
+    assert xgb.XGBRFRegressor._estimator_type == "regressor"
+    assert xgb.XGBRanker._estimator_type == "ranker"
+
+    from sklearn.datasets import load_digits
+
+    X, y = load_digits(n_class=2, return_X_y=True)
+    cls = xgb.XGBClassifier(n_estimators=2).fit(X, y)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = os.path.join(tmpdir, "cls.json")
+        cls.save_model(path)
+
+        reg = xgb.XGBRegressor()
+        with pytest.raises(TypeError):
+            reg.load_model(path)
+
+        cls = xgb.XGBClassifier()
+        cls.load_model(path)  # no error
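
With this change the scikit-learn wrappers expose the standard `_estimator_type`
attribute, which generic scikit-learn helpers dispatch on. Below is a minimal sketch of
what that enables, assuming an xgboost build that includes this patch; the dataset and
hyperparameters are illustrative, not taken from the patch.

    import sklearn.base
    import xgboost as xgb
    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_val_score

    # A small synthetic binary classification problem.
    X, y = make_classification(n_samples=256, n_features=8, random_state=0)

    clf = xgb.XGBClassifier(n_estimators=4)
    reg = xgb.XGBRegressor(n_estimators=4)

    # is_classifier()/is_regressor() simply check _estimator_type, which the
    # mixins define as "classifier", "regressor", or (via XGBRankerMixIn) "ranker".
    assert sklearn.base.is_classifier(clf)
    assert sklearn.base.is_regressor(reg)
    assert xgb.XGBRanker._estimator_type == "ranker"

    # cross_val_score uses is_classifier() to choose a StratifiedKFold splitter
    # for classifiers, so XGBoost estimators now get the right default CV.
    print(cross_val_score(clf, X, y, cv=3))

Because save_model() now records `_estimator_type` in the booster's scikit-learn
attributes, load_model() can reject a mismatched wrapper (loading a classifier file into
an XGBRegressor raises TypeError), while a model trained with xgb.dask.DaskXGBClassifier
loads cleanly into a single-node xgb.XGBClassifier, as the new tests assert.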