From 1adb77be7016bf8510fbdaea106a97e155644e06 Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Wed, 11 Sep 2024 11:40:51 +0100 Subject: [PATCH 01/33] __sklearn_tags__ replacing sklearn's BaseEstimator._more_tags_ --- python-package/lightgbm/sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index ad805eef7332..497f8f6c0b8f 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -662,7 +662,7 @@ def __init__( self._n_classes: int = -1 self.set_params(**kwargs) - def _more_tags(self) -> Dict[str, Any]: + def __sklearn_tags__(self) -> Dict[str, Any]: return { "allow_nan": True, "X_types": ["2darray", "sparse", "1dlabels"], From 8ed87d2c0ce5ffa9ce287ae567d973424102bd41 Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Wed, 11 Sep 2024 12:01:17 +0100 Subject: [PATCH 02/33] fixing tags dict -> dataclass --- python-package/lightgbm/compat.py | 2 ++ python-package/lightgbm/sklearn.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index f916ca6be723..07b091e7094d 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -12,6 +12,7 @@ from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import assert_all_finite, check_array, check_X_y + from sklearn.utils import Tags try: from sklearn.exceptions import NotFittedError @@ -44,6 +45,7 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any: _LGBMAssertAllFinite = assert_all_finite _LGBMCheckClassificationTargets = check_classification_targets _LGBMComputeSampleWeight = compute_sample_weight + _LGBMTags = Tags except ImportError: SKLEARN_INSTALLED = False diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 497f8f6c0b8f..6dcbb57dd665 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -24,6 +24,7 @@ _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, + _LGBMTags, _log_warning, ) from .callback import _EvalResultDict, record_evaluation @@ -662,16 +663,17 @@ def __init__( self._n_classes: int = -1 self.set_params(**kwargs) - def __sklearn_tags__(self) -> Dict[str, Any]: - return { - "allow_nan": True, - "X_types": ["2darray", "sparse", "1dlabels"], - "_xfail_checks": { + def __sklearn_tags__(self) -> _LGBMTags: + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + tags.input_tags.sparse = True + tags.target_tags.one_d_labels = True + tags._xfail_checks = { "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " "cannot be set in __init__: " "(see https://github.com/microsoft/LightGBM/issues/2628)" - }, - } + } + return tags def __sklearn_is_fitted__(self) -> bool: return getattr(self, "fitted_", False) From 32ec431605a091328270cb85a408e4883ceeab3c Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Wed, 11 Sep 2024 12:03:16 +0100 Subject: [PATCH 03/33] fixing wrong import --- python-package/lightgbm/sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 6dcbb57dd665..c41264a437be 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -24,7 +24,6 @@ _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, - _LGBMTags, _log_warning, ) from .callback import _EvalResultDict, record_evaluation @@ -42,6 +41,7 @@ _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, + _LGBMTags, dt_DataTable, pd_DataFrame, ) From ade9798d46a45d1b419dccc4f7e7b9a8e132b798 Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Wed, 11 Sep 2024 12:18:43 +0100 Subject: [PATCH 04/33] remove type hint --- python-package/lightgbm/sklearn.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index c41264a437be..1b64338408af 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -41,7 +41,6 @@ _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, - _LGBMTags, dt_DataTable, pd_DataFrame, ) @@ -663,7 +662,7 @@ def __init__( self._n_classes: int = -1 self.set_params(**kwargs) - def __sklearn_tags__(self) -> _LGBMTags: + def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True tags.input_tags.sparse = True From 2085a128529446a0f00c813a8184ce339d0865ea Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Wed, 11 Sep 2024 12:22:16 +0100 Subject: [PATCH 05/33] remove type hint --- python-package/lightgbm/compat.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 07b091e7094d..f916ca6be723 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -12,7 +12,6 @@ from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import assert_all_finite, check_array, check_X_y - from sklearn.utils import Tags try: from sklearn.exceptions import NotFittedError @@ -45,7 +44,6 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any: _LGBMAssertAllFinite = assert_all_finite _LGBMCheckClassificationTargets = check_classification_targets _LGBMComputeSampleWeight = compute_sample_weight - _LGBMTags = Tags except ImportError: SKLEARN_INSTALLED = False From a9ec348890dce2e4b92db9dc6f87527722374307 Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Wed, 11 Sep 2024 12:30:24 +0100 Subject: [PATCH 06/33] fix linting --- python-package/lightgbm/sklearn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 1b64338408af..68dd78c5fb18 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -668,10 +668,10 @@ def __sklearn_tags__(self): tags.input_tags.sparse = True tags.target_tags.one_d_labels = True tags._xfail_checks = { - "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " - "cannot be set in __init__: " - "(see https://github.com/microsoft/LightGBM/issues/2628)" - } + "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " + "cannot be set in __init__: " + "(see https://github.com/microsoft/LightGBM/issues/2628)" + } return tags def __sklearn_is_fitted__(self) -> bool: From fcc4e1299c426e6aeec938cf6832b6cf330b709c Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Sat, 14 Sep 2024 10:11:06 +0100 Subject: [PATCH 07/33] triggering new CI (scikit-learn dev has changed) From 3b15646fe22d0036910cfcda39e6bcaa1a3306a6 Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Sun, 15 Sep 2024 13:10:48 +0100 Subject: [PATCH 08/33] bringing back _more_tags, adding convertsion from more_tags to sklearn_tags --- python-package/lightgbm/sklearn.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 68dd78c5fb18..c189ca503a87 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -662,16 +662,27 @@ def __init__( self._n_classes: int = -1 self.set_params(**kwargs) + def _more_tags(self) -> Dict[str, Any]: + return { + "allow_nan": True, + "X_types": ["2darray", "sparse", "1dlabels"], + "_xfail_checks": { + "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " + "cannot be set in __init__: " + "(see https://github.com/microsoft/LightGBM/issues/2628)" + }, + } + def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.input_tags.allow_nan = True - tags.input_tags.sparse = True - tags.target_tags.one_d_labels = True - tags._xfail_checks = { - "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " - "cannot be set in __init__: " - "(see https://github.com/microsoft/LightGBM/issues/2628)" - } + more_tags = self._more_tags() + tags.input_tags.allow_nan = more_tags.pop('allow_nan', False) + tagged_input_types = more_tags.pop('X_types', []) + tags.input_tags.sparse = 'sparse' in tagged_input_types + tags.target_tags.one_d_labels = '1dlabels' in tagged_input_types + tags._xfail_checks = more_tags.pop('_xfail_checks', {}) + if more_tags or set(tagged_input_types).difference({"2darray", "sparse", "1dlabels"}): + _log_warning(f'Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`') return tags def __sklearn_is_fitted__(self) -> bool: From 34d9eb4d0abaa75de064b6e95dc1b5979af6e561 Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Sun, 15 Sep 2024 19:19:11 +0100 Subject: [PATCH 09/33] lint fix --- python-package/lightgbm/sklearn.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index c189ca503a87..e7e57dba2ba7 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -676,13 +676,13 @@ def _more_tags(self) -> Dict[str, Any]: def __sklearn_tags__(self): tags = super().__sklearn_tags__() more_tags = self._more_tags() - tags.input_tags.allow_nan = more_tags.pop('allow_nan', False) - tagged_input_types = more_tags.pop('X_types', []) - tags.input_tags.sparse = 'sparse' in tagged_input_types - tags.target_tags.one_d_labels = '1dlabels' in tagged_input_types - tags._xfail_checks = more_tags.pop('_xfail_checks', {}) + tags.input_tags.allow_nan = more_tags.pop("allow_nan", False) + tagged_input_types = more_tags.pop("X_types", []) + tags.input_tags.sparse = "sparse" in tagged_input_types + tags.target_tags.one_d_labels = "1dlabels" in tagged_input_types + tags._xfail_checks = more_tags.pop("_xfail_checks", {}) if more_tags or set(tagged_input_types).difference({"2darray", "sparse", "1dlabels"}): - _log_warning(f'Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`') + _log_warning(f"Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`") return tags def __sklearn_is_fitted__(self) -> bool: From 6d20ef858362d468e2008ce343312d9d70908d6b Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Mon, 16 Sep 2024 07:34:15 +0100 Subject: [PATCH 10/33] Update python-package/lightgbm/sklearn.py Co-authored-by: James Lamb --- python-package/lightgbm/sklearn.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index e7e57dba2ba7..b974e7c8d097 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -676,11 +676,10 @@ def _more_tags(self) -> Dict[str, Any]: def __sklearn_tags__(self): tags = super().__sklearn_tags__() more_tags = self._more_tags() - tags.input_tags.allow_nan = more_tags.pop("allow_nan", False) - tagged_input_types = more_tags.pop("X_types", []) - tags.input_tags.sparse = "sparse" in tagged_input_types - tags.target_tags.one_d_labels = "1dlabels" in tagged_input_types - tags._xfail_checks = more_tags.pop("_xfail_checks", {}) + tags.input_tags.allow_nan = more_tags["allow_nan"] + tags.input_tags.sparse = "sparse" in more_tags["X_types"] + tags.target_tags.one_d_labels = "1dlabels" in more_tags["X_types"] + tags._xfail_checks = more_tags["_xfail_checks"] if more_tags or set(tagged_input_types).difference({"2darray", "sparse", "1dlabels"}): _log_warning(f"Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`") return tags From d715311f3aa8f8228cddbc7e784085d16f06737f Mon Sep 17 00:00:00 2001 From: vnherdeiro Date: Mon, 16 Sep 2024 07:35:41 +0100 Subject: [PATCH 11/33] adressing PR comments --- python-package/lightgbm/sklearn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index b974e7c8d097..fa42dff54ddc 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -674,14 +674,16 @@ def _more_tags(self) -> Dict[str, Any]: } def __sklearn_tags__(self): + # scikit-learn 1.6 introduced an __sklearn__tags() method intended to replace _more_tags(). + # _more_tags() can be removed whenever lightgbm's minimum supported scikit-learn version + # is >=1.6. + # ref: https://github.com/microsoft/LightGBM/pull/6651 tags = super().__sklearn_tags__() more_tags = self._more_tags() tags.input_tags.allow_nan = more_tags["allow_nan"] tags.input_tags.sparse = "sparse" in more_tags["X_types"] tags.target_tags.one_d_labels = "1dlabels" in more_tags["X_types"] tags._xfail_checks = more_tags["_xfail_checks"] - if more_tags or set(tagged_input_types).difference({"2darray", "sparse", "1dlabels"}): - _log_warning(f"Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`") return tags def __sklearn_is_fitted__(self) -> bool: From c4ec9a4939894885d8ec9eee8dd1911c62d40107 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 16 Sep 2024 11:32:51 -0500 Subject: [PATCH 12/33] move comment --- python-package/lightgbm/sklearn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index fa42dff54ddc..fe97053a4241 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -662,6 +662,10 @@ def __init__( self._n_classes: int = -1 self.set_params(**kwargs) + # scikit-learn 1.6 introduced an __sklearn__tags() method intended to replace _more_tags(). + # _more_tags() can be removed whenever lightgbm's minimum supported scikit-learn version + # is >=1.6. + # ref: https://github.com/microsoft/LightGBM/pull/6651 def _more_tags(self) -> Dict[str, Any]: return { "allow_nan": True, @@ -674,10 +678,6 @@ def _more_tags(self) -> Dict[str, Any]: } def __sklearn_tags__(self): - # scikit-learn 1.6 introduced an __sklearn__tags() method intended to replace _more_tags(). - # _more_tags() can be removed whenever lightgbm's minimum supported scikit-learn version - # is >=1.6. - # ref: https://github.com/microsoft/LightGBM/pull/6651 tags = super().__sklearn_tags__() more_tags = self._more_tags() tags.input_tags.allow_nan = more_tags["allow_nan"] From b0a47035eadd285cf1d34cf04367653fafe72405 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 20 Sep 2024 23:57:32 -0500 Subject: [PATCH 13/33] updates --- python-package/lightgbm/sklearn.py | 48 +++++++++++++++++++++-- tests/python_package_test/test_sklearn.py | 18 +++++++-- 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index fe97053a4241..2648a92c4ff5 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -677,15 +677,38 @@ def _more_tags(self) -> Dict[str, Any]: }, } - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - more_tags = self._more_tags() + @staticmethod + def _update_sklearn_tags_from_dict( + *, + tags: "sklearn.utils.Tags", + tags_dict: Dict[str, Any] + ) -> "sklearn.utils.Tags": + """ + scikit-learn 1.6 introduced a dataclass-based interface for estimator tags. + ref: https://github.com/scikit-learn/scikit-learn/pull/29677 + + That interface means that each + """ tags.input_tags.allow_nan = more_tags["allow_nan"] tags.input_tags.sparse = "sparse" in more_tags["X_types"] tags.target_tags.one_d_labels = "1dlabels" in more_tags["X_types"] tags._xfail_checks = more_tags["_xfail_checks"] return tags + def __sklearn_tags__(self): + # super().__sklearn_tags__() cannot be called unconditionally, + # because that method isn't defined for scikit-learn<1.6 + if not callable(getattr(super(), "__sklearn_tags__", None)): + return None + + # take whatever tags are provided by BaseEstimator, then modify + # them with LightGBM-specific values + tags = self._update_sklearn_tags_from_dict( + tags=super().__sklearn_tags__(), + tags_dict=self._more_tags() + ) + return tags + def __sklearn_is_fitted__(self) -> bool: return getattr(self, "fitted_", False) @@ -1182,6 +1205,17 @@ def feature_names_in_(self) -> np.ndarray: class LGBMRegressor(_LGBMRegressorBase, LGBMModel): """LightGBM regressor.""" + def _more_tags(self) -> Dict[str, Any]: + tags = super(LGBMModel, self)._more_tags() + tags.update(super(_LGBMRegressorBase, self)._more_tags()) + return tags + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + if tags is None: + return None + + def fit( # type: ignore[override] self, X: _LGBM_ScikitMatrixLike, @@ -1228,6 +1262,14 @@ def fit( # type: ignore[override] class LGBMClassifier(_LGBMClassifierBase, LGBMModel): """LightGBM classifier.""" + def _more_tags(self) -> Dict[str, Any]: + tags = super(LGBMModel, self)._more_tags() + tags.update(super(_LGBMClassifierBase, self)._more_tags()) + return tags + + def __sklearn_tags__(self): + return super().__ + def fit( # type: ignore[override] self, X: _LGBM_ScikitMatrixLike, diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 01ab057cf3e2..74f99736b454 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -36,6 +36,7 @@ ) decreasing_generator = itertools.count(0, -1) +estimator_classes = (lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker) task_to_model_factory = { "ranking": lgb.LGBMRanker, "binary-classification": lgb.LGBMClassifier, @@ -1311,7 +1312,7 @@ def test_check_is_fitted(): check_is_fitted(model) -@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker]) +@pytest.mark.parametrize("estimator_class", estimator_classes) @pytest.mark.parametrize("max_depth", [3, 4, 5, 8]) def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth): X, y = make_blobs(n_samples=1_000, n_features=1, centers=2) @@ -1390,7 +1391,7 @@ def test_fit_only_raises_num_rounds_warning_when_expected(capsys): assert_silent(capsys) -@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker]) +@pytest.mark.parametrize("estimator_class", estimator_classes) def test_getting_feature_names_in_np_input(estimator_class): # input is a numpy array, which doesn't have feature names. LightGBM adds # feature names to the fitted model, which is inconsistent with sklearn's behavior @@ -1409,7 +1410,7 @@ def test_getting_feature_names_in_np_input(estimator_class): np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])])) -@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker]) +@pytest.mark.parametrize("estimator_class", estimator_classes) def test_getting_feature_names_in_pd_input(estimator_class): X, y = load_digits(n_class=2, return_X_y=True, as_frame=True) col_names = X.columns.to_list() @@ -1436,6 +1437,17 @@ def test_sklearn_integration(estimator, check): check(estimator) +@pytest.mark.parametrize("estimator_class", estimator_classes) +def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimator_class): + est = estimator_class() + more_tags = est._more_tags() + assert ( + more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], + "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match.", + ) + sklearn_tags = est.__sklearn_tags__() + + @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"]) def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task): pd = pytest.importorskip("pandas") From 7eb861addebd6d446be511c35a00b85158d11fa9 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 24 Sep 2024 00:38:43 -0500 Subject: [PATCH 14/33] remove uses of super() --- .ci/test.sh | 1 + python-package/lightgbm/sklearn.py | 77 ++++++++++++++--------- tests/python_package_test/test_sklearn.py | 13 ++-- 3 files changed, 57 insertions(+), 34 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index a2e2dde070a2..d05ce857e3dd 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -103,6 +103,7 @@ if [[ $TASK == "lint" ]]; then 'mypy>=1.11.1' \ 'pre-commit>=3.8.0' \ 'pyarrow-core>=17.0' \ + 'scikit-learn>=1.15.0' \ 'r-lintr>=3.1.2' source activate $CONDA_ENV echo "Linting Python code" diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 2648a92c4ff5..fec9cbe21aa5 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -4,7 +4,7 @@ import copy from inspect import signature from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import scipy.sparse @@ -46,6 +46,12 @@ ) from .engine import train +if TYPE_CHECKING: + try: + from sklearn.utils import Tags as _sklearn_Tags + except ImportError: + _sklearn_Tags = None + __all__ = [ "LGBMClassifier", "LGBMModel", @@ -673,41 +679,45 @@ def _more_tags(self) -> Dict[str, Any]: "_xfail_checks": { "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " "cannot be set in __init__: " - "(see https://github.com/microsoft/LightGBM/issues/2628)" + "(see https://github.com/microsoft/LightGBM/issues/2628)", + "check_n_features_in_after_fitting": ( + "validate_data() was first added in scikit-learn 1.6 and lightgbm" + "supports much older versions than that" + ), }, } @staticmethod def _update_sklearn_tags_from_dict( *, - tags: "sklearn.utils.Tags", - tags_dict: Dict[str, Any] - ) -> "sklearn.utils.Tags": - """ - scikit-learn 1.6 introduced a dataclass-based interface for estimator tags. + tags: "_sklearn_Tags", + tags_dict: Dict[str, Any], + ) -> "_sklearn_Tags": + """Update ``sklearn.utils.Tags`` inherited from ``scikit-learn`` base classes. + + ``scikit-learn`` 1.6 introduced a dataclass-based interface for estimator tags. ref: https://github.com/scikit-learn/scikit-learn/pull/29677 - That interface means that each + This method handles updating that instance based on the value in ``self._more_tags()``. """ - tags.input_tags.allow_nan = more_tags["allow_nan"] - tags.input_tags.sparse = "sparse" in more_tags["X_types"] - tags.target_tags.one_d_labels = "1dlabels" in more_tags["X_types"] - tags._xfail_checks = more_tags["_xfail_checks"] + tags.input_tags.allow_nan = tags_dict["allow_nan"] + tags.input_tags.sparse = "sparse" in tags_dict["X_types"] + tags.target_tags.one_d_labels = "1dlabels" in tags_dict["X_types"] + tags._xfail_checks = tags_dict["_xfail_checks"] return tags - def __sklearn_tags__(self): - # super().__sklearn_tags__() cannot be called unconditionally, + def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: + # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally, # because that method isn't defined for scikit-learn<1.6 - if not callable(getattr(super(), "__sklearn_tags__", None)): + if not callable(getattr(_LGBMModelBase, "__sklearn_tags__", None)): return None # take whatever tags are provided by BaseEstimator, then modify # them with LightGBM-specific values - tags = self._update_sklearn_tags_from_dict( - tags=super().__sklearn_tags__(), - tags_dict=self._more_tags() + return self._update_sklearn_tags_from_dict( + tags=_LGBMModelBase.__sklearn_tags__(self), + tags_dict=self._more_tags(), ) - return tags def __sklearn_is_fitted__(self) -> bool: return getattr(self, "fitted_", False) @@ -1206,15 +1216,17 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel): """LightGBM regressor.""" def _more_tags(self) -> Dict[str, Any]: - tags = super(LGBMModel, self)._more_tags() - tags.update(super(_LGBMRegressorBase, self)._more_tags()) + # handle the case where ClassifierMixin possibly provides _more_tags() + if callable(getattr(_LGBMClassifierBase, "_more_tags", None)): + tags = _LGBMClassifierBase._more_tags(self) + else: + tags = {} + # override those with LightGBM-specific preferences + tags.update(LGBMModel._more_tags(self)) return tags - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - if tags is None: - return None - + def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: + return LGBMModel.__sklearn_tags__(self) def fit( # type: ignore[override] self, @@ -1263,12 +1275,17 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel): """LightGBM classifier.""" def _more_tags(self) -> Dict[str, Any]: - tags = super(LGBMModel, self)._more_tags() - tags.update(super(_LGBMClassifierBase, self)._more_tags()) + # handle the case where ClassifierMixin possibly provides _more_tags() + if callable(getattr(_LGBMClassifierBase, "_more_tags", None)): + tags = _LGBMClassifierBase._more_tags(self) + else: + tags = {} + # override those with LightGBM-specific preferences + tags.update(LGBMModel._more_tags(self)) return tags - def __sklearn_tags__(self): - return super().__ + def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: + return LGBMModel.__sklearn_tags__(self) def fit( # type: ignore[override] self, diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 74f99736b454..216fd50ab0c5 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1441,11 +1441,16 @@ def test_sklearn_integration(estimator, check): def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimator_class): est = estimator_class() more_tags = est._more_tags() - assert ( - more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], - "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match.", - ) + err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match." + assert more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], err_msg sklearn_tags = est.__sklearn_tags__() + # these tests should be run unconditionally (no 'if') once lightgbm's + # minimum scikit-learn version is 1.6 or higher + if sklearn_tags is not None: + assert sklearn_tags.input_tags.allow_nan is True + assert sklearn_tags.input_tags.sparse is True + assert sklearn_tags.target_tags.one_d_labels is True + assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"] @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"]) From b137ba25a0aaeef2589762f261c77f438bcb19d3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 24 Sep 2024 00:49:21 -0500 Subject: [PATCH 15/33] fix version constraint in lint job, add one more comment --- .ci/test.sh | 2 +- python-package/lightgbm/sklearn.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.ci/test.sh b/.ci/test.sh index d05ce857e3dd..9fc4db54adaf 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -103,7 +103,7 @@ if [[ $TASK == "lint" ]]; then 'mypy>=1.11.1' \ 'pre-commit>=3.8.0' \ 'pyarrow-core>=17.0' \ - 'scikit-learn>=1.15.0' \ + 'scikit-learn>=1.5.2' \ 'r-lintr>=3.1.2' source activate $CONDA_ENV echo "Linting Python code" diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index fec9cbe21aa5..600c77207fec 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -47,6 +47,8 @@ from .engine import train if TYPE_CHECKING: + # sklearn.utils.Tags can be imported unconditionally once + # lightgbm's minimum scikit-learn version is 1.6 or higher try: from sklearn.utils import Tags as _sklearn_Tags except ImportError: From d1915c0f008a62936ba0aa8f9f66bdd050dc14ec Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 24 Sep 2024 08:25:51 -0500 Subject: [PATCH 16/33] Update python-package/lightgbm/sklearn.py --- python-package/lightgbm/sklearn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 600c77207fec..7c085f249eb7 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -1219,8 +1219,8 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel): def _more_tags(self) -> Dict[str, Any]: # handle the case where ClassifierMixin possibly provides _more_tags() - if callable(getattr(_LGBMClassifierBase, "_more_tags", None)): - tags = _LGBMClassifierBase._more_tags(self) + if callable(getattr(_LGBMRegressorBase, "_more_tags", None)): + tags = _LGBMRegressorBase._more_tags(self) else: tags = {} # override those with LightGBM-specific preferences From 118efd95b898141ba854989fa5fe650727997d06 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 1 Oct 2024 22:58:19 -0500 Subject: [PATCH 17/33] use scikit-learn 1.6 nightlies again, move some code to compat.py, restructure tests --- .ci/test-python-latest.sh | 2 +- python-package/lightgbm/compat.py | 12 +++++++++++- python-package/lightgbm/sklearn.py | 24 ++++++++++++----------- tests/python_package_test/test_sklearn.py | 14 +++++++++---- 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/.ci/test-python-latest.sh b/.ci/test-python-latest.sh index f98f29f2641a..08fc8558ef3e 100755 --- a/.ci/test-python-latest.sh +++ b/.ci/test-python-latest.sh @@ -22,7 +22,7 @@ python -m pip install \ 'numpy>=2.0.0.dev0' \ 'matplotlib>=3.10.0.dev0' \ 'pandas>=3.0.0.dev0' \ - 'scikit-learn==1.5.*' \ + 'scikit-learn>=1.6.dev0' \ 'scipy>=1.15.0.dev0' python -m pip install \ diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index f916ca6be723..09eed6d6c83c 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -1,7 +1,7 @@ # coding: utf-8 """Compatibility library.""" -from typing import Any, List +from typing import TYPE_CHECKING, Any, List # scikit-learn is intentionally imported first here, # see https://github.com/microsoft/LightGBM/issues/6509 @@ -74,6 +74,16 @@ class _LGBMRegressorBase: # type: ignore _LGBMCheckClassificationTargets = None _LGBMComputeSampleWeight = None +# additional scikit-learn imports only for type hints +if TYPE_CHECKING: + # sklearn.utils.Tags can be imported unconditionally once + # lightgbm's minimum scikit-learn version is 1.6 or higher + try: + from sklearn.utils import Tags as _sklearn_Tags + except ImportError: + _sklearn_Tags = None + + """pandas""" try: from pandas import DataFrame as pd_DataFrame diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 7c085f249eb7..4cf305d777c1 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -47,12 +47,8 @@ from .engine import train if TYPE_CHECKING: - # sklearn.utils.Tags can be imported unconditionally once - # lightgbm's minimum scikit-learn version is 1.6 or higher - try: - from sklearn.utils import Tags as _sklearn_Tags - except ImportError: - _sklearn_Tags = None + from .compat import _sklearn_Tags + __all__ = [ "LGBMClassifier", @@ -711,8 +707,14 @@ def _update_sklearn_tags_from_dict( def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally, # because that method isn't defined for scikit-learn<1.6 - if not callable(getattr(_LGBMModelBase, "__sklearn_tags__", None)): - return None + if not hasattr(self, "__sklearn_tags__"): + from sklearn import __version__ as sklearn_version + + err_msg = ( + "__sklearn_tags__() should not be called when using scikit-learn<1.6. " + f"detected version: {sklearn_version}" + ) + raise AttributeError(err_msg) # take whatever tags are provided by BaseEstimator, then modify # them with LightGBM-specific values @@ -1218,7 +1220,7 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel): """LightGBM regressor.""" def _more_tags(self) -> Dict[str, Any]: - # handle the case where ClassifierMixin possibly provides _more_tags() + # handle the case where RegressorMixin possibly provides _more_tags() if callable(getattr(_LGBMRegressorBase, "_more_tags", None)): tags = _LGBMRegressorBase._more_tags(self) else: @@ -1227,7 +1229,7 @@ def _more_tags(self) -> Dict[str, Any]: tags.update(LGBMModel._more_tags(self)) return tags - def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: + def __sklearn_tags__(self) -> "_sklearn_Tags": return LGBMModel.__sklearn_tags__(self) def fit( # type: ignore[override] @@ -1286,7 +1288,7 @@ def _more_tags(self) -> Dict[str, Any]: tags.update(LGBMModel._more_tags(self)) return tags - def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: + def __sklearn_tags__(self) -> "_sklearn_Tags": return LGBMModel.__sklearn_tags__(self) def fit( # type: ignore[override] diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 216fd50ab0c5..1d127fa6abae 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1443,10 +1443,16 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato more_tags = est._more_tags() err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match." assert more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], err_msg - sklearn_tags = est.__sklearn_tags__() - # these tests should be run unconditionally (no 'if') once lightgbm's - # minimum scikit-learn version is 1.6 or higher - if sklearn_tags is not None: + # the try-except part of this should be removed once lightgbm's + # minimum supported scikit-learn version is at least 1.6 + try: + sklearn_tags = est.__sklearn_tags__() + except AttributeError as err: + # only the exact error we expected to be raised should be raised + assert bool(re.search(r"__sklearn_tags__.* should not be called", err)) + else: + # if no AttributeError was thrown, we must be using scikit-learn>=1.6, + # and so the actual effects of __sklearn_tags__() should be tested assert sklearn_tags.input_tags.allow_nan is True assert sklearn_tags.input_tags.sparse is True assert sklearn_tags.target_tags.one_d_labels is True From 4fb82f3080081c960c6dcca7e1798f2ec83b1876 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 2 Oct 2024 23:38:09 -0500 Subject: [PATCH 18/33] optionally use validate_data(), starting in scikit-learn 1.6 --- python-package/lightgbm/compat.py | 36 ++++++++++++++++++++++++++++-- python-package/lightgbm/sklearn.py | 34 ++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 09eed6d6c83c..c8ba580b2d4d 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -29,6 +29,38 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any: check_consistent_length(sample_weight, X) return sample_weight + try: + from sklearn.utils.validation import validate_data + except ImportError: + # validate_data() was added in scikit-learn 1.6, this function roughly imitates it for older versions. + # It can be removed when lightbm's minimum scikit-learn version is at least 1.6. + def validate_data( + _estimator, + X, + y="no_validation", + reset: bool = False, + accept_sparse: bool = True, + # 'force_all_finite' was renamed to 'ensure_all_finite' in scikit-learn 1.6 + ensure_all_finite: bool = False, + ensure_min_samples: int = 1, + ): + # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here + if isinstance(y, str) and y == "no_validation": + X = check_array(X, reset=reset, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite) + else: + check_X_y( + X, + y, + reset=reset, + accept_sparse=accept_sparse, + force_all_finite=ensure_all_finite, + ensure_min_samples=ensure_min_samples, + ) + # this only needs to be updated at fit() time + _estimator.n_features_in_ = _estimator._n_features + + return X, y + SKLEARN_INSTALLED = True _LGBMBaseCrossValidator = BaseCrossValidator _LGBMModelBase = BaseEstimator @@ -38,12 +70,12 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any: LGBMNotFittedError = NotFittedError _LGBMStratifiedKFold = StratifiedKFold _LGBMGroupKFold = GroupKFold - _LGBMCheckXY = check_X_y _LGBMCheckArray = check_array _LGBMCheckSampleWeight = _check_sample_weight _LGBMAssertAllFinite = assert_all_finite _LGBMCheckClassificationTargets = check_classification_targets _LGBMComputeSampleWeight = compute_sample_weight + _LGBMValidateData = validate_data except ImportError: SKLEARN_INSTALLED = False @@ -67,12 +99,12 @@ class _LGBMRegressorBase: # type: ignore LGBMNotFittedError = ValueError _LGBMStratifiedKFold = None _LGBMGroupKFold = None - _LGBMCheckXY = None _LGBMCheckArray = None _LGBMCheckSampleWeight = None _LGBMAssertAllFinite = None _LGBMCheckClassificationTargets = None _LGBMComputeSampleWeight = None + _LGBMValidateData = None # additional scikit-learn imports only for type hints if TYPE_CHECKING: diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 4cf305d777c1..2b773d46da7f 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -31,16 +31,15 @@ SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, - _LGBMCheckArray, _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, - _LGBMCheckXY, _LGBMClassifierBase, _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, + _LGBMValidateData, dt_DataTable, pd_DataFrame, ) @@ -912,7 +911,21 @@ def fit( params["metric"] = [metric for metric in params["metric"] if metric is not None] if not isinstance(X, (pd_DataFrame, dt_DataTable)): - _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2) + _X, _y = _LGBMValidateData( + self, + X, + y, + # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'. + # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an + # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those. + reset=False, + # allow any input type (this validation is done further down, in lgb.Dataset()) + accept_sparse=True, + # do not raise an error if Inf of NaN values are found (LightGBM handles these internally) + ensure_all_finite=False, + # raise an error on 0-row and 1-row inputs + ensure_min_samples=2, + ) if sample_weight is not None: sample_weight = _LGBMCheckSampleWeight(sample_weight, _X) else: @@ -1054,7 +1067,20 @@ def predict( if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") if not isinstance(X, (pd_DataFrame, dt_DataTable)): - X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False) + X = _LGBMValidateData( + self, + X, + # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y() + # + # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'. + # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an + # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those. + reset=False, + # allow any input type (this validation is done further down, in lgb.Dataset()) + accept_sparse=True, + # do not raise an error if Inf of NaN values are found (LightGBM handles these internally) + ensure_all_finite=False, + ) n_features = X.shape[1] if self._n_features != n_features: raise ValueError( From c42c53df4d5a4a88b7cd68c7eff239b0587556a4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 3 Oct 2024 23:39:12 -0500 Subject: [PATCH 19/33] fix validate_data() for older versions, update tests --- python-package/lightgbm/compat.py | 20 +++++++++++++++----- python-package/lightgbm/sklearn.py | 19 ++----------------- tests/python_package_test/test_sklearn.py | 23 ++++++++++++++++++++++- 3 files changed, 39 insertions(+), 23 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index c8ba580b2d4d..eecbba08541c 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -38,26 +38,36 @@ def validate_data( _estimator, X, y="no_validation", - reset: bool = False, accept_sparse: bool = True, # 'force_all_finite' was renamed to 'ensure_all_finite' in scikit-learn 1.6 ensure_all_finite: bool = False, ensure_min_samples: int = 1, + # trap other keyword arguments that only work on scikit-learn >=1.6, like 'reset' + **ignored_kwargs, ): # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here if isinstance(y, str) and y == "no_validation": - X = check_array(X, reset=reset, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite) + X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite) else: - check_X_y( + X, y = check_X_y( X, y, - reset=reset, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite, ensure_min_samples=ensure_min_samples, ) + # this only needs to be updated at fit() time - _estimator.n_features_in_ = _estimator._n_features + _estimator._n_features = X.shape[1] + _estimator._n_features_in = X.shape[1] + + # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6 + n_features = X.shape[1] + if _estimator._n_features != n_features: + raise ValueError( + f"X has {n_features} features, but {_estimator.__class__.__name__} " + f"is expecting {_estimator._n_features} features as input." + ) return X, y diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 2b773d46da7f..aba4b7770563 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -677,10 +677,6 @@ def _more_tags(self) -> Dict[str, Any]: "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " "cannot be set in __init__: " "(see https://github.com/microsoft/LightGBM/issues/2628)", - "check_n_features_in_after_fitting": ( - "validate_data() was first added in scikit-learn 1.6 and lightgbm" - "supports much older versions than that" - ), }, } @@ -706,7 +702,7 @@ def _update_sklearn_tags_from_dict( def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally, # because that method isn't defined for scikit-learn<1.6 - if not hasattr(self, "__sklearn_tags__"): + if not hasattr(_LGBMModelBase, "__sklearn_tags__"): from sklearn import __version__ as sklearn_version err_msg = ( @@ -940,10 +936,6 @@ def fit( else: sample_weight = np.multiply(sample_weight, class_sample_weight) - self._n_features = _X.shape[1] - # copy for consistency - self._n_features_in = self._n_features - train_set = Dataset( data=_X, label=_y, @@ -1067,7 +1059,7 @@ def predict( if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") if not isinstance(X, (pd_DataFrame, dt_DataTable)): - X = _LGBMValidateData( + X, _ = _LGBMValidateData( self, X, # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y() @@ -1081,13 +1073,6 @@ def predict( # do not raise an error if Inf of NaN values are found (LightGBM handles these internally) ensure_all_finite=False, ) - n_features = X.shape[1] - if self._n_features != n_features: - raise ValueError( - "Number of features of the model must " - f"match the input. Model n_features_ is {self._n_features} and " - f"input n_features is {n_features}" - ) # retrieve original params that possibly can be used in both training and prediction # and then overwrite them (considering aliases) with params that were passed directly in prediction predict_params = self._process_params(stage="predict") diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 1d127fa6abae..1f7e7c7a04e8 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1449,7 +1449,7 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato sklearn_tags = est.__sklearn_tags__() except AttributeError as err: # only the exact error we expected to be raised should be raised - assert bool(re.search(r"__sklearn_tags__.* should not be called", err)) + assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err))) else: # if no AttributeError was thrown, we must be using scikit-learn>=1.6, # and so the actual effects of __sklearn_tags__() should be tested @@ -1584,6 +1584,27 @@ def test_validate_features(task): model.predict(df2, validate_features=False) +# LightGBM's 'predict_disable_shape_check' mechanism is intentionally not respected by +# its scikit-learn estimators, for consistency with scikit-learn's own behavior. +@pytest.mark.parametrize("predict_disable_shape_check", [True, False]) +def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check): + X, y, _ = _create_data(task="regression", n_features=4) + # train on the first 3 features + model = lgb.LGBMRegressor(n_estimators=5, num_leaves=7, verbose=-1).fit(X[:, :-1], y) + + # more cols in X than features: error + with pytest.raises(ValueError, match="X has 4 features, but LGBMRegressor is expecting 3 features as input"): + model.predict(X, predict_disable_shape_check=predict_disable_shape_check) + + # fewer cols in X than features: error + with pytest.raises(ValueError, match="X has 2 features, but LGBMRegressor is expecting 3 features as input"): + model.predict(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check) + + # same number of columns in both: no error + preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check) + assert preds.shape == y.shape + + @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"]) @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"]) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"]) From 33fb5b680655646ce3acbab90d8f9def407e5e20 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 4 Oct 2024 00:12:49 -0500 Subject: [PATCH 20/33] more changes --- python-package/lightgbm/compat.py | 25 +++++++++++++------------ python-package/lightgbm/sklearn.py | 17 ++++++++++++++++- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index eecbba08541c..a04256df9e37 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -48,22 +48,23 @@ def validate_data( # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here if isinstance(y, str) and y == "no_validation": X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite) - else: - X, y = check_X_y( - X, - y, - accept_sparse=accept_sparse, - force_all_finite=ensure_all_finite, - ensure_min_samples=ensure_min_samples, - ) + return X + + # if we reach here, we're validating features and labels + X, y = check_X_y( + X, + y, + accept_sparse=accept_sparse, + force_all_finite=ensure_all_finite, + ensure_min_samples=ensure_min_samples, + ) - # this only needs to be updated at fit() time - _estimator._n_features = X.shape[1] - _estimator._n_features_in = X.shape[1] + # this only needs to be updated at fit() time + _estimator._n_features_in = X.shape[1] # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6 n_features = X.shape[1] - if _estimator._n_features != n_features: + if _estimator.__sklearn_is_fitted__() and _estimator._n_features != n_features: raise ValueError( f"X has {n_features} features, but {_estimator.__class__.__name__} " f"is expecting {_estimator._n_features} features as input." diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index aba4b7770563..5e923ce19769 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -906,6 +906,14 @@ def fit( params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"] params["metric"] = [metric for metric in params["metric"] if metric is not None] + # this needs to be set before calling _LGBMValidateData(), as that function uses + # self.n_features_in_ (the corresponding public attribute) to check that the input has + # the expected number of features + if isinstance(X, list): + self._n_features_in = len(X[0]) + else: + self._n_features_in = X.shape[1] + if not isinstance(X, (pd_DataFrame, dt_DataTable)): _X, _y = _LGBMValidateData( self, @@ -1018,6 +1026,13 @@ def fit( callbacks=callbacks, ) + # This populates the property self.n_features_, the number of features in the fitted model, + # and so should only be set after fitting. + # + # The related property self._n_features_in, which populates self.n_features_in_, + # is set BEFORE fitting. + self._n_features = self._Booster.num_feature() + self._evals_result = evals_result self._best_iteration = self._Booster.best_iteration self._best_score = self._Booster.best_score @@ -1059,7 +1074,7 @@ def predict( if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") if not isinstance(X, (pd_DataFrame, dt_DataTable)): - X, _ = _LGBMValidateData( + X = _LGBMValidateData( self, X, # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y() From 6689faaa6cd906cd4fd2425fca1e711aacbdd0fe Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 5 Oct 2024 00:04:05 -0500 Subject: [PATCH 21/33] fix n_features_in setting --- python-package/lightgbm/compat.py | 2 +- python-package/lightgbm/sklearn.py | 37 ++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index a04256df9e37..6b56a4f9b0e3 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -48,7 +48,7 @@ def validate_data( # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here if isinstance(y, str) and y == "no_validation": X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite) - return X + return X # noqa: RET504 # if we reach here, we're validating features and labels X, y = check_X_y( diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 5e923ce19769..8f03d53506e7 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -147,6 +147,32 @@ def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarra return weight +def _num_features_for_raw_input(X: _LGBM_ScikitMatrixLike) -> int: + """Get number of features from raw input data. + + Some ``scikit-learn`` versions accomplish this with a private function + ``sklearn.utils.validation._num_features()``. This is here to avoid depending on that. + """ + # if a list, assume a list of lists where each item is one row of training data, + # and all rows have the same number of features + if isinstance(X, list): + return len(X[0]) + + # scikit-learn estimator checks error out if .shape is accessed unconditionally... + # but allow hard-coding an assumption that anything that isn't a list and doesn't have .shape + # must be convertible to something following the Array API via a __array__() method + if hasattr(X, "shape"): + X_shape = X.shape + else: + X_shape = X.__array__().shape + + # this condition accounts for training on a 1-dimensional input + if len(X_shape) == 1: + return 1 + else: + return X_shape[1] + + class _ObjectiveFunctionWrapper: """Proxy class for objective function.""" @@ -906,13 +932,10 @@ def fit( params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"] params["metric"] = [metric for metric in params["metric"] if metric is not None] - # this needs to be set before calling _LGBMValidateData(), as that function uses - # self.n_features_in_ (the corresponding public attribute) to check that the input has - # the expected number of features - if isinstance(X, list): - self._n_features_in = len(X[0]) - else: - self._n_features_in = X.shape[1] + # `sklearn.utils.validation.validate_data()` expects self.n_features_in_ to already be set by the + # time it's called (if you call it with reset=True like LightGBM does), and the scikit-learn + # estimator checks complain if X.shape is accessed unconditionally + self._n_features_in = _num_features_for_raw_input(X) if not isinstance(X, (pd_DataFrame, dt_DataTable)): _X, _y = _LGBMValidateData( From 9a056701844e60dbdd2c37d32bf078c3434df080 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 5 Oct 2024 00:13:29 -0500 Subject: [PATCH 22/33] fix return type --- python-package/lightgbm/compat.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 6b56a4f9b0e3..b8db1e010464 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -45,22 +45,22 @@ def validate_data( # trap other keyword arguments that only work on scikit-learn >=1.6, like 'reset' **ignored_kwargs, ): + no_val_y = isinstance(y, str) and y == "no_validation" + # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here - if isinstance(y, str) and y == "no_validation": + if no_val_y: X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite) - return X # noqa: RET504 - - # if we reach here, we're validating features and labels - X, y = check_X_y( - X, - y, - accept_sparse=accept_sparse, - force_all_finite=ensure_all_finite, - ensure_min_samples=ensure_min_samples, - ) + else: + X, y = check_X_y( + X, + y, + accept_sparse=accept_sparse, + force_all_finite=ensure_all_finite, + ensure_min_samples=ensure_min_samples, + ) - # this only needs to be updated at fit() time - _estimator._n_features_in = X.shape[1] + # this only needs to be updated at fit() time + _estimator._n_features_in = X.shape[1] # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6 n_features = X.shape[1] @@ -70,7 +70,10 @@ def validate_data( f"is expecting {_estimator._n_features} features as input." ) - return X, y + if no_val_y: + return X + else: + return X, y SKLEARN_INSTALLED = True _LGBMBaseCrossValidator = BaseCrossValidator From 815433fa6d8ffe1d026bbaa4a5d04b2d5026b679 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 5 Oct 2024 00:16:22 -0500 Subject: [PATCH 23/33] remove now-unnecessary _LGBMCheckXY() --- python-package/lightgbm/compat.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index b8db1e010464..6e443ffcc48c 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -84,7 +84,6 @@ def validate_data( LGBMNotFittedError = NotFittedError _LGBMStratifiedKFold = StratifiedKFold _LGBMGroupKFold = GroupKFold - _LGBMCheckArray = check_array _LGBMCheckSampleWeight = _check_sample_weight _LGBMAssertAllFinite = assert_all_finite _LGBMCheckClassificationTargets = check_classification_targets @@ -113,7 +112,6 @@ class _LGBMRegressorBase: # type: ignore LGBMNotFittedError = ValueError _LGBMStratifiedKFold = None _LGBMGroupKFold = None - _LGBMCheckArray = None _LGBMCheckSampleWeight = None _LGBMAssertAllFinite = None _LGBMCheckClassificationTargets = None From ffebe417532f00ed622c4b81b1816d14dc0cfe2b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 5 Oct 2024 00:23:31 -0500 Subject: [PATCH 24/33] correct comment --- python-package/lightgbm/sklearn.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 8f03d53506e7..853b54de6b6e 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -932,9 +932,12 @@ def fit( params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"] params["metric"] = [metric for metric in params["metric"] if metric is not None] - # `sklearn.utils.validation.validate_data()` expects self.n_features_in_ to already be set by the - # time it's called (if you call it with reset=True like LightGBM does), and the scikit-learn - # estimator checks complain if X.shape is accessed unconditionally + # Tf self.n_features_in_ is set to any value other than ``None``, then it needs to be populated + # before ``sklearn.utils.validation.validate_data()`` is called, to avoid an error about mismatched + # feature numbers being raised on the first call to ``fit()``. + # + # lightgbm initializes that property to -1, so it can be an integer throughout its entire life + # (to help with type-checking), so it needs to be updated here. self._n_features_in = _num_features_for_raw_input(X) if not isinstance(X, (pd_DataFrame, dt_DataTable)): From f2cb2fe74151cad847f9c3177bbbe430ebc46b80 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 5 Oct 2024 22:26:50 -0500 Subject: [PATCH 25/33] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/compat.py | 2 +- python-package/lightgbm/sklearn.py | 2 +- tests/python_package_test/test_sklearn.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 6e443ffcc48c..ebb3f47ebcf2 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -33,7 +33,7 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any: from sklearn.utils.validation import validate_data except ImportError: # validate_data() was added in scikit-learn 1.6, this function roughly imitates it for older versions. - # It can be removed when lightbm's minimum scikit-learn version is at least 1.6. + # It can be removed when lightgbm's minimum scikit-learn version is at least 1.6. def validate_data( _estimator, X, diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 853b54de6b6e..998c26c2f3bc 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -932,7 +932,7 @@ def fit( params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"] params["metric"] = [metric for metric in params["metric"] if metric is not None] - # Tf self.n_features_in_ is set to any value other than ``None``, then it needs to be populated + # If self.n_features_in_ is set to any value other than ``None``, then it needs to be populated # before ``sklearn.utils.validation.validate_data()`` is called, to avoid an error about mismatched # feature numbers being raised on the first call to ``fit()``. # diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 1f7e7c7a04e8..fc996ca01cee 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1441,7 +1441,7 @@ def test_sklearn_integration(estimator, check): def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimator_class): est = estimator_class() more_tags = est._more_tags() - err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match." + err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn_tags__() to match." assert more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], err_msg # the try-except part of this should be removed once lightgbm's # minimum supported scikit-learn version is at least 1.6 From 86b5ab3b7bc58ebf9d8e4510f55c5d78768b0089 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sat, 5 Oct 2024 23:46:22 -0500 Subject: [PATCH 26/33] move __version__ import to compat.py, test with all ML tasks --- python-package/lightgbm/compat.py | 2 ++ python-package/lightgbm/sklearn.py | 5 ++- tests/python_package_test/test_sklearn.py | 40 +++++++++++++++++++---- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 6e443ffcc48c..617bd31b9e36 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -7,6 +7,7 @@ # see https://github.com/microsoft/LightGBM/issues/6509 """sklearn""" try: + from sklearn import __version__ as _sklearn_version from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.preprocessing import LabelEncoder from sklearn.utils.class_weight import compute_sample_weight @@ -117,6 +118,7 @@ class _LGBMRegressorBase: # type: ignore _LGBMCheckClassificationTargets = None _LGBMComputeSampleWeight = None _LGBMValidateData = None + _sklearn_version = None # additional scikit-learn imports only for type hints if TYPE_CHECKING: diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 853b54de6b6e..2c2193303569 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -40,6 +40,7 @@ _LGBMModelBase, _LGBMRegressorBase, _LGBMValidateData, + _sklearn_version, dt_DataTable, pd_DataFrame, ) @@ -729,11 +730,9 @@ def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally, # because that method isn't defined for scikit-learn<1.6 if not hasattr(_LGBMModelBase, "__sklearn_tags__"): - from sklearn import __version__ as sklearn_version - err_msg = ( "__sklearn_tags__() should not be called when using scikit-learn<1.6. " - f"detected version: {sklearn_version}" + f"detected version: {_sklearn_version}" ) raise AttributeError(err_msg) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 1f7e7c7a04e8..bb14e46ebf7d 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -43,6 +43,7 @@ "multiclass-classification": lgb.LGBMClassifier, "regression": lgb.LGBMRegressor, } +all_tasks = tuple(task_to_model_factory.keys()) def _create_data(task, n_samples=100, n_features=4): @@ -1459,7 +1460,7 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"] -@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"]) +@pytest.mark.parametrize("task", all_tasks) def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task): pd = pytest.importorskip("pandas") X, y, g = _create_data(task) @@ -1563,7 +1564,7 @@ def test_default_n_jobs(tmp_path): @pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed") -@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"]) +@pytest.mark.parametrize("task", all_tasks) def test_validate_features(task): X, y, g = _create_data(task, n_features=4) features = ["x1", "x2", "x3", "x4"] @@ -1586,24 +1587,49 @@ def test_validate_features(task): # LightGBM's 'predict_disable_shape_check' mechanism is intentionally not respected by # its scikit-learn estimators, for consistency with scikit-learn's own behavior. +@pytest.mark.parametrize("task", all_tasks) @pytest.mark.parametrize("predict_disable_shape_check", [True, False]) -def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check): - X, y, _ = _create_data(task="regression", n_features=4) +def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check, task): + X, y, g = _create_data(task, n_features=4) + model_factory = task_to_model_factory[task] + fit_kwargs = {"X": X[:, :-1], "y": y} + if task == "ranking": + estimator_name = "LGBMRanker" + fit_kwargs.update({"group": g}) + elif task == "regression": + estimator_name = "LGBMRegressor" + else: + estimator_name = "LGBMClassifier" + # train on the first 3 features - model = lgb.LGBMRegressor(n_estimators=5, num_leaves=7, verbose=-1).fit(X[:, :-1], y) + model = model_factory(n_estimators=5, num_leaves=7, verbose=-1).fit(**fit_kwargs) # more cols in X than features: error - with pytest.raises(ValueError, match="X has 4 features, but LGBMRegressor is expecting 3 features as input"): + err_msg = f"X has 4 features, but {estimator_name} is expecting 3 features as input" + with pytest.raises(ValueError, match=err_msg): model.predict(X, predict_disable_shape_check=predict_disable_shape_check) + if estimator_name == "LGBMClassifier": + with pytest.raises(ValueError, match=err_msg): + model.predict_proba(X, predict_disable_shape_check=predict_disable_shape_check) + # fewer cols in X than features: error - with pytest.raises(ValueError, match="X has 2 features, but LGBMRegressor is expecting 3 features as input"): + err_msg = f"X has 2 features, but {estimator_name} is expecting 3 features as input" + with pytest.raises(ValueError, match=err_msg): model.predict(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check) + if estimator_name == "LGBMClassifier": + with pytest.raises(ValueError, match=err_msg): + model.predict_proba(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check) + # same number of columns in both: no error preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check) assert preds.shape == y.shape + if estimator_name == "LGBMClassifier": + preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check) + assert preds.shape == y.shape + @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"]) @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"]) From 125f4eace8a6f0cc4e8b5c3de8aa054c89b3c934 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 6 Oct 2024 00:28:40 -0500 Subject: [PATCH 27/33] just set the setters and deleters --- python-package/lightgbm/sklearn.py | 41 ++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 2c2193303569..c691c921d4e7 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -937,7 +937,7 @@ def fit( # # lightgbm initializes that property to -1, so it can be an integer throughout its entire life # (to help with type-checking), so it needs to be updated here. - self._n_features_in = _num_features_for_raw_input(X) + # self._n_features_in = _num_features_for_raw_input(X) if not isinstance(X, (pd_DataFrame, dt_DataTable)): _X, _y = _LGBMValidateData( @@ -947,7 +947,7 @@ def fit( # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'. # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those. - reset=False, + reset=True, # allow any input type (this validation is done further down, in lgb.Dataset()) accept_sparse=True, # do not raise an error if Inf of NaN values are found (LightGBM handles these internally) @@ -1168,6 +1168,21 @@ def n_features_in_(self) -> int: raise LGBMNotFittedError("No n_features_in found. Need to call fit beforehand.") return self._n_features_in + @n_features_in_.setter + def n_features_in_(self, value: int) -> None: + """Set number of features found in passed-in dataset. + + Starting with ``scikit-learn`` 1.6, ``scikit-learn` expects to be able to directly + set this property in functions like ``validate_data()``. + + .. note:: + + Do not call ``estimator.n_features_in_ = some_int`` or anything else that invokes + this method. It is only here for compatibility with ``scikit-learn`` validation + functions used internally in ``lightgbm``. + """ + self._n_features_in = value + @property def best_score_(self) -> _LGBM_BoosterBestScoreType: """:obj:`dict`: The best score of fitted model.""" @@ -1266,6 +1281,28 @@ def feature_names_in_(self) -> np.ndarray: raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.") return np.array(self.feature_name_) + @feature_names_in_.deleter + def feature_names_in_(self) -> None: + """Intercept calls to delete ``feature_names_in_``. + + Some code paths in ``scikit-learn`` try to delete the ``feature_names_in_`` attribute + on estimators when a new training dataset that doesn't have features is passed. + LightGBM automatically assigns feature names to such datasets + (like ``Column_0``, ``Column_1``, etc.) and so does not want that behavior. + + However, that behavior is coupled to ``scikit-learn`` automatically updating + ``n_features_in_`` in those same code paths, which is necessary for compliance + with its API (via argument ``reset`` to functions like ``validate_data()`` and + ``check_array()``). + + .. note:: + + Do not call ``del estimator.feature_names_in_`` or anything else that invokes + this method. It is only here for compatibility with ``scikit-learn`` validation + functions used internally in ``lightgbm``. + """ + pass + class LGBMRegressor(_LGBMRegressorBase, LGBMModel): """LightGBM regressor.""" From 4233d702a6f4431805859a8059e4ee2ba563ca34 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 6 Oct 2024 00:55:56 -0500 Subject: [PATCH 28/33] set floor of scikit-learn>=0.24.2, fix ordering of n_features_in_ setting for older versions --- .ci/test-python-oldest.sh | 2 +- python-package/lightgbm/compat.py | 25 ++++++++++++++++---- python-package/lightgbm/sklearn.py | 37 +++--------------------------- python-package/pyproject.toml | 2 +- 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/.ci/test-python-oldest.sh b/.ci/test-python-oldest.sh index c6de079351e3..06db15b0eb7a 100644 --- a/.ci/test-python-oldest.sh +++ b/.ci/test-python-oldest.sh @@ -15,7 +15,7 @@ pip install \ 'numpy==1.19.0' \ 'pandas==1.1.3' \ 'pyarrow==6.0.1' \ - 'scikit-learn==0.24.0' \ + 'scikit-learn==0.24.2' \ 'scipy==1.6.0' \ || exit 1 echo "done installing lightgbm's dependencies" diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 617bd31b9e36..a09e97553fa0 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -46,6 +46,24 @@ def validate_data( # trap other keyword arguments that only work on scikit-learn >=1.6, like 'reset' **ignored_kwargs, ): + # it's safe to import _num_features unconditionally because: + # + # * it was first added in scikit-learn 0.24.2 + # * lightgbm cannot be used with scikit-learn versions older than that + # * this validate_data() re-implementation will not be called in scikit-learn>=1.6 + # + from sklearn.utils.validation import _num_features + + # _num_features() raises a TypeError on 1-dimensional input. That's a problem + # because scikit-learn's 'check_fit1d' estimator check sets that expectation that + # estimators must raise a ValueError when a 1-dimensional input is passed to fit(). + # + # So here, lightgbm avoids calling _num_features() on 1-dimensional inputs. + if hasattr(X, "shape") and len(X.shape) == 1: + n_features_in_ = 1 + else: + n_features_in_ = _num_features(X) + no_val_y = isinstance(y, str) and y == "no_validation" # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here @@ -61,13 +79,12 @@ def validate_data( ) # this only needs to be updated at fit() time - _estimator._n_features_in = X.shape[1] + _estimator.n_features_in_ = n_features_in_ # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6 - n_features = X.shape[1] - if _estimator.__sklearn_is_fitted__() and _estimator._n_features != n_features: + if _estimator.__sklearn_is_fitted__() and _estimator._n_features != n_features_in_: raise ValueError( - f"X has {n_features} features, but {_estimator.__class__.__name__} " + f"X has {n_features_in_} features, but {_estimator.__class__.__name__} " f"is expecting {_estimator._n_features} features as input." ) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index c691c921d4e7..e2b3e423a8aa 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -148,32 +148,6 @@ def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarra return weight -def _num_features_for_raw_input(X: _LGBM_ScikitMatrixLike) -> int: - """Get number of features from raw input data. - - Some ``scikit-learn`` versions accomplish this with a private function - ``sklearn.utils.validation._num_features()``. This is here to avoid depending on that. - """ - # if a list, assume a list of lists where each item is one row of training data, - # and all rows have the same number of features - if isinstance(X, list): - return len(X[0]) - - # scikit-learn estimator checks error out if .shape is accessed unconditionally... - # but allow hard-coding an assumption that anything that isn't a list and doesn't have .shape - # must be convertible to something following the Array API via a __array__() method - if hasattr(X, "shape"): - X_shape = X.shape - else: - X_shape = X.__array__().shape - - # this condition accounts for training on a 1-dimensional input - if len(X_shape) == 1: - return 1 - else: - return X_shape[1] - - class _ObjectiveFunctionWrapper: """Proxy class for objective function.""" @@ -931,14 +905,6 @@ def fit( params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"] params["metric"] = [metric for metric in params["metric"] if metric is not None] - # Tf self.n_features_in_ is set to any value other than ``None``, then it needs to be populated - # before ``sklearn.utils.validation.validate_data()`` is called, to avoid an error about mismatched - # feature numbers being raised on the first call to ``fit()``. - # - # lightgbm initializes that property to -1, so it can be an integer throughout its entire life - # (to help with type-checking), so it needs to be updated here. - # self._n_features_in = _num_features_for_raw_input(X) - if not isinstance(X, (pd_DataFrame, dt_DataTable)): _X, _y = _LGBMValidateData( self, @@ -960,6 +926,9 @@ def fit( else: _X, _y = X, y + # for other data types, setting n_features_in_ is handled by _LGBMValidateData() in the branch above + self.n_features_in_ = _X.shape[1] + if self._class_weight is None: self._class_weight = self.class_weight if self._class_weight is not None: diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 0d3d70feac66..4f6a17cb3666 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -45,7 +45,7 @@ pandas = [ "pandas>=0.24.0" ] scikit-learn = [ - "scikit-learn!=0.22.0" + "scikit-learn>=0.24.2" ] [project.urls] From e8e4cdb97a2f448f65dde2e6c1f5851fe015126c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 6 Oct 2024 01:48:51 -0500 Subject: [PATCH 29/33] Update python-package/lightgbm/sklearn.py --- python-package/lightgbm/sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index e2b3e423a8aa..a3829170ae38 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -1141,7 +1141,7 @@ def n_features_in_(self) -> int: def n_features_in_(self, value: int) -> None: """Set number of features found in passed-in dataset. - Starting with ``scikit-learn`` 1.6, ``scikit-learn` expects to be able to directly + Starting with ``scikit-learn`` 1.6, ``scikit-learn`` expects to be able to directly set this property in functions like ``validate_data()``. .. note:: From f22e4948943cc440bb848a44ecfa688b8d5c940f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 6 Oct 2024 22:09:10 -0500 Subject: [PATCH 30/33] forgot to commit ... fix comment --- python-package/lightgbm/sklearn.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index e2b3e423a8aa..0cc2c24c8466 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -1074,8 +1074,7 @@ def predict( # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y() # # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'. - # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an - # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those. + # These shouldn't be changed at predict() time. reset=False, # allow any input type (this validation is done further down, in lgb.Dataset()) accept_sparse=True, From c6e6fad4520a8caf1bf133d5521475f88c635ca7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 8 Oct 2024 14:30:20 -0500 Subject: [PATCH 31/33] Apply suggestions from code review Co-authored-by: Nikita Titov --- python-package/lightgbm/sklearn.py | 5 +---- tests/python_package_test/test_sklearn.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 30744144c3f8..45d3102dbbf2 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -706,7 +706,7 @@ def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]: if not hasattr(_LGBMModelBase, "__sklearn_tags__"): err_msg = ( "__sklearn_tags__() should not be called when using scikit-learn<1.6. " - f"detected version: {_sklearn_version}" + f"Detected version: {_sklearn_version}" ) raise AttributeError(err_msg) @@ -910,9 +910,6 @@ def fit( self, X, y, - # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'. - # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an - # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those. reset=True, # allow any input type (this validation is done further down, in lgb.Dataset()) accept_sparse=True, diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index b68daa86c5b1..6bf4a40c13a3 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1627,7 +1627,7 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl assert preds.shape == y.shape if estimator_name == "LGBMClassifier": - preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check) + preds = model.predict_proba(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check) assert preds.shape == y.shape From d8762e5b27007bce6be6b7b1050f2bbaa8da0599 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 8 Oct 2024 20:44:27 -0500 Subject: [PATCH 32/33] predict_proba() shape is (num_data, num_classes) for multi-class --- tests/python_package_test/test_sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 6bf4a40c13a3..6eca66ff20d3 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1628,7 +1628,7 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl if estimator_name == "LGBMClassifier": preds = model.predict_proba(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check) - assert preds.shape == y.shape + assert preds.shape[0] == y.shape[0] @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"]) From 8ef1debf71c0a1bdae36eed1f24714226c3e7706 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 9 Oct 2024 11:15:53 -0500 Subject: [PATCH 33/33] pass ensure_min_samples=1 at predict() time too --- python-package/lightgbm/compat.py | 7 ++++++- python-package/lightgbm/sklearn.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 77f08b883f36..96dee6522572 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -68,7 +68,12 @@ def validate_data( # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here if no_val_y: - X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite) + X = check_array( + X, + accept_sparse=accept_sparse, + force_all_finite=ensure_all_finite, + ensure_min_samples=ensure_min_samples, + ) else: X, y = check_X_y( X, diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 45d3102dbbf2..e8e46eb42b86 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -1077,6 +1077,8 @@ def predict( accept_sparse=True, # do not raise an error if Inf of NaN values are found (LightGBM handles these internally) ensure_all_finite=False, + # raise an error on 0-row inputs + ensure_min_samples=1, ) # retrieve original params that possibly can be used in both training and prediction # and then overwrite them (considering aliases) with params that were passed directly in prediction