From 1adb77be7016bf8510fbdaea106a97e155644e06 Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Wed, 11 Sep 2024 11:40:51 +0100
Subject: [PATCH 01/33] __sklearn_tags__ replacing sklearn's
 BaseEstimator._more_tags_

---
 python-package/lightgbm/sklearn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index ad805eef7332..497f8f6c0b8f 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -662,7 +662,7 @@ def __init__(
         self._n_classes: int = -1
         self.set_params(**kwargs)
 
-    def _more_tags(self) -> Dict[str, Any]:
+    def __sklearn_tags__(self) -> Dict[str, Any]:
         return {
             "allow_nan": True,
             "X_types": ["2darray", "sparse", "1dlabels"],

From 8ed87d2c0ce5ffa9ce287ae567d973424102bd41 Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Wed, 11 Sep 2024 12:01:17 +0100
Subject: [PATCH 02/33] fixing tags dict -> dataclass

---
 python-package/lightgbm/compat.py  |  2 ++
 python-package/lightgbm/sklearn.py | 16 +++++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index f916ca6be723..07b091e7094d 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -12,6 +12,7 @@
     from sklearn.utils.class_weight import compute_sample_weight
     from sklearn.utils.multiclass import check_classification_targets
     from sklearn.utils.validation import assert_all_finite, check_array, check_X_y
+    from sklearn.utils import Tags
 
     try:
         from sklearn.exceptions import NotFittedError
@@ -44,6 +45,7 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any:
     _LGBMAssertAllFinite = assert_all_finite
     _LGBMCheckClassificationTargets = check_classification_targets
     _LGBMComputeSampleWeight = compute_sample_weight
+    _LGBMTags = Tags
 except ImportError:
     SKLEARN_INSTALLED = False
 
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 497f8f6c0b8f..6dcbb57dd665 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -24,6 +24,7 @@
     _LGBM_InitScoreType,
     _LGBM_LabelType,
     _LGBM_WeightType,
+    _LGBMTags,
     _log_warning,
 )
 from .callback import _EvalResultDict, record_evaluation
@@ -662,16 +663,17 @@ def __init__(
         self._n_classes: int = -1
         self.set_params(**kwargs)
 
-    def __sklearn_tags__(self) -> Dict[str, Any]:
-        return {
-            "allow_nan": True,
-            "X_types": ["2darray", "sparse", "1dlabels"],
-            "_xfail_checks": {
+    def __sklearn_tags__(self) -> _LGBMTags:
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = True
+        tags.input_tags.sparse = True
+        tags.target_tags.one_d_labels = True
+        tags._xfail_checks = {
                 "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
                 "cannot be set in __init__: "
                 "(see https://github.com/microsoft/LightGBM/issues/2628)"
-            },
-        }
+            }
+        return tags
 
     def __sklearn_is_fitted__(self) -> bool:
         return getattr(self, "fitted_", False)

From 32ec431605a091328270cb85a408e4883ceeab3c Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Wed, 11 Sep 2024 12:03:16 +0100
Subject: [PATCH 03/33] fixing wrong import

---
 python-package/lightgbm/sklearn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 6dcbb57dd665..c41264a437be 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -24,7 +24,6 @@
     _LGBM_InitScoreType,
     _LGBM_LabelType,
     _LGBM_WeightType,
-    _LGBMTags,
     _log_warning,
 )
 from .callback import _EvalResultDict, record_evaluation
@@ -42,6 +41,7 @@
     _LGBMLabelEncoder,
     _LGBMModelBase,
     _LGBMRegressorBase,
+    _LGBMTags,
     dt_DataTable,
     pd_DataFrame,
 )

From ade9798d46a45d1b419dccc4f7e7b9a8e132b798 Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Wed, 11 Sep 2024 12:18:43 +0100
Subject: [PATCH 04/33] remove type hint

---
 python-package/lightgbm/sklearn.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index c41264a437be..1b64338408af 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -41,7 +41,6 @@
     _LGBMLabelEncoder,
     _LGBMModelBase,
     _LGBMRegressorBase,
-    _LGBMTags,
     dt_DataTable,
     pd_DataFrame,
 )
@@ -663,7 +662,7 @@ def __init__(
         self._n_classes: int = -1
         self.set_params(**kwargs)
 
-    def __sklearn_tags__(self) -> _LGBMTags:
+    def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.input_tags.allow_nan = True
         tags.input_tags.sparse = True

From 2085a128529446a0f00c813a8184ce339d0865ea Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Wed, 11 Sep 2024 12:22:16 +0100
Subject: [PATCH 05/33] remove type hint

---
 python-package/lightgbm/compat.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 07b091e7094d..f916ca6be723 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -12,7 +12,6 @@
     from sklearn.utils.class_weight import compute_sample_weight
     from sklearn.utils.multiclass import check_classification_targets
     from sklearn.utils.validation import assert_all_finite, check_array, check_X_y
-    from sklearn.utils import Tags
 
     try:
         from sklearn.exceptions import NotFittedError
@@ -45,7 +44,6 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any:
     _LGBMAssertAllFinite = assert_all_finite
     _LGBMCheckClassificationTargets = check_classification_targets
     _LGBMComputeSampleWeight = compute_sample_weight
-    _LGBMTags = Tags
 except ImportError:
     SKLEARN_INSTALLED = False
 

From a9ec348890dce2e4b92db9dc6f87527722374307 Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Wed, 11 Sep 2024 12:30:24 +0100
Subject: [PATCH 06/33] fix linting

---
 python-package/lightgbm/sklearn.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 1b64338408af..68dd78c5fb18 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -668,10 +668,10 @@ def __sklearn_tags__(self):
         tags.input_tags.sparse = True
         tags.target_tags.one_d_labels = True
         tags._xfail_checks = {
-                "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
-                "cannot be set in __init__: "
-                "(see https://github.com/microsoft/LightGBM/issues/2628)"
-            }
+            "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
+            "cannot be set in __init__: "
+            "(see https://github.com/microsoft/LightGBM/issues/2628)"
+        }
         return tags
 
     def __sklearn_is_fitted__(self) -> bool:

From fcc4e1299c426e6aeec938cf6832b6cf330b709c Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Sat, 14 Sep 2024 10:11:06 +0100
Subject: [PATCH 07/33] triggering new CI (scikit-learn dev has changed)


From 3b15646fe22d0036910cfcda39e6bcaa1a3306a6 Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Sun, 15 Sep 2024 13:10:48 +0100
Subject: [PATCH 08/33] bringing back _more_tags, adding convertsion from
 more_tags to sklearn_tags

---
 python-package/lightgbm/sklearn.py | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 68dd78c5fb18..c189ca503a87 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -662,16 +662,27 @@ def __init__(
         self._n_classes: int = -1
         self.set_params(**kwargs)
 
+    def _more_tags(self) -> Dict[str, Any]:
+        return {
+            "allow_nan": True,
+            "X_types": ["2darray", "sparse", "1dlabels"],
+            "_xfail_checks": {
+                "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
+                "cannot be set in __init__: "
+                "(see https://github.com/microsoft/LightGBM/issues/2628)"
+            },
+        }
+
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
-        tags.input_tags.allow_nan = True
-        tags.input_tags.sparse = True
-        tags.target_tags.one_d_labels = True
-        tags._xfail_checks = {
-            "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
-            "cannot be set in __init__: "
-            "(see https://github.com/microsoft/LightGBM/issues/2628)"
-        }
+        more_tags = self._more_tags()
+        tags.input_tags.allow_nan = more_tags.pop('allow_nan', False)
+        tagged_input_types = more_tags.pop('X_types', [])
+        tags.input_tags.sparse = 'sparse' in tagged_input_types
+        tags.target_tags.one_d_labels = '1dlabels' in tagged_input_types
+        tags._xfail_checks = more_tags.pop('_xfail_checks', {})
+        if more_tags or set(tagged_input_types).difference({"2darray", "sparse", "1dlabels"}):
+            _log_warning(f'Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`')
         return tags
 
     def __sklearn_is_fitted__(self) -> bool:

From 34d9eb4d0abaa75de064b6e95dc1b5979af6e561 Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Sun, 15 Sep 2024 19:19:11 +0100
Subject: [PATCH 09/33] lint fix

---
 python-package/lightgbm/sklearn.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index c189ca503a87..e7e57dba2ba7 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -676,13 +676,13 @@ def _more_tags(self) -> Dict[str, Any]:
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         more_tags = self._more_tags()
-        tags.input_tags.allow_nan = more_tags.pop('allow_nan', False)
-        tagged_input_types = more_tags.pop('X_types', [])
-        tags.input_tags.sparse = 'sparse' in tagged_input_types
-        tags.target_tags.one_d_labels = '1dlabels' in tagged_input_types
-        tags._xfail_checks = more_tags.pop('_xfail_checks', {})
+        tags.input_tags.allow_nan = more_tags.pop("allow_nan", False)
+        tagged_input_types = more_tags.pop("X_types", [])
+        tags.input_tags.sparse = "sparse" in tagged_input_types
+        tags.target_tags.one_d_labels = "1dlabels" in tagged_input_types
+        tags._xfail_checks = more_tags.pop("_xfail_checks", {})
         if more_tags or set(tagged_input_types).difference({"2darray", "sparse", "1dlabels"}):
-            _log_warning(f'Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`')
+            _log_warning(f"Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`")
         return tags
 
     def __sklearn_is_fitted__(self) -> bool:

From 6d20ef858362d468e2008ce343312d9d70908d6b Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Mon, 16 Sep 2024 07:34:15 +0100
Subject: [PATCH 10/33] Update python-package/lightgbm/sklearn.py

Co-authored-by: James Lamb <jaylamb20@gmail.com>
---
 python-package/lightgbm/sklearn.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index e7e57dba2ba7..b974e7c8d097 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -676,11 +676,10 @@ def _more_tags(self) -> Dict[str, Any]:
     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         more_tags = self._more_tags()
-        tags.input_tags.allow_nan = more_tags.pop("allow_nan", False)
-        tagged_input_types = more_tags.pop("X_types", [])
-        tags.input_tags.sparse = "sparse" in tagged_input_types
-        tags.target_tags.one_d_labels = "1dlabels" in tagged_input_types
-        tags._xfail_checks = more_tags.pop("_xfail_checks", {})
+        tags.input_tags.allow_nan = more_tags["allow_nan"]
+        tags.input_tags.sparse = "sparse" in more_tags["X_types"]
+        tags.target_tags.one_d_labels = "1dlabels" in more_tags["X_types"]
+        tags._xfail_checks = more_tags["_xfail_checks"]
         if more_tags or set(tagged_input_types).difference({"2darray", "sparse", "1dlabels"}):
             _log_warning(f"Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`")
         return tags

From d715311f3aa8f8228cddbc7e784085d16f06737f Mon Sep 17 00:00:00 2001
From: vnherdeiro <victor.oriedreh@gmail.com>
Date: Mon, 16 Sep 2024 07:35:41 +0100
Subject: [PATCH 11/33] adressing PR comments

---
 python-package/lightgbm/sklearn.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index b974e7c8d097..fa42dff54ddc 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -674,14 +674,16 @@ def _more_tags(self) -> Dict[str, Any]:
         }
 
     def __sklearn_tags__(self):
+        # scikit-learn 1.6 introduced an __sklearn__tags() method intended to replace _more_tags().
+        # _more_tags() can be removed whenever lightgbm's minimum supported scikit-learn version
+        # is >=1.6.
+        # ref: https://github.com/microsoft/LightGBM/pull/6651
         tags = super().__sklearn_tags__()
         more_tags = self._more_tags()
         tags.input_tags.allow_nan = more_tags["allow_nan"]
         tags.input_tags.sparse = "sparse" in more_tags["X_types"]
         tags.target_tags.one_d_labels = "1dlabels" in more_tags["X_types"]
         tags._xfail_checks = more_tags["_xfail_checks"]
-        if more_tags or set(tagged_input_types).difference({"2darray", "sparse", "1dlabels"}):
-            _log_warning(f"Some tags sklearn tag values are missing from __sklearn_tags__: `{more_tags}`")
         return tags
 
     def __sklearn_is_fitted__(self) -> bool:

From c4ec9a4939894885d8ec9eee8dd1911c62d40107 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Mon, 16 Sep 2024 11:32:51 -0500
Subject: [PATCH 12/33] move comment

---
 python-package/lightgbm/sklearn.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index fa42dff54ddc..fe97053a4241 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -662,6 +662,10 @@ def __init__(
         self._n_classes: int = -1
         self.set_params(**kwargs)
 
+    # scikit-learn 1.6 introduced an __sklearn__tags() method intended to replace _more_tags().
+    # _more_tags() can be removed whenever lightgbm's minimum supported scikit-learn version
+    # is >=1.6.
+    # ref: https://github.com/microsoft/LightGBM/pull/6651
     def _more_tags(self) -> Dict[str, Any]:
         return {
             "allow_nan": True,
@@ -674,10 +678,6 @@ def _more_tags(self) -> Dict[str, Any]:
         }
 
     def __sklearn_tags__(self):
-        # scikit-learn 1.6 introduced an __sklearn__tags() method intended to replace _more_tags().
-        # _more_tags() can be removed whenever lightgbm's minimum supported scikit-learn version
-        # is >=1.6.
-        # ref: https://github.com/microsoft/LightGBM/pull/6651
         tags = super().__sklearn_tags__()
         more_tags = self._more_tags()
         tags.input_tags.allow_nan = more_tags["allow_nan"]

From b0a47035eadd285cf1d34cf04367653fafe72405 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Fri, 20 Sep 2024 23:57:32 -0500
Subject: [PATCH 13/33] updates

---
 python-package/lightgbm/sklearn.py        | 48 +++++++++++++++++++++--
 tests/python_package_test/test_sklearn.py | 18 +++++++--
 2 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index fe97053a4241..2648a92c4ff5 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -677,15 +677,38 @@ def _more_tags(self) -> Dict[str, Any]:
             },
         }
 
-    def __sklearn_tags__(self):
-        tags = super().__sklearn_tags__()
-        more_tags = self._more_tags()
+    @staticmethod
+    def _update_sklearn_tags_from_dict(
+        *,
+        tags: "sklearn.utils.Tags",
+        tags_dict: Dict[str, Any]
+    ) -> "sklearn.utils.Tags":
+        """
+        scikit-learn 1.6 introduced a dataclass-based interface for estimator tags.
+        ref: https://github.com/scikit-learn/scikit-learn/pull/29677
+
+        That interface means that each 
+        """
         tags.input_tags.allow_nan = more_tags["allow_nan"]
         tags.input_tags.sparse = "sparse" in more_tags["X_types"]
         tags.target_tags.one_d_labels = "1dlabels" in more_tags["X_types"]
         tags._xfail_checks = more_tags["_xfail_checks"]
         return tags
 
+    def __sklearn_tags__(self):
+        # super().__sklearn_tags__() cannot be called unconditionally,
+        # because that method isn't defined for scikit-learn<1.6
+        if not callable(getattr(super(), "__sklearn_tags__", None)):
+            return None
+
+        # take whatever tags are provided by BaseEstimator, then modify
+        # them with LightGBM-specific values
+        tags = self._update_sklearn_tags_from_dict(
+            tags=super().__sklearn_tags__(),
+            tags_dict=self._more_tags()
+        )
+        return tags
+
     def __sklearn_is_fitted__(self) -> bool:
         return getattr(self, "fitted_", False)
 
@@ -1182,6 +1205,17 @@ def feature_names_in_(self) -> np.ndarray:
 class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
     """LightGBM regressor."""
 
+    def _more_tags(self) -> Dict[str, Any]:
+        tags = super(LGBMModel, self)._more_tags()
+        tags.update(super(_LGBMRegressorBase, self)._more_tags())
+        return tags
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        if tags is None:
+            return None
+        
+
     def fit(  # type: ignore[override]
         self,
         X: _LGBM_ScikitMatrixLike,
@@ -1228,6 +1262,14 @@ def fit(  # type: ignore[override]
 class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
     """LightGBM classifier."""
 
+    def _more_tags(self) -> Dict[str, Any]:
+        tags = super(LGBMModel, self)._more_tags()
+        tags.update(super(_LGBMClassifierBase, self)._more_tags())
+        return tags
+
+    def __sklearn_tags__(self):
+        return super().__
+
     def fit(  # type: ignore[override]
         self,
         X: _LGBM_ScikitMatrixLike,
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 01ab057cf3e2..74f99736b454 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -36,6 +36,7 @@
 )
 
 decreasing_generator = itertools.count(0, -1)
+estimator_classes = (lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker)
 task_to_model_factory = {
     "ranking": lgb.LGBMRanker,
     "binary-classification": lgb.LGBMClassifier,
@@ -1311,7 +1312,7 @@ def test_check_is_fitted():
         check_is_fitted(model)
 
 
-@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+@pytest.mark.parametrize("estimator_class", estimator_classes)
 @pytest.mark.parametrize("max_depth", [3, 4, 5, 8])
 def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
     X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
@@ -1390,7 +1391,7 @@ def test_fit_only_raises_num_rounds_warning_when_expected(capsys):
     assert_silent(capsys)
 
 
-@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+@pytest.mark.parametrize("estimator_class", estimator_classes)
 def test_getting_feature_names_in_np_input(estimator_class):
     # input is a numpy array, which doesn't have feature names. LightGBM adds
     # feature names to the fitted model, which is inconsistent with sklearn's behavior
@@ -1409,7 +1410,7 @@ def test_getting_feature_names_in_np_input(estimator_class):
     np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
 
 
-@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+@pytest.mark.parametrize("estimator_class", estimator_classes)
 def test_getting_feature_names_in_pd_input(estimator_class):
     X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
     col_names = X.columns.to_list()
@@ -1436,6 +1437,17 @@ def test_sklearn_integration(estimator, check):
     check(estimator)
 
 
+@pytest.mark.parametrize("estimator_class", estimator_classes)
+def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimator_class):
+    est = estimator_class()
+    more_tags = est._more_tags()
+    assert (
+        more_tags["X_types"] == ["2darray", "sparse", "1dlabels"],
+        "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match.",
+    )
+    sklearn_tags = est.__sklearn_tags__()
+
+
 @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
 def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
     pd = pytest.importorskip("pandas")

From 7eb861addebd6d446be511c35a00b85158d11fa9 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Tue, 24 Sep 2024 00:38:43 -0500
Subject: [PATCH 14/33] remove uses of super()

---
 .ci/test.sh                               |  1 +
 python-package/lightgbm/sklearn.py        | 77 ++++++++++++++---------
 tests/python_package_test/test_sklearn.py | 13 ++--
 3 files changed, 57 insertions(+), 34 deletions(-)

diff --git a/.ci/test.sh b/.ci/test.sh
index a2e2dde070a2..d05ce857e3dd 100755
--- a/.ci/test.sh
+++ b/.ci/test.sh
@@ -103,6 +103,7 @@ if [[ $TASK == "lint" ]]; then
         'mypy>=1.11.1' \
         'pre-commit>=3.8.0' \
         'pyarrow-core>=17.0' \
+        'scikit-learn>=1.15.0' \
         'r-lintr>=3.1.2'
     source activate $CONDA_ENV
     echo "Linting Python code"
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 2648a92c4ff5..fec9cbe21aa5 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -4,7 +4,7 @@
 import copy
 from inspect import signature
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import scipy.sparse
@@ -46,6 +46,12 @@
 )
 from .engine import train
 
+if TYPE_CHECKING:
+    try:
+        from sklearn.utils import Tags as _sklearn_Tags
+    except ImportError:
+        _sklearn_Tags = None
+
 __all__ = [
     "LGBMClassifier",
     "LGBMModel",
@@ -673,41 +679,45 @@ def _more_tags(self) -> Dict[str, Any]:
             "_xfail_checks": {
                 "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
                 "cannot be set in __init__: "
-                "(see https://github.com/microsoft/LightGBM/issues/2628)"
+                "(see https://github.com/microsoft/LightGBM/issues/2628)",
+                "check_n_features_in_after_fitting": (
+                    "validate_data() was first added in scikit-learn 1.6 and lightgbm"
+                    "supports much older versions than that"
+                ),
             },
         }
 
     @staticmethod
     def _update_sklearn_tags_from_dict(
         *,
-        tags: "sklearn.utils.Tags",
-        tags_dict: Dict[str, Any]
-    ) -> "sklearn.utils.Tags":
-        """
-        scikit-learn 1.6 introduced a dataclass-based interface for estimator tags.
+        tags: "_sklearn_Tags",
+        tags_dict: Dict[str, Any],
+    ) -> "_sklearn_Tags":
+        """Update ``sklearn.utils.Tags`` inherited from ``scikit-learn`` base classes.
+
+        ``scikit-learn`` 1.6 introduced a dataclass-based interface for estimator tags.
         ref: https://github.com/scikit-learn/scikit-learn/pull/29677
 
-        That interface means that each 
+        This method handles updating that instance based on the value in ``self._more_tags()``.
         """
-        tags.input_tags.allow_nan = more_tags["allow_nan"]
-        tags.input_tags.sparse = "sparse" in more_tags["X_types"]
-        tags.target_tags.one_d_labels = "1dlabels" in more_tags["X_types"]
-        tags._xfail_checks = more_tags["_xfail_checks"]
+        tags.input_tags.allow_nan = tags_dict["allow_nan"]
+        tags.input_tags.sparse = "sparse" in tags_dict["X_types"]
+        tags.target_tags.one_d_labels = "1dlabels" in tags_dict["X_types"]
+        tags._xfail_checks = tags_dict["_xfail_checks"]
         return tags
 
-    def __sklearn_tags__(self):
-        # super().__sklearn_tags__() cannot be called unconditionally,
+    def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
+        # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally,
         # because that method isn't defined for scikit-learn<1.6
-        if not callable(getattr(super(), "__sklearn_tags__", None)):
+        if not callable(getattr(_LGBMModelBase, "__sklearn_tags__", None)):
             return None
 
         # take whatever tags are provided by BaseEstimator, then modify
         # them with LightGBM-specific values
-        tags = self._update_sklearn_tags_from_dict(
-            tags=super().__sklearn_tags__(),
-            tags_dict=self._more_tags()
+        return self._update_sklearn_tags_from_dict(
+            tags=_LGBMModelBase.__sklearn_tags__(self),
+            tags_dict=self._more_tags(),
         )
-        return tags
 
     def __sklearn_is_fitted__(self) -> bool:
         return getattr(self, "fitted_", False)
@@ -1206,15 +1216,17 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
     """LightGBM regressor."""
 
     def _more_tags(self) -> Dict[str, Any]:
-        tags = super(LGBMModel, self)._more_tags()
-        tags.update(super(_LGBMRegressorBase, self)._more_tags())
+        # handle the case where ClassifierMixin possibly provides _more_tags()
+        if callable(getattr(_LGBMClassifierBase, "_more_tags", None)):
+            tags = _LGBMClassifierBase._more_tags(self)
+        else:
+            tags = {}
+        # override those with LightGBM-specific preferences
+        tags.update(LGBMModel._more_tags(self))
         return tags
 
-    def __sklearn_tags__(self):
-        tags = super().__sklearn_tags__()
-        if tags is None:
-            return None
-        
+    def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
+        return LGBMModel.__sklearn_tags__(self)
 
     def fit(  # type: ignore[override]
         self,
@@ -1263,12 +1275,17 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
     """LightGBM classifier."""
 
     def _more_tags(self) -> Dict[str, Any]:
-        tags = super(LGBMModel, self)._more_tags()
-        tags.update(super(_LGBMClassifierBase, self)._more_tags())
+        # handle the case where ClassifierMixin possibly provides _more_tags()
+        if callable(getattr(_LGBMClassifierBase, "_more_tags", None)):
+            tags = _LGBMClassifierBase._more_tags(self)
+        else:
+            tags = {}
+        # override those with LightGBM-specific preferences
+        tags.update(LGBMModel._more_tags(self))
         return tags
 
-    def __sklearn_tags__(self):
-        return super().__
+    def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
+        return LGBMModel.__sklearn_tags__(self)
 
     def fit(  # type: ignore[override]
         self,
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 74f99736b454..216fd50ab0c5 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1441,11 +1441,16 @@ def test_sklearn_integration(estimator, check):
 def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimator_class):
     est = estimator_class()
     more_tags = est._more_tags()
-    assert (
-        more_tags["X_types"] == ["2darray", "sparse", "1dlabels"],
-        "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match.",
-    )
+    err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match."
+    assert more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], err_msg
     sklearn_tags = est.__sklearn_tags__()
+    # these tests should be run unconditionally (no 'if') once lightgbm's
+    # minimum scikit-learn version is 1.6 or higher
+    if sklearn_tags is not None:
+        assert sklearn_tags.input_tags.allow_nan is True
+        assert sklearn_tags.input_tags.sparse is True
+        assert sklearn_tags.target_tags.one_d_labels is True
+        assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"]
 
 
 @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])

From b137ba25a0aaeef2589762f261c77f438bcb19d3 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Tue, 24 Sep 2024 00:49:21 -0500
Subject: [PATCH 15/33] fix version constraint in lint job, add one more
 comment

---
 .ci/test.sh                        | 2 +-
 python-package/lightgbm/sklearn.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.ci/test.sh b/.ci/test.sh
index d05ce857e3dd..9fc4db54adaf 100755
--- a/.ci/test.sh
+++ b/.ci/test.sh
@@ -103,7 +103,7 @@ if [[ $TASK == "lint" ]]; then
         'mypy>=1.11.1' \
         'pre-commit>=3.8.0' \
         'pyarrow-core>=17.0' \
-        'scikit-learn>=1.15.0' \
+        'scikit-learn>=1.5.2' \
         'r-lintr>=3.1.2'
     source activate $CONDA_ENV
     echo "Linting Python code"
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index fec9cbe21aa5..600c77207fec 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -47,6 +47,8 @@
 from .engine import train
 
 if TYPE_CHECKING:
+    # sklearn.utils.Tags can be imported unconditionally once
+    # lightgbm's minimum scikit-learn version is 1.6 or higher
     try:
         from sklearn.utils import Tags as _sklearn_Tags
     except ImportError:

From d1915c0f008a62936ba0aa8f9f66bdd050dc14ec Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Tue, 24 Sep 2024 08:25:51 -0500
Subject: [PATCH 16/33] Update python-package/lightgbm/sklearn.py

---
 python-package/lightgbm/sklearn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 600c77207fec..7c085f249eb7 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1219,8 +1219,8 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
 
     def _more_tags(self) -> Dict[str, Any]:
         # handle the case where ClassifierMixin possibly provides _more_tags()
-        if callable(getattr(_LGBMClassifierBase, "_more_tags", None)):
-            tags = _LGBMClassifierBase._more_tags(self)
+        if callable(getattr(_LGBMRegressorBase, "_more_tags", None)):
+            tags = _LGBMRegressorBase._more_tags(self)
         else:
             tags = {}
         # override those with LightGBM-specific preferences

From 118efd95b898141ba854989fa5fe650727997d06 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Tue, 1 Oct 2024 22:58:19 -0500
Subject: [PATCH 17/33] use scikit-learn 1.6 nightlies again, move some code to
 compat.py, restructure tests

---
 .ci/test-python-latest.sh                 |  2 +-
 python-package/lightgbm/compat.py         | 12 +++++++++++-
 python-package/lightgbm/sklearn.py        | 24 ++++++++++++-----------
 tests/python_package_test/test_sklearn.py | 14 +++++++++----
 4 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/.ci/test-python-latest.sh b/.ci/test-python-latest.sh
index f98f29f2641a..08fc8558ef3e 100755
--- a/.ci/test-python-latest.sh
+++ b/.ci/test-python-latest.sh
@@ -22,7 +22,7 @@ python -m pip install \
         'numpy>=2.0.0.dev0' \
         'matplotlib>=3.10.0.dev0' \
         'pandas>=3.0.0.dev0' \
-        'scikit-learn==1.5.*' \
+        'scikit-learn>=1.6.dev0' \
         'scipy>=1.15.0.dev0'
 
 python -m pip install \
diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index f916ca6be723..09eed6d6c83c 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -1,7 +1,7 @@
 # coding: utf-8
 """Compatibility library."""
 
-from typing import Any, List
+from typing import TYPE_CHECKING, Any, List
 
 # scikit-learn is intentionally imported first here,
 # see https://github.com/microsoft/LightGBM/issues/6509
@@ -74,6 +74,16 @@ class _LGBMRegressorBase:  # type: ignore
     _LGBMCheckClassificationTargets = None
     _LGBMComputeSampleWeight = None
 
+# additional scikit-learn imports only for type hints
+if TYPE_CHECKING:
+    # sklearn.utils.Tags can be imported unconditionally once
+    # lightgbm's minimum scikit-learn version is 1.6 or higher
+    try:
+        from sklearn.utils import Tags as _sklearn_Tags
+    except ImportError:
+        _sklearn_Tags = None
+
+
 """pandas"""
 try:
     from pandas import DataFrame as pd_DataFrame
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 7c085f249eb7..4cf305d777c1 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -47,12 +47,8 @@
 from .engine import train
 
 if TYPE_CHECKING:
-    # sklearn.utils.Tags can be imported unconditionally once
-    # lightgbm's minimum scikit-learn version is 1.6 or higher
-    try:
-        from sklearn.utils import Tags as _sklearn_Tags
-    except ImportError:
-        _sklearn_Tags = None
+    from .compat import _sklearn_Tags
+
 
 __all__ = [
     "LGBMClassifier",
@@ -711,8 +707,14 @@ def _update_sklearn_tags_from_dict(
     def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
         # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally,
         # because that method isn't defined for scikit-learn<1.6
-        if not callable(getattr(_LGBMModelBase, "__sklearn_tags__", None)):
-            return None
+        if not hasattr(self, "__sklearn_tags__"):
+            from sklearn import __version__ as sklearn_version
+
+            err_msg = (
+                "__sklearn_tags__() should not be called when using scikit-learn<1.6. "
+                f"detected version: {sklearn_version}"
+            )
+            raise AttributeError(err_msg)
 
         # take whatever tags are provided by BaseEstimator, then modify
         # them with LightGBM-specific values
@@ -1218,7 +1220,7 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
     """LightGBM regressor."""
 
     def _more_tags(self) -> Dict[str, Any]:
-        # handle the case where ClassifierMixin possibly provides _more_tags()
+        # handle the case where RegressorMixin possibly provides _more_tags()
         if callable(getattr(_LGBMRegressorBase, "_more_tags", None)):
             tags = _LGBMRegressorBase._more_tags(self)
         else:
@@ -1227,7 +1229,7 @@ def _more_tags(self) -> Dict[str, Any]:
         tags.update(LGBMModel._more_tags(self))
         return tags
 
-    def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
+    def __sklearn_tags__(self) -> "_sklearn_Tags":
         return LGBMModel.__sklearn_tags__(self)
 
     def fit(  # type: ignore[override]
@@ -1286,7 +1288,7 @@ def _more_tags(self) -> Dict[str, Any]:
         tags.update(LGBMModel._more_tags(self))
         return tags
 
-    def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
+    def __sklearn_tags__(self) -> "_sklearn_Tags":
         return LGBMModel.__sklearn_tags__(self)
 
     def fit(  # type: ignore[override]
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 216fd50ab0c5..1d127fa6abae 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1443,10 +1443,16 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato
     more_tags = est._more_tags()
     err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match."
     assert more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], err_msg
-    sklearn_tags = est.__sklearn_tags__()
-    # these tests should be run unconditionally (no 'if') once lightgbm's
-    # minimum scikit-learn version is 1.6 or higher
-    if sklearn_tags is not None:
+    # the try-except part of this should be removed once lightgbm's
+    # minimum supported scikit-learn version is at least 1.6
+    try:
+        sklearn_tags = est.__sklearn_tags__()
+    except AttributeError as err:
+        # only the exact error we expected to be raised should be raised
+        assert bool(re.search(r"__sklearn_tags__.* should not be called", err))
+    else:
+        # if no AttributeError was thrown, we must be using scikit-learn>=1.6,
+        # and so the actual effects of __sklearn_tags__() should be tested
         assert sklearn_tags.input_tags.allow_nan is True
         assert sklearn_tags.input_tags.sparse is True
         assert sklearn_tags.target_tags.one_d_labels is True

From 4fb82f3080081c960c6dcca7e1798f2ec83b1876 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Wed, 2 Oct 2024 23:38:09 -0500
Subject: [PATCH 18/33] optionally use validate_data(), starting in
 scikit-learn 1.6

---
 python-package/lightgbm/compat.py  | 36 ++++++++++++++++++++++++++++--
 python-package/lightgbm/sklearn.py | 34 ++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 09eed6d6c83c..c8ba580b2d4d 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -29,6 +29,38 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any:
             check_consistent_length(sample_weight, X)
             return sample_weight
 
+    try:
+        from sklearn.utils.validation import validate_data
+    except ImportError:
+        # validate_data() was added in scikit-learn 1.6, this function roughly imitates it for older versions.
+        # It can be removed when lightbm's minimum scikit-learn version is at least 1.6.
+        def validate_data(
+            _estimator,
+            X,
+            y="no_validation",
+            reset: bool = False,
+            accept_sparse: bool = True,
+            # 'force_all_finite' was renamed to 'ensure_all_finite' in scikit-learn 1.6
+            ensure_all_finite: bool = False,
+            ensure_min_samples: int = 1,
+        ):
+            # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here
+            if isinstance(y, str) and y == "no_validation":
+                X = check_array(X, reset=reset, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite)
+            else:
+                check_X_y(
+                    X,
+                    y,
+                    reset=reset,
+                    accept_sparse=accept_sparse,
+                    force_all_finite=ensure_all_finite,
+                    ensure_min_samples=ensure_min_samples,
+                )
+                # this only needs to be updated at fit() time
+                _estimator.n_features_in_ = _estimator._n_features
+
+            return X, y
+
     SKLEARN_INSTALLED = True
     _LGBMBaseCrossValidator = BaseCrossValidator
     _LGBMModelBase = BaseEstimator
@@ -38,12 +70,12 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any:
     LGBMNotFittedError = NotFittedError
     _LGBMStratifiedKFold = StratifiedKFold
     _LGBMGroupKFold = GroupKFold
-    _LGBMCheckXY = check_X_y
     _LGBMCheckArray = check_array
     _LGBMCheckSampleWeight = _check_sample_weight
     _LGBMAssertAllFinite = assert_all_finite
     _LGBMCheckClassificationTargets = check_classification_targets
     _LGBMComputeSampleWeight = compute_sample_weight
+    _LGBMValidateData = validate_data
 except ImportError:
     SKLEARN_INSTALLED = False
 
@@ -67,12 +99,12 @@ class _LGBMRegressorBase:  # type: ignore
     LGBMNotFittedError = ValueError
     _LGBMStratifiedKFold = None
     _LGBMGroupKFold = None
-    _LGBMCheckXY = None
     _LGBMCheckArray = None
     _LGBMCheckSampleWeight = None
     _LGBMAssertAllFinite = None
     _LGBMCheckClassificationTargets = None
     _LGBMComputeSampleWeight = None
+    _LGBMValidateData = None
 
 # additional scikit-learn imports only for type hints
 if TYPE_CHECKING:
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 4cf305d777c1..2b773d46da7f 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -31,16 +31,15 @@
     SKLEARN_INSTALLED,
     LGBMNotFittedError,
     _LGBMAssertAllFinite,
-    _LGBMCheckArray,
     _LGBMCheckClassificationTargets,
     _LGBMCheckSampleWeight,
-    _LGBMCheckXY,
     _LGBMClassifierBase,
     _LGBMComputeSampleWeight,
     _LGBMCpuCount,
     _LGBMLabelEncoder,
     _LGBMModelBase,
     _LGBMRegressorBase,
+    _LGBMValidateData,
     dt_DataTable,
     pd_DataFrame,
 )
@@ -912,7 +911,21 @@ def fit(
         params["metric"] = [metric for metric in params["metric"] if metric is not None]
 
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
-            _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
+            _X, _y = _LGBMValidateData(
+                self,
+                X,
+                y,
+                # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'.
+                # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an
+                # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those.
+                reset=False,
+                # allow any input type (this validation is done further down, in lgb.Dataset())
+                accept_sparse=True,
+                # do not raise an error if Inf of NaN values are found (LightGBM handles these internally)
+                ensure_all_finite=False,
+                # raise an error on 0-row and 1-row inputs
+                ensure_min_samples=2,
+            )
             if sample_weight is not None:
                 sample_weight = _LGBMCheckSampleWeight(sample_weight, _X)
         else:
@@ -1054,7 +1067,20 @@ def predict(
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
-            X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
+            X = _LGBMValidateData(
+                self,
+                X,
+                # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y()
+                #
+                # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'.
+                # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an
+                # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those.
+                reset=False,
+                # allow any input type (this validation is done further down, in lgb.Dataset())
+                accept_sparse=True,
+                # do not raise an error if Inf of NaN values are found (LightGBM handles these internally)
+                ensure_all_finite=False,
+            )
         n_features = X.shape[1]
         if self._n_features != n_features:
             raise ValueError(

From c42c53df4d5a4a88b7cd68c7eff239b0587556a4 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Thu, 3 Oct 2024 23:39:12 -0500
Subject: [PATCH 19/33] fix validate_data() for older versions, update tests

---
 python-package/lightgbm/compat.py         | 20 +++++++++++++++-----
 python-package/lightgbm/sklearn.py        | 19 ++-----------------
 tests/python_package_test/test_sklearn.py | 23 ++++++++++++++++++++++-
 3 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index c8ba580b2d4d..eecbba08541c 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -38,26 +38,36 @@ def validate_data(
             _estimator,
             X,
             y="no_validation",
-            reset: bool = False,
             accept_sparse: bool = True,
             # 'force_all_finite' was renamed to 'ensure_all_finite' in scikit-learn 1.6
             ensure_all_finite: bool = False,
             ensure_min_samples: int = 1,
+            # trap other keyword arguments that only work on scikit-learn >=1.6, like 'reset'
+            **ignored_kwargs,
         ):
             # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here
             if isinstance(y, str) and y == "no_validation":
-                X = check_array(X, reset=reset, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite)
+                X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite)
             else:
-                check_X_y(
+                X, y = check_X_y(
                     X,
                     y,
-                    reset=reset,
                     accept_sparse=accept_sparse,
                     force_all_finite=ensure_all_finite,
                     ensure_min_samples=ensure_min_samples,
                 )
+
                 # this only needs to be updated at fit() time
-                _estimator.n_features_in_ = _estimator._n_features
+                _estimator._n_features = X.shape[1]
+                _estimator._n_features_in = X.shape[1]
+
+            # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6
+            n_features = X.shape[1]
+            if _estimator._n_features != n_features:
+                raise ValueError(
+                    f"X has {n_features} features, but {_estimator.__class__.__name__} "
+                    f"is expecting {_estimator._n_features} features as input."
+                )
 
             return X, y
 
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 2b773d46da7f..aba4b7770563 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -677,10 +677,6 @@ def _more_tags(self) -> Dict[str, Any]:
                 "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
                 "cannot be set in __init__: "
                 "(see https://github.com/microsoft/LightGBM/issues/2628)",
-                "check_n_features_in_after_fitting": (
-                    "validate_data() was first added in scikit-learn 1.6 and lightgbm"
-                    "supports much older versions than that"
-                ),
             },
         }
 
@@ -706,7 +702,7 @@ def _update_sklearn_tags_from_dict(
     def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
         # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally,
         # because that method isn't defined for scikit-learn<1.6
-        if not hasattr(self, "__sklearn_tags__"):
+        if not hasattr(_LGBMModelBase, "__sklearn_tags__"):
             from sklearn import __version__ as sklearn_version
 
             err_msg = (
@@ -940,10 +936,6 @@ def fit(
             else:
                 sample_weight = np.multiply(sample_weight, class_sample_weight)
 
-        self._n_features = _X.shape[1]
-        # copy for consistency
-        self._n_features_in = self._n_features
-
         train_set = Dataset(
             data=_X,
             label=_y,
@@ -1067,7 +1059,7 @@ def predict(
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
-            X = _LGBMValidateData(
+            X, _ = _LGBMValidateData(
                 self,
                 X,
                 # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y()
@@ -1081,13 +1073,6 @@ def predict(
                 # do not raise an error if Inf of NaN values are found (LightGBM handles these internally)
                 ensure_all_finite=False,
             )
-        n_features = X.shape[1]
-        if self._n_features != n_features:
-            raise ValueError(
-                "Number of features of the model must "
-                f"match the input. Model n_features_ is {self._n_features} and "
-                f"input n_features is {n_features}"
-            )
         # retrieve original params that possibly can be used in both training and prediction
         # and then overwrite them (considering aliases) with params that were passed directly in prediction
         predict_params = self._process_params(stage="predict")
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 1d127fa6abae..1f7e7c7a04e8 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1449,7 +1449,7 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato
         sklearn_tags = est.__sklearn_tags__()
     except AttributeError as err:
         # only the exact error we expected to be raised should be raised
-        assert bool(re.search(r"__sklearn_tags__.* should not be called", err))
+        assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err)))
     else:
         # if no AttributeError was thrown, we must be using scikit-learn>=1.6,
         # and so the actual effects of __sklearn_tags__() should be tested
@@ -1584,6 +1584,27 @@ def test_validate_features(task):
     model.predict(df2, validate_features=False)
 
 
+# LightGBM's 'predict_disable_shape_check' mechanism is intentionally not respected by
+# its scikit-learn estimators, for consistency with scikit-learn's own behavior.
+@pytest.mark.parametrize("predict_disable_shape_check", [True, False])
+def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check):
+    X, y, _ = _create_data(task="regression", n_features=4)
+    # train on the first 3 features
+    model = lgb.LGBMRegressor(n_estimators=5, num_leaves=7, verbose=-1).fit(X[:, :-1], y)
+
+    # more cols in X than features: error
+    with pytest.raises(ValueError, match="X has 4 features, but LGBMRegressor is expecting 3 features as input"):
+        model.predict(X, predict_disable_shape_check=predict_disable_shape_check)
+
+    # fewer cols in X than features: error
+    with pytest.raises(ValueError, match="X has 2 features, but LGBMRegressor is expecting 3 features as input"):
+        model.predict(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check)
+
+    # same number of columns in both: no error
+    preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
+    assert preds.shape == y.shape
+
+
 @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
 @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
 @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])

From 33fb5b680655646ce3acbab90d8f9def407e5e20 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Fri, 4 Oct 2024 00:12:49 -0500
Subject: [PATCH 20/33] more changes

---
 python-package/lightgbm/compat.py  | 25 +++++++++++++------------
 python-package/lightgbm/sklearn.py | 17 ++++++++++++++++-
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index eecbba08541c..a04256df9e37 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -48,22 +48,23 @@ def validate_data(
             # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here
             if isinstance(y, str) and y == "no_validation":
                 X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite)
-            else:
-                X, y = check_X_y(
-                    X,
-                    y,
-                    accept_sparse=accept_sparse,
-                    force_all_finite=ensure_all_finite,
-                    ensure_min_samples=ensure_min_samples,
-                )
+                return X
+
+            # if we reach here, we're validating features and labels
+            X, y = check_X_y(
+                X,
+                y,
+                accept_sparse=accept_sparse,
+                force_all_finite=ensure_all_finite,
+                ensure_min_samples=ensure_min_samples,
+            )
 
-                # this only needs to be updated at fit() time
-                _estimator._n_features = X.shape[1]
-                _estimator._n_features_in = X.shape[1]
+            # this only needs to be updated at fit() time
+            _estimator._n_features_in = X.shape[1]
 
             # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6
             n_features = X.shape[1]
-            if _estimator._n_features != n_features:
+            if _estimator.__sklearn_is_fitted__() and _estimator._n_features != n_features:
                 raise ValueError(
                     f"X has {n_features} features, but {_estimator.__class__.__name__} "
                     f"is expecting {_estimator._n_features} features as input."
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index aba4b7770563..5e923ce19769 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -906,6 +906,14 @@ def fit(
         params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
         params["metric"] = [metric for metric in params["metric"] if metric is not None]
 
+        # this needs to be set before calling _LGBMValidateData(), as that function uses
+        # self.n_features_in_ (the corresponding public attribute) to check that the input has
+        # the expected number of features
+        if isinstance(X, list):
+            self._n_features_in = len(X[0])
+        else:
+            self._n_features_in = X.shape[1]
+
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
             _X, _y = _LGBMValidateData(
                 self,
@@ -1018,6 +1026,13 @@ def fit(
             callbacks=callbacks,
         )
 
+        # This populates the property self.n_features_, the number of features in the fitted model,
+        # and so should only be set after fitting.
+        #
+        # The related property self._n_features_in, which populates self.n_features_in_,
+        # is set BEFORE fitting.
+        self._n_features = self._Booster.num_feature()
+
         self._evals_result = evals_result
         self._best_iteration = self._Booster.best_iteration
         self._best_score = self._Booster.best_score
@@ -1059,7 +1074,7 @@ def predict(
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
-            X, _ = _LGBMValidateData(
+            X = _LGBMValidateData(
                 self,
                 X,
                 # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y()

From 6689faaa6cd906cd4fd2425fca1e711aacbdd0fe Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 5 Oct 2024 00:04:05 -0500
Subject: [PATCH 21/33] fix n_features_in setting

---
 python-package/lightgbm/compat.py  |  2 +-
 python-package/lightgbm/sklearn.py | 37 ++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index a04256df9e37..6b56a4f9b0e3 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -48,7 +48,7 @@ def validate_data(
             # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here
             if isinstance(y, str) and y == "no_validation":
                 X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite)
-                return X
+                return X  # noqa: RET504
 
             # if we reach here, we're validating features and labels
             X, y = check_X_y(
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 5e923ce19769..8f03d53506e7 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -147,6 +147,32 @@ def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarra
     return weight
 
 
+def _num_features_for_raw_input(X: _LGBM_ScikitMatrixLike) -> int:
+    """Get number of features from raw input data.
+
+    Some ``scikit-learn`` versions accomplish this with a private function
+    ``sklearn.utils.validation._num_features()``. This is here to avoid depending on that.
+    """
+    # if a list, assume a list of lists where each item is one row of training data,
+    # and all rows have the same number of features
+    if isinstance(X, list):
+        return len(X[0])
+
+    # scikit-learn estimator checks error out if .shape is accessed unconditionally...
+    # but allow hard-coding an assumption that anything that isn't a list and doesn't have .shape
+    # must be convertible to something following the Array API via a __array__() method
+    if hasattr(X, "shape"):
+        X_shape = X.shape
+    else:
+        X_shape = X.__array__().shape
+
+    # this condition accounts for training on a 1-dimensional input
+    if len(X_shape) == 1:
+        return 1
+    else:
+        return X_shape[1]
+
+
 class _ObjectiveFunctionWrapper:
     """Proxy class for objective function."""
 
@@ -906,13 +932,10 @@ def fit(
         params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
         params["metric"] = [metric for metric in params["metric"] if metric is not None]
 
-        # this needs to be set before calling _LGBMValidateData(), as that function uses
-        # self.n_features_in_ (the corresponding public attribute) to check that the input has
-        # the expected number of features
-        if isinstance(X, list):
-            self._n_features_in = len(X[0])
-        else:
-            self._n_features_in = X.shape[1]
+        # `sklearn.utils.validation.validate_data()` expects self.n_features_in_ to already be set by the
+        # time it's called (if you call it with reset=True like LightGBM does), and the scikit-learn
+        # estimator checks complain if X.shape is accessed unconditionally
+        self._n_features_in = _num_features_for_raw_input(X)
 
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
             _X, _y = _LGBMValidateData(

From 9a056701844e60dbdd2c37d32bf078c3434df080 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 5 Oct 2024 00:13:29 -0500
Subject: [PATCH 22/33] fix return type

---
 python-package/lightgbm/compat.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 6b56a4f9b0e3..b8db1e010464 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -45,22 +45,22 @@ def validate_data(
             # trap other keyword arguments that only work on scikit-learn >=1.6, like 'reset'
             **ignored_kwargs,
         ):
+            no_val_y = isinstance(y, str) and y == "no_validation"
+
             # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here
-            if isinstance(y, str) and y == "no_validation":
+            if no_val_y:
                 X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite)
-                return X  # noqa: RET504
-
-            # if we reach here, we're validating features and labels
-            X, y = check_X_y(
-                X,
-                y,
-                accept_sparse=accept_sparse,
-                force_all_finite=ensure_all_finite,
-                ensure_min_samples=ensure_min_samples,
-            )
+            else:
+                X, y = check_X_y(
+                    X,
+                    y,
+                    accept_sparse=accept_sparse,
+                    force_all_finite=ensure_all_finite,
+                    ensure_min_samples=ensure_min_samples,
+                )
 
-            # this only needs to be updated at fit() time
-            _estimator._n_features_in = X.shape[1]
+                # this only needs to be updated at fit() time
+                _estimator._n_features_in = X.shape[1]
 
             # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6
             n_features = X.shape[1]
@@ -70,7 +70,10 @@ def validate_data(
                     f"is expecting {_estimator._n_features} features as input."
                 )
 
-            return X, y
+            if no_val_y:
+                return X
+            else:
+                return X, y
 
     SKLEARN_INSTALLED = True
     _LGBMBaseCrossValidator = BaseCrossValidator

From 815433fa6d8ffe1d026bbaa4a5d04b2d5026b679 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 5 Oct 2024 00:16:22 -0500
Subject: [PATCH 23/33] remove now-unnecessary _LGBMCheckXY()

---
 python-package/lightgbm/compat.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index b8db1e010464..6e443ffcc48c 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -84,7 +84,6 @@ def validate_data(
     LGBMNotFittedError = NotFittedError
     _LGBMStratifiedKFold = StratifiedKFold
     _LGBMGroupKFold = GroupKFold
-    _LGBMCheckArray = check_array
     _LGBMCheckSampleWeight = _check_sample_weight
     _LGBMAssertAllFinite = assert_all_finite
     _LGBMCheckClassificationTargets = check_classification_targets
@@ -113,7 +112,6 @@ class _LGBMRegressorBase:  # type: ignore
     LGBMNotFittedError = ValueError
     _LGBMStratifiedKFold = None
     _LGBMGroupKFold = None
-    _LGBMCheckArray = None
     _LGBMCheckSampleWeight = None
     _LGBMAssertAllFinite = None
     _LGBMCheckClassificationTargets = None

From ffebe417532f00ed622c4b81b1816d14dc0cfe2b Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 5 Oct 2024 00:23:31 -0500
Subject: [PATCH 24/33] correct comment

---
 python-package/lightgbm/sklearn.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 8f03d53506e7..853b54de6b6e 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -932,9 +932,12 @@ def fit(
         params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
         params["metric"] = [metric for metric in params["metric"] if metric is not None]
 
-        # `sklearn.utils.validation.validate_data()` expects self.n_features_in_ to already be set by the
-        # time it's called (if you call it with reset=True like LightGBM does), and the scikit-learn
-        # estimator checks complain if X.shape is accessed unconditionally
+        # Tf self.n_features_in_ is set to any value other than ``None``, then it needs to be populated
+        # before ``sklearn.utils.validation.validate_data()`` is called, to avoid an error about mismatched
+        # feature numbers being raised on the first call to ``fit()``.
+        #
+        # lightgbm initializes that property to -1, so it can be an integer throughout its entire life
+        # (to help with type-checking), so it needs to be updated here.
         self._n_features_in = _num_features_for_raw_input(X)
 
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):

From f2cb2fe74151cad847f9c3177bbbe430ebc46b80 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 5 Oct 2024 22:26:50 -0500
Subject: [PATCH 25/33] Apply suggestions from code review

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
---
 python-package/lightgbm/compat.py         | 2 +-
 python-package/lightgbm/sklearn.py        | 2 +-
 tests/python_package_test/test_sklearn.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 6e443ffcc48c..ebb3f47ebcf2 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -33,7 +33,7 @@ def _check_sample_weight(sample_weight: Any, X: Any, dtype: Any = None) -> Any:
         from sklearn.utils.validation import validate_data
     except ImportError:
         # validate_data() was added in scikit-learn 1.6, this function roughly imitates it for older versions.
-        # It can be removed when lightbm's minimum scikit-learn version is at least 1.6.
+        # It can be removed when lightgbm's minimum scikit-learn version is at least 1.6.
         def validate_data(
             _estimator,
             X,
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 853b54de6b6e..998c26c2f3bc 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -932,7 +932,7 @@ def fit(
         params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
         params["metric"] = [metric for metric in params["metric"] if metric is not None]
 
-        # Tf self.n_features_in_ is set to any value other than ``None``, then it needs to be populated
+        # If self.n_features_in_ is set to any value other than ``None``, then it needs to be populated
         # before ``sklearn.utils.validation.validate_data()`` is called, to avoid an error about mismatched
         # feature numbers being raised on the first call to ``fit()``.
         #
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 1f7e7c7a04e8..fc996ca01cee 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1441,7 +1441,7 @@ def test_sklearn_integration(estimator, check):
 def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimator_class):
     est = estimator_class()
     more_tags = est._more_tags()
-    err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn__tags() to match."
+    err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn_tags__() to match."
     assert more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], err_msg
     # the try-except part of this should be removed once lightgbm's
     # minimum supported scikit-learn version is at least 1.6

From 86b5ab3b7bc58ebf9d8e4510f55c5d78768b0089 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sat, 5 Oct 2024 23:46:22 -0500
Subject: [PATCH 26/33] move __version__ import to compat.py, test with all ML
 tasks

---
 python-package/lightgbm/compat.py         |  2 ++
 python-package/lightgbm/sklearn.py        |  5 ++-
 tests/python_package_test/test_sklearn.py | 40 +++++++++++++++++++----
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 6e443ffcc48c..617bd31b9e36 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -7,6 +7,7 @@
 # see https://github.com/microsoft/LightGBM/issues/6509
 """sklearn"""
 try:
+    from sklearn import __version__ as _sklearn_version
     from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
     from sklearn.preprocessing import LabelEncoder
     from sklearn.utils.class_weight import compute_sample_weight
@@ -117,6 +118,7 @@ class _LGBMRegressorBase:  # type: ignore
     _LGBMCheckClassificationTargets = None
     _LGBMComputeSampleWeight = None
     _LGBMValidateData = None
+    _sklearn_version = None
 
 # additional scikit-learn imports only for type hints
 if TYPE_CHECKING:
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 853b54de6b6e..2c2193303569 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -40,6 +40,7 @@
     _LGBMModelBase,
     _LGBMRegressorBase,
     _LGBMValidateData,
+    _sklearn_version,
     dt_DataTable,
     pd_DataFrame,
 )
@@ -729,11 +730,9 @@ def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
         # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally,
         # because that method isn't defined for scikit-learn<1.6
         if not hasattr(_LGBMModelBase, "__sklearn_tags__"):
-            from sklearn import __version__ as sklearn_version
-
             err_msg = (
                 "__sklearn_tags__() should not be called when using scikit-learn<1.6. "
-                f"detected version: {sklearn_version}"
+                f"detected version: {_sklearn_version}"
             )
             raise AttributeError(err_msg)
 
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 1f7e7c7a04e8..bb14e46ebf7d 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -43,6 +43,7 @@
     "multiclass-classification": lgb.LGBMClassifier,
     "regression": lgb.LGBMRegressor,
 }
+all_tasks = tuple(task_to_model_factory.keys())
 
 
 def _create_data(task, n_samples=100, n_features=4):
@@ -1459,7 +1460,7 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato
         assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"]
 
 
-@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
+@pytest.mark.parametrize("task", all_tasks)
 def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
     pd = pytest.importorskip("pandas")
     X, y, g = _create_data(task)
@@ -1563,7 +1564,7 @@ def test_default_n_jobs(tmp_path):
 
 
 @pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed")
-@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
+@pytest.mark.parametrize("task", all_tasks)
 def test_validate_features(task):
     X, y, g = _create_data(task, n_features=4)
     features = ["x1", "x2", "x3", "x4"]
@@ -1586,24 +1587,49 @@ def test_validate_features(task):
 
 # LightGBM's 'predict_disable_shape_check' mechanism is intentionally not respected by
 # its scikit-learn estimators, for consistency with scikit-learn's own behavior.
+@pytest.mark.parametrize("task", all_tasks)
 @pytest.mark.parametrize("predict_disable_shape_check", [True, False])
-def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check):
-    X, y, _ = _create_data(task="regression", n_features=4)
+def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check, task):
+    X, y, g = _create_data(task, n_features=4)
+    model_factory = task_to_model_factory[task]
+    fit_kwargs = {"X": X[:, :-1], "y": y}
+    if task == "ranking":
+        estimator_name = "LGBMRanker"
+        fit_kwargs.update({"group": g})
+    elif task == "regression":
+        estimator_name = "LGBMRegressor"
+    else:
+        estimator_name = "LGBMClassifier"
+
     # train on the first 3 features
-    model = lgb.LGBMRegressor(n_estimators=5, num_leaves=7, verbose=-1).fit(X[:, :-1], y)
+    model = model_factory(n_estimators=5, num_leaves=7, verbose=-1).fit(**fit_kwargs)
 
     # more cols in X than features: error
-    with pytest.raises(ValueError, match="X has 4 features, but LGBMRegressor is expecting 3 features as input"):
+    err_msg = f"X has 4 features, but {estimator_name} is expecting 3 features as input"
+    with pytest.raises(ValueError, match=err_msg):
         model.predict(X, predict_disable_shape_check=predict_disable_shape_check)
 
+    if estimator_name == "LGBMClassifier":
+        with pytest.raises(ValueError, match=err_msg):
+            model.predict_proba(X, predict_disable_shape_check=predict_disable_shape_check)
+
     # fewer cols in X than features: error
-    with pytest.raises(ValueError, match="X has 2 features, but LGBMRegressor is expecting 3 features as input"):
+    err_msg = f"X has 2 features, but {estimator_name} is expecting 3 features as input"
+    with pytest.raises(ValueError, match=err_msg):
         model.predict(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check)
 
+    if estimator_name == "LGBMClassifier":
+        with pytest.raises(ValueError, match=err_msg):
+            model.predict_proba(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check)
+
     # same number of columns in both: no error
     preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
     assert preds.shape == y.shape
 
+    if estimator_name == "LGBMClassifier":
+        preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
+        assert preds.shape == y.shape
+
 
 @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
 @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])

From 125f4eace8a6f0cc4e8b5c3de8aa054c89b3c934 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sun, 6 Oct 2024 00:28:40 -0500
Subject: [PATCH 27/33] just set the setters and deleters

---
 python-package/lightgbm/sklearn.py | 41 ++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 2c2193303569..c691c921d4e7 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -937,7 +937,7 @@ def fit(
         #
         # lightgbm initializes that property to -1, so it can be an integer throughout its entire life
         # (to help with type-checking), so it needs to be updated here.
-        self._n_features_in = _num_features_for_raw_input(X)
+        # self._n_features_in = _num_features_for_raw_input(X)
 
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
             _X, _y = _LGBMValidateData(
@@ -947,7 +947,7 @@ def fit(
                 # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'.
                 # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an
                 # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those.
-                reset=False,
+                reset=True,
                 # allow any input type (this validation is done further down, in lgb.Dataset())
                 accept_sparse=True,
                 # do not raise an error if Inf of NaN values are found (LightGBM handles these internally)
@@ -1168,6 +1168,21 @@ def n_features_in_(self) -> int:
             raise LGBMNotFittedError("No n_features_in found. Need to call fit beforehand.")
         return self._n_features_in
 
+    @n_features_in_.setter
+    def n_features_in_(self, value: int) -> None:
+        """Set number of features found in passed-in dataset.
+
+        Starting with ``scikit-learn`` 1.6, ``scikit-learn` expects to be able to directly
+        set this property in functions like ``validate_data()``.
+
+        .. note::
+
+            Do not call ``estimator.n_features_in_ = some_int`` or anything else that invokes
+            this method. It is only here for compatibility with ``scikit-learn`` validation
+            functions used internally in ``lightgbm``.
+        """
+        self._n_features_in = value
+
     @property
     def best_score_(self) -> _LGBM_BoosterBestScoreType:
         """:obj:`dict`: The best score of fitted model."""
@@ -1266,6 +1281,28 @@ def feature_names_in_(self) -> np.ndarray:
             raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
         return np.array(self.feature_name_)
 
+    @feature_names_in_.deleter
+    def feature_names_in_(self) -> None:
+        """Intercept calls to delete ``feature_names_in_``.
+
+        Some code paths in ``scikit-learn`` try to delete the ``feature_names_in_`` attribute
+        on estimators when a new training dataset that doesn't have features is passed.
+        LightGBM automatically assigns feature names to such datasets
+        (like ``Column_0``, ``Column_1``, etc.) and so does not want that behavior.
+
+        However, that behavior is coupled to ``scikit-learn`` automatically updating
+        ``n_features_in_`` in those same code paths, which is necessary for compliance
+        with its API (via argument ``reset`` to functions like ``validate_data()`` and
+        ``check_array()``).
+
+        .. note::
+
+            Do not call ``del estimator.feature_names_in_`` or anything else that invokes
+            this method. It is only here for compatibility with ``scikit-learn`` validation
+            functions used internally in ``lightgbm``.
+        """
+        pass
+
 
 class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
     """LightGBM regressor."""

From 4233d702a6f4431805859a8059e4ee2ba563ca34 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sun, 6 Oct 2024 00:55:56 -0500
Subject: [PATCH 28/33] set floor of scikit-learn>=0.24.2, fix ordering of
 n_features_in_ setting for older versions

---
 .ci/test-python-oldest.sh          |  2 +-
 python-package/lightgbm/compat.py  | 25 ++++++++++++++++----
 python-package/lightgbm/sklearn.py | 37 +++---------------------------
 python-package/pyproject.toml      |  2 +-
 4 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/.ci/test-python-oldest.sh b/.ci/test-python-oldest.sh
index c6de079351e3..06db15b0eb7a 100644
--- a/.ci/test-python-oldest.sh
+++ b/.ci/test-python-oldest.sh
@@ -15,7 +15,7 @@ pip install \
   'numpy==1.19.0' \
   'pandas==1.1.3' \
   'pyarrow==6.0.1' \
-  'scikit-learn==0.24.0' \
+  'scikit-learn==0.24.2' \
   'scipy==1.6.0' \
 || exit 1
 echo "done installing lightgbm's dependencies"
diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 617bd31b9e36..a09e97553fa0 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -46,6 +46,24 @@ def validate_data(
             # trap other keyword arguments that only work on scikit-learn >=1.6, like 'reset'
             **ignored_kwargs,
         ):
+            # it's safe to import _num_features unconditionally because:
+            #
+            #  * it was first added in scikit-learn 0.24.2
+            #  * lightgbm cannot be used with scikit-learn versions older than that
+            #  * this validate_data() re-implementation will not be called in scikit-learn>=1.6
+            #
+            from sklearn.utils.validation import _num_features
+
+            # _num_features() raises a TypeError on 1-dimensional input. That's a problem
+            # because scikit-learn's 'check_fit1d' estimator check sets that expectation that
+            # estimators must raise a ValueError when a 1-dimensional input is passed to fit().
+            #
+            # So here, lightgbm avoids calling _num_features() on 1-dimensional inputs.
+            if hasattr(X, "shape") and len(X.shape) == 1:
+                n_features_in_ = 1
+            else:
+                n_features_in_ = _num_features(X)
+
             no_val_y = isinstance(y, str) and y == "no_validation"
 
             # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here
@@ -61,13 +79,12 @@ def validate_data(
                 )
 
                 # this only needs to be updated at fit() time
-                _estimator._n_features_in = X.shape[1]
+                _estimator.n_features_in_ = n_features_in_
 
             # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6
-            n_features = X.shape[1]
-            if _estimator.__sklearn_is_fitted__() and _estimator._n_features != n_features:
+            if _estimator.__sklearn_is_fitted__() and _estimator._n_features != n_features_in_:
                 raise ValueError(
-                    f"X has {n_features} features, but {_estimator.__class__.__name__} "
+                    f"X has {n_features_in_} features, but {_estimator.__class__.__name__} "
                     f"is expecting {_estimator._n_features} features as input."
                 )
 
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index c691c921d4e7..e2b3e423a8aa 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -148,32 +148,6 @@ def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarra
     return weight
 
 
-def _num_features_for_raw_input(X: _LGBM_ScikitMatrixLike) -> int:
-    """Get number of features from raw input data.
-
-    Some ``scikit-learn`` versions accomplish this with a private function
-    ``sklearn.utils.validation._num_features()``. This is here to avoid depending on that.
-    """
-    # if a list, assume a list of lists where each item is one row of training data,
-    # and all rows have the same number of features
-    if isinstance(X, list):
-        return len(X[0])
-
-    # scikit-learn estimator checks error out if .shape is accessed unconditionally...
-    # but allow hard-coding an assumption that anything that isn't a list and doesn't have .shape
-    # must be convertible to something following the Array API via a __array__() method
-    if hasattr(X, "shape"):
-        X_shape = X.shape
-    else:
-        X_shape = X.__array__().shape
-
-    # this condition accounts for training on a 1-dimensional input
-    if len(X_shape) == 1:
-        return 1
-    else:
-        return X_shape[1]
-
-
 class _ObjectiveFunctionWrapper:
     """Proxy class for objective function."""
 
@@ -931,14 +905,6 @@ def fit(
         params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
         params["metric"] = [metric for metric in params["metric"] if metric is not None]
 
-        # Tf self.n_features_in_ is set to any value other than ``None``, then it needs to be populated
-        # before ``sklearn.utils.validation.validate_data()`` is called, to avoid an error about mismatched
-        # feature numbers being raised on the first call to ``fit()``.
-        #
-        # lightgbm initializes that property to -1, so it can be an integer throughout its entire life
-        # (to help with type-checking), so it needs to be updated here.
-        # self._n_features_in = _num_features_for_raw_input(X)
-
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
             _X, _y = _LGBMValidateData(
                 self,
@@ -960,6 +926,9 @@ def fit(
         else:
             _X, _y = X, y
 
+            # for other data types, setting n_features_in_ is handled by _LGBMValidateData() in the branch above
+            self.n_features_in_ = _X.shape[1]
+
         if self._class_weight is None:
             self._class_weight = self.class_weight
         if self._class_weight is not None:
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
index 0d3d70feac66..4f6a17cb3666 100644
--- a/python-package/pyproject.toml
+++ b/python-package/pyproject.toml
@@ -45,7 +45,7 @@ pandas = [
     "pandas>=0.24.0"
 ]
 scikit-learn = [
-    "scikit-learn!=0.22.0"
+    "scikit-learn>=0.24.2"
 ]
 
 [project.urls]

From e8e4cdb97a2f448f65dde2e6c1f5851fe015126c Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sun, 6 Oct 2024 01:48:51 -0500
Subject: [PATCH 29/33] Update python-package/lightgbm/sklearn.py

---
 python-package/lightgbm/sklearn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index e2b3e423a8aa..a3829170ae38 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1141,7 +1141,7 @@ def n_features_in_(self) -> int:
     def n_features_in_(self, value: int) -> None:
         """Set number of features found in passed-in dataset.
 
-        Starting with ``scikit-learn`` 1.6, ``scikit-learn` expects to be able to directly
+        Starting with ``scikit-learn`` 1.6, ``scikit-learn`` expects to be able to directly
         set this property in functions like ``validate_data()``.
 
         .. note::

From f22e4948943cc440bb848a44ecfa688b8d5c940f Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Sun, 6 Oct 2024 22:09:10 -0500
Subject: [PATCH 30/33] forgot to commit ... fix comment

---
 python-package/lightgbm/sklearn.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index e2b3e423a8aa..0cc2c24c8466 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1074,8 +1074,7 @@ def predict(
                 # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y()
                 #
                 # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'.
-                # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an
-                # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those.
+                # These shouldn't be changed at predict() time.
                 reset=False,
                 # allow any input type (this validation is done further down, in lgb.Dataset())
                 accept_sparse=True,

From c6e6fad4520a8caf1bf133d5521475f88c635ca7 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Tue, 8 Oct 2024 14:30:20 -0500
Subject: [PATCH 31/33] Apply suggestions from code review

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
---
 python-package/lightgbm/sklearn.py        | 5 +----
 tests/python_package_test/test_sklearn.py | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 30744144c3f8..45d3102dbbf2 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -706,7 +706,7 @@ def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
         if not hasattr(_LGBMModelBase, "__sklearn_tags__"):
             err_msg = (
                 "__sklearn_tags__() should not be called when using scikit-learn<1.6. "
-                f"detected version: {_sklearn_version}"
+                f"Detected version: {_sklearn_version}"
             )
             raise AttributeError(err_msg)
 
@@ -910,9 +910,6 @@ def fit(
                 self,
                 X,
                 y,
-                # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'.
-                # We prefer to expose these via @property, to be able to raise a NotFittedError if they're accessed on an
-                # unfitted model... and so don't want to take on the complexity of defining setters and deleters for those.
                 reset=True,
                 # allow any input type (this validation is done further down, in lgb.Dataset())
                 accept_sparse=True,
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index b68daa86c5b1..6bf4a40c13a3 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1627,7 +1627,7 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl
     assert preds.shape == y.shape
 
     if estimator_name == "LGBMClassifier":
-        preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
+        preds = model.predict_proba(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
         assert preds.shape == y.shape
 
 
From d8762e5b27007bce6be6b7b1050f2bbaa8da0599 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Tue, 8 Oct 2024 20:44:27 -0500
Subject: [PATCH 32/33] predict_proba() shape is (num_data, num_classes) for
 multi-class

---
 tests/python_package_test/test_sklearn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 6bf4a40c13a3..6eca66ff20d3 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1628,7 +1628,7 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl
 
     if estimator_name == "LGBMClassifier":
         preds = model.predict_proba(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
-        assert preds.shape == y.shape
+        assert preds.shape[0] == y.shape[0]
 
 
 @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])

From 8ef1debf71c0a1bdae36eed1f24714226c3e7706 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Wed, 9 Oct 2024 11:15:53 -0500
Subject: [PATCH 33/33] pass ensure_min_samples=1 at predict() time too

---
 python-package/lightgbm/compat.py  | 7 ++++++-
 python-package/lightgbm/sklearn.py | 2 ++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 77f08b883f36..96dee6522572 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -68,7 +68,12 @@ def validate_data(
 
             # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here
             if no_val_y:
-                X = check_array(X, accept_sparse=accept_sparse, force_all_finite=ensure_all_finite)
+                X = check_array(
+                    X,
+                    accept_sparse=accept_sparse,
+                    force_all_finite=ensure_all_finite,
+                    ensure_min_samples=ensure_min_samples,
+                )
             else:
                 X, y = check_X_y(
                     X,
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index 45d3102dbbf2..e8e46eb42b86 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1077,6 +1077,8 @@ def predict(
                 accept_sparse=True,
                 # do not raise an error if Inf of NaN values are found (LightGBM handles these internally)
                 ensure_all_finite=False,
+                # raise an error on 0-row inputs
+                ensure_min_samples=1,
             )
         # retrieve original params that possibly can be used in both training and prediction
         # and then overwrite them (considering aliases) with params that were passed directly in prediction