From 1bfb98aca0a34dc43538cd362e2b4a76dd7f87f6 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Fri, 26 May 2023 10:18:33 +0800
Subject: [PATCH 01/13] [pyspark] add parameters in the ctor of all estimators.

---
 python-package/xgboost/spark/core.py      |  20 +-
 python-package/xgboost/spark/estimator.py | 252 ++++++++++++++++------
 2 files changed, 198 insertions(+), 74 deletions(-)

diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index d2eff943c23b..5ea003e79088 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -337,11 +337,9 @@ def _validate_params(self) -> None:
 
         if self.getOrDefault(self.features_cols):
             if not self.getOrDefault(self.use_gpu):
-                raise ValueError("features_cols param requires enabling use_gpu.")
-
-            get_logger(self.__class__.__name__).warning(
-                "If features_cols param set, then features_col param is ignored."
-            )
+                raise ValueError(
+                    "features_col param with list value requires enabling use_gpu."
+                )
 
         if self.getOrDefault("objective") is not None:
             if not isinstance(self.getOrDefault("objective"), str):
@@ -547,6 +545,8 @@ def _get_unwrapped_vec_cols(feature_col: Column) -> List[Column]:
 
 
 class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
+    _input_kwargs: Dict[str, Any]
+
     def __init__(self) -> None:
         super().__init__()
         self._set_xgb_params_default()
@@ -576,6 +576,11 @@ def setParams(
             raise ValueError("Invalid param name: 'arbitrary_params_dict'.")
 
         for k, v in kwargs.items():
+            # We're not allowing user use features_cols directly.
+            if k == self.features_cols.name:
+                raise ValueError(
+                    f"Unsupported param '{k}' please use features_col instead."
+                )
             if k in _inverse_pyspark_param_alias_map:
                 raise ValueError(
                     f"Please use param name {_inverse_pyspark_param_alias_map[k]} instead."
@@ -591,7 +596,10 @@ def setParams(
                     k = real_k
 
             if self.hasParam(k):
-                self._set(**{str(k): v})
+                if k == "features_col" and isinstance(v, list):
+                    self._set({"features_cols": v})
+                else:
+                    self._set(**{str(k): v})
             else:
                 if (
                     k in _unsupported_xgb_params
diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 67eea4ad2d28..3c4bd3f6372f 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -2,9 +2,10 @@
 # pylint: disable=too-many-ancestors
 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
 
-from typing import Any, Type
+from typing import Any, Dict, List, Optional, Type, Union
 
 import numpy as np
+from pyspark import keyword_only
 from pyspark.ml.param import Param, Params
 from pyspark.ml.param.shared import HasProbabilityCol, HasRawPredictionCol
 
@@ -97,35 +98,57 @@ class SparkXGBRegressor(_SparkXGBEstimator):
     SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
     param for each xgboost worker will be set equal to `spark.task.cpus` config value.
 
-    callbacks:
-        The export and import of the callback functions are at best effort.
-        For details, see :py:attr:`xgboost.spark.SparkXGBRegressor.callbacks` param doc.
+
+    Parameters
+    ----------
+
+    features_col:
+        When the value is string, it requires the features column name to be vector type.
+        When the value is a list of string, it requires all the feature columns to be numeric types.
+    label_col:
+        Label column name. Default to "label".
+    prediction_col:
+        Prediction column name. Default to "prediction"
+    pred_contrib_col:
+        Contribution prediction column name.
     validation_indicator_col
-        For params related to `xgboost.XGBRegressor` training
-        with evaluation dataset's supervision, set
-        :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col`
-        parameter instead of setting the `eval_set` parameter in `xgboost.XGBRegressor`
+        For params related to `xgboost.XGBClassifier` training with
+        evaluation dataset's supervision,
+        set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
+        parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier`
         fit method.
     weight_col:
         To specify the weight of the training and validation dataset, set
-        :py:attr:`xgboost.spark.SparkXGBRegressor.weight_col` parameter instead of setting
-        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRegressor`
-        fit method.
-    xgb_model:
-        Set the value to be the instance returned by
-        :func:`xgboost.spark.SparkXGBRegressorModel.get_booster`.
-    num_workers:
-        Integer that specifies the number of XGBoost workers to use.
-        Each XGBoost worker corresponds to one spark task.
-    use_gpu:
-        Boolean that specifies whether the executors are running on GPU
-        instances.
+        :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
+        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
+        fit method.    base_margin_col
+        Base margin column name
     base_margin_col:
         To specify the base margins of the training and validation
-        dataset, set :py:attr:`xgboost.spark.SparkXGBRegressor.base_margin_col` parameter
+        dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBRegressor` fit method. Note: this isn't available for distributed
+        `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
         training.
+    qid_col"
+        Query id column name.
+
+    num_workers:
+        How many XGBoost workers to be used to train.
+        Each XGBoost worker corresponds to one spark task.
+    use_gpu:
+        Boolean value to specify whether the executors are running on GPU
+        instances.
+    force_repartition:
+        Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training.
+    repartition_random_shuffle
+        Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
+    enable_sparse_data_optim:
+        Boolean value to specify if enabling sparse data optimization, if True,
+        Xgboost DMatrix object will be constructed from sparse matrix instead of
+        dense matrix.
+
+    xgboost_parameters:
+        A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html
 
     .. Note:: The Parameters chart above contains parameters that need special handling.
         For a full list of parameters, see entries with `Param(parent=...` below.
@@ -155,9 +178,28 @@ class SparkXGBRegressor(_SparkXGBEstimator):
 
     """
 
-    def __init__(self, **kwargs: Any) -> None:
+    @keyword_only
+    def __init__(
+        self,
+        *,
+        features_col: Union[str, List[str]] = "features",
+        label_col: str = "label",
+        prediction_col: str = "prediction",
+        pred_contrib_col: Optional[str] = None,
+        validation_indicator_col: Optional[str] = None,
+        weight_col: Optional[str] = None,
+        base_margin_col: Optional[str] = None,
+        qid_col: Optional[str] = None,
+        num_workers: int = 1,
+        use_gpu: bool = False,
+        force_repartition: bool = False,
+        repartition_random_shuffle: bool = False,
+        enable_sparse_data_optim: bool = False,
+        **xgboost_parameters: Dict[str, Any],
+    ) -> None:
         super().__init__()
-        self.setParams(**kwargs)
+        input_kwargs = self._input_kwargs
+        self.setParams(**input_kwargs)
 
     @classmethod
     def _xgb_cls(cls) -> Type[XGBRegressor]:
@@ -220,14 +262,22 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
     Parameters
     ----------
 
-    callbacks:
-        The export and import of the callback functions are at best effort. For
-        details, see :py:attr:`xgboost.spark.SparkXGBClassifier.callbacks` param doc.
-    raw_prediction_col:
+    features_col:
+        When the value is string, it requires the features column name to be vector type.
+        When the value is a list of string, it requires all the feature columns to be numeric types.
+    label_col:
+        Label column name. Default to "label".
+    prediction_col:
+        Prediction column name. Default to "prediction"
+    probability_col:
+        Column name for predicted class conditional probabilities. Default to probabilityCol
+    raw_prediction_col
         The `output_margin=True` is implicitly supported by the
         `rawPredictionCol` output column, which is always returned with the predicted margin
         values.
-    validation_indicator_col:
+    pred_contrib_col:
+        Contribution prediction column name.
+    validation_indicator_col
         For params related to `xgboost.XGBClassifier` training with
         evaluation dataset's supervision,
         set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
@@ -237,22 +287,34 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         To specify the weight of the training and validation dataset, set
         :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
         `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
-        fit method.
-    xgb_model:
-        Set the value to be the instance returned by
-        :func:`xgboost.spark.SparkXGBClassifierModel.get_booster`.
-    num_workers:
-        Integer that specifies the number of XGBoost workers to use.
-        Each XGBoost worker corresponds to one spark task.
-    use_gpu:
-        Boolean that specifies whether the executors are running on GPU
-        instances.
+        fit method.    base_margin_col
+        Base margin column name
     base_margin_col:
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
         `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
         training.
+    qid_col"
+        Query id column name.
+
+    num_workers:
+        How many XGBoost workers to be used to train.
+        Each XGBoost worker corresponds to one spark task.
+    use_gpu:
+        Boolean value to specify whether the executors are running on GPU
+        instances.
+    force_repartition:
+        Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training.
+    repartition_random_shuffle
+        Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
+    enable_sparse_data_optim:
+        Boolean value to specify if enabling sparse data optimization, if True,
+        Xgboost DMatrix object will be constructed from sparse matrix instead of
+        dense matrix.
+
+    xgboost_parameters:
+        A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html
 
     .. Note:: The Parameters chart above contains parameters that need special handling.
         For a full list of parameters, see entries with `Param(parent=...` below.
@@ -281,14 +343,35 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
 
     """
 
-    def __init__(self, **kwargs: Any) -> None:
+    @keyword_only
+    def __init__(
+        self,
+        *,
+        features_col: Union[str, List[str]] = "features",
+        label_col: str = "label",
+        prediction_col: str = "prediction",
+        probability_col: str = "probability",
+        raw_prediction_col: str = "rawPrediction",
+        pred_contrib_col: Optional[str] = None,
+        validation_indicator_col: Optional[str] = None,
+        weight_col: Optional[str] = None,
+        base_margin_col: Optional[str] = None,
+        qid_col: Optional[str] = None,
+        num_workers: int = 1,
+        use_gpu: bool = False,
+        force_repartition: bool = False,
+        repartition_random_shuffle: bool = False,
+        enable_sparse_data_optim: bool = False,
+        **xgboost_parameters: Dict[str, Any],
+    ) -> None:
         super().__init__()
         # The default 'objective' param value comes from sklearn `XGBClassifier` ctor,
         # but in pyspark we will automatically set objective param depending on
         # binary or multinomial input dataset, and we need to remove the fixed default
         # param value as well to avoid causing ambiguity.
+        input_kwargs = self._input_kwargs
+        self.setParams(**input_kwargs)
         self._setDefault(objective=None)
-        self.setParams(**kwargs)
 
     @classmethod
     def _xgb_cls(cls) -> Type[XGBClassifier]:
@@ -355,39 +438,53 @@ class SparkXGBRanker(_SparkXGBEstimator):
     Parameters
     ----------
 
-    callbacks:
-        The export and import of the callback functions are at best effort. For
-        details, see :py:attr:`xgboost.spark.SparkXGBRanker.callbacks` param doc.
-    validation_indicator_col:
-        For params related to `xgboost.XGBRanker` training with
+    features_col:
+        When the value is string, it requires the features column name to be vector type.
+        When the value is a list of string, it requires all the feature columns to be numeric types.
+    label_col:
+        Label column name. Default to "label".
+    prediction_col:
+        Prediction column name. Default to "prediction"
+    pred_contrib_col:
+        Contribution prediction column name.
+    validation_indicator_col
+        For params related to `xgboost.XGBClassifier` training with
         evaluation dataset's supervision,
-        set :py:attr:`xgboost.spark.XGBRanker.validation_indicator_col`
-        parameter instead of setting the `eval_set` parameter in `xgboost.XGBRanker`
+        set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
+        parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier`
         fit method.
     weight_col:
         To specify the weight of the training and validation dataset, set
-        :py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting
-        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRanker`
-        fit method.
-    xgb_model:
-        Set the value to be the instance returned by
-        :func:`xgboost.spark.SparkXGBRankerModel.get_booster`.
+        :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
+        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
+        fit method.    base_margin_col
+        Base margin column name
+    base_margin_col:
+        To specify the base margins of the training and validation
+        dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
+        instead of setting `base_margin` and `base_margin_eval_set` in the
+        `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
+        training.
+    qid_col"
+        Query id column name.
+
     num_workers:
-        Integer that specifies the number of XGBoost workers to use.
+        How many XGBoost workers to be used to train.
         Each XGBoost worker corresponds to one spark task.
     use_gpu:
-        Boolean that specifies whether the executors are running on GPU
+        Boolean value to specify whether the executors are running on GPU
         instances.
-    base_margin_col:
-        To specify the base margins of the training and validation
-        dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.base_margin_col` parameter
-        instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBRanker` fit method.
-    qid_col:
-        To specify the qid of the training and validation
-        dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.qid_col` parameter
-        instead of setting `qid` / `group`, `eval_qid` / `eval_group` in the
-        `xgboost.XGBRanker` fit method.
+    force_repartition:
+        Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training.
+    repartition_random_shuffle
+        Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
+    enable_sparse_data_optim:
+        Boolean value to specify if enabling sparse data optimization, if True,
+        Xgboost DMatrix object will be constructed from sparse matrix instead of
+        dense matrix.
+
+    xgboost_parameters:
+        A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html
 
     .. Note:: The Parameters chart above contains parameters that need special handling.
         For a full list of parameters, see entries with `Param(parent=...` below.
@@ -426,9 +523,28 @@ class SparkXGBRanker(_SparkXGBEstimator):
     >>> model.transform(df_test).show()
     """
 
-    def __init__(self, **kwargs: Any) -> None:
+    @keyword_only
+    def __init__(
+        self,
+        *,
+        features_col: Union[str, List[str]] = "features",
+        label_col: str = "label",
+        prediction_col: str = "prediction",
+        pred_contrib_col: Optional[str] = None,
+        validation_indicator_col: Optional[str] = None,
+        weight_col: Optional[str] = None,
+        base_margin_col: Optional[str] = None,
+        qid_col: Optional[str] = None,
+        num_workers: int = 1,
+        use_gpu: bool = False,
+        force_repartition: bool = False,
+        repartition_random_shuffle: bool = False,
+        enable_sparse_data_optim: bool = False,
+        **xgboost_parameters: Dict[str, Any],
+    ) -> None:
         super().__init__()
-        self.setParams(**kwargs)
+        input_kwargs = self._input_kwargs
+        self.setParams(**input_kwargs)
 
     @classmethod
     def _xgb_cls(cls) -> Type[XGBRanker]:

From ee12a3d28a654dcd15ede83d125573a2ca4144ac Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Fri, 26 May 2023 11:12:52 +0800
Subject: [PATCH 02/13] Fix pylint warnings

---
 python-package/xgboost/spark/core.py      |  2 +-
 python-package/xgboost/spark/estimator.py | 20 ++++++++++++++------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 5ea003e79088..0181e678d201 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -597,7 +597,7 @@ def setParams(
 
             if self.hasParam(k):
                 if k == "features_col" and isinstance(v, list):
-                    self._set({"features_cols": v})
+                    self._set(**{"features_cols": v})
                 else:
                     self._set(**{str(k): v})
             else:
diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 3c4bd3f6372f..1a614a3ddb20 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -1,6 +1,8 @@
 """Xgboost pyspark integration submodule for estimator API."""
 # pylint: disable=too-many-ancestors
 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
+# pylint: disable=unused-argument, too-many-locals
+
 
 from typing import Any, Dict, List, Optional, Type, Union
 
@@ -139,7 +141,8 @@ class SparkXGBRegressor(_SparkXGBEstimator):
         Boolean value to specify whether the executors are running on GPU
         instances.
     force_repartition:
-        Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training.
+        Boolean value to specify if forcing the input dataset to be repartitioned
+        before XGBoost training.
     repartition_random_shuffle
         Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
     enable_sparse_data_optim:
@@ -148,7 +151,8 @@ class SparkXGBRegressor(_SparkXGBEstimator):
         dense matrix.
 
     xgboost_parameters:
-        A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html
+        A dictionary of xgboost parameters, please refer to
+        https://xgboost.readthedocs.io/en/stable/parameter.html
 
     .. Note:: The Parameters chart above contains parameters that need special handling.
         For a full list of parameters, see entries with `Param(parent=...` below.
@@ -305,7 +309,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         Boolean value to specify whether the executors are running on GPU
         instances.
     force_repartition:
-        Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training.
+        Boolean value to specify if forcing the input dataset to be repartitioned
+        before XGBoost training.
     repartition_random_shuffle
         Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
     enable_sparse_data_optim:
@@ -314,7 +319,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         dense matrix.
 
     xgboost_parameters:
-        A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html
+        A dictionary of xgboost parameters, please refer to
+        https://xgboost.readthedocs.io/en/stable/parameter.html
 
     .. Note:: The Parameters chart above contains parameters that need special handling.
         For a full list of parameters, see entries with `Param(parent=...` below.
@@ -475,7 +481,8 @@ class SparkXGBRanker(_SparkXGBEstimator):
         Boolean value to specify whether the executors are running on GPU
         instances.
     force_repartition:
-        Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training.
+        Boolean value to specify if forcing the input dataset to be repartitioned
+        before XGBoost training.
     repartition_random_shuffle
         Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
     enable_sparse_data_optim:
@@ -484,7 +491,8 @@ class SparkXGBRanker(_SparkXGBEstimator):
         dense matrix.
 
     xgboost_parameters:
-        A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html
+        A dictionary of xgboost parameters, please refer to
+        https://xgboost.readthedocs.io/en/stable/parameter.html
 
     .. Note:: The Parameters chart above contains parameters that need special handling.
         For a full list of parameters, see entries with `Param(parent=...` below.

From 06f52563869a31af32c0af3b7b3d7df491cd1541 Mon Sep 17 00:00:00 2001
From: Bobby Wang <bobwang@nvidia.com>
Date: Fri, 26 May 2023 17:04:41 +0800
Subject: [PATCH 03/13] Update python-package/xgboost/spark/estimator.py

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/spark/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 1a614a3ddb20..7363685dc6e7 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -457,7 +457,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
         For params related to `xgboost.XGBClassifier` training with
         evaluation dataset's supervision,
         set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
-        parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier`
+        parameter instead of setting the `eval_set` parameter in :py:class:`xgboost.XGBClassifier`
         fit method.
     weight_col:
         To specify the weight of the training and validation dataset, set

From 5fe69a004eec24ad672aec6eeed69ed50f3220c7 Mon Sep 17 00:00:00 2001
From: Bobby Wang <bobwang@nvidia.com>
Date: Fri, 26 May 2023 17:04:47 +0800
Subject: [PATCH 04/13] Update python-package/xgboost/spark/estimator.py

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/spark/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 7363685dc6e7..b7cb0a85f7bc 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -469,7 +469,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
+        :py:class:`xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
         training.
     qid_col"
         Query id column name.

From 6ee5b7d15410b4c788fbb4ae3d55fb6379366759 Mon Sep 17 00:00:00 2001
From: Bobby Wang <bobwang@nvidia.com>
Date: Fri, 26 May 2023 17:04:57 +0800
Subject: [PATCH 05/13] Update python-package/xgboost/spark/estimator.py

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/spark/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index b7cb0a85f7bc..a8825f3c90dd 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -462,7 +462,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
     weight_col:
         To specify the weight of the training and validation dataset, set
         :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
-        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
+        `sample_weight` and `sample_weight_eval_set` parameter in :py:class:`xgboost.XGBClassifier`
         fit method.    base_margin_col
         Base margin column name
     base_margin_col:

From 50a905ddaaa6678b2b72dd5fa3e2b8d6b946d1b3 Mon Sep 17 00:00:00 2001
From: Bobby Wang <bobwang@nvidia.com>
Date: Fri, 26 May 2023 17:05:04 +0800
Subject: [PATCH 06/13] Update python-package/xgboost/spark/estimator.py

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/spark/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index a8825f3c90dd..4c59f98385ef 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -471,7 +471,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
         instead of setting `base_margin` and `base_margin_eval_set` in the
         :py:class:`xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
         training.
-    qid_col"
+    qid_col:
         Query id column name.
 
     num_workers:

From 136085660ce7c59a86bd044e67cb640ab0bbb048 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Fri, 26 May 2023 17:20:38 +0800
Subject: [PATCH 07/13] Address comments: drop qid_col from regressor/classifier, rename xgboost_parameters to kwargs

---
 python-package/xgboost/spark/estimator.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 4c59f98385ef..9ee7fac51cbe 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -131,8 +131,6 @@ class SparkXGBRegressor(_SparkXGBEstimator):
         instead of setting `base_margin` and `base_margin_eval_set` in the
         `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
         training.
-    qid_col"
-        Query id column name.
 
     num_workers:
         How many XGBoost workers to be used to train.
@@ -150,7 +148,7 @@ class SparkXGBRegressor(_SparkXGBEstimator):
         Xgboost DMatrix object will be constructed from sparse matrix instead of
         dense matrix.
 
-    xgboost_parameters:
+    kwargs:
         A dictionary of xgboost parameters, please refer to
         https://xgboost.readthedocs.io/en/stable/parameter.html
 
@@ -193,13 +191,12 @@ def __init__(
         validation_indicator_col: Optional[str] = None,
         weight_col: Optional[str] = None,
         base_margin_col: Optional[str] = None,
-        qid_col: Optional[str] = None,
         num_workers: int = 1,
         use_gpu: bool = False,
         force_repartition: bool = False,
         repartition_random_shuffle: bool = False,
         enable_sparse_data_optim: bool = False,
-        **xgboost_parameters: Dict[str, Any],
+        **kwargs: Dict[str, Any],
     ) -> None:
         super().__init__()
         input_kwargs = self._input_kwargs
@@ -299,8 +296,6 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         instead of setting `base_margin` and `base_margin_eval_set` in the
         `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
         training.
-    qid_col"
-        Query id column name.
 
     num_workers:
         How many XGBoost workers to be used to train.
@@ -318,7 +313,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         Xgboost DMatrix object will be constructed from sparse matrix instead of
         dense matrix.
 
-    xgboost_parameters:
+    kwargs:
         A dictionary of xgboost parameters, please refer to
         https://xgboost.readthedocs.io/en/stable/parameter.html
 
@@ -362,13 +357,12 @@ def __init__(
         validation_indicator_col: Optional[str] = None,
         weight_col: Optional[str] = None,
         base_margin_col: Optional[str] = None,
-        qid_col: Optional[str] = None,
         num_workers: int = 1,
         use_gpu: bool = False,
         force_repartition: bool = False,
         repartition_random_shuffle: bool = False,
         enable_sparse_data_optim: bool = False,
-        **xgboost_parameters: Dict[str, Any],
+        **kwargs: Dict[str, Any],
     ) -> None:
         super().__init__()
         # The default 'objective' param value comes from sklearn `XGBClassifier` ctor,
@@ -490,7 +484,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
         Xgboost DMatrix object will be constructed from sparse matrix instead of
         dense matrix.
 
-    xgboost_parameters:
+    kwargs:
         A dictionary of xgboost parameters, please refer to
         https://xgboost.readthedocs.io/en/stable/parameter.html
 
@@ -548,7 +542,7 @@ def __init__(
         force_repartition: bool = False,
         repartition_random_shuffle: bool = False,
         enable_sparse_data_optim: bool = False,
-        **xgboost_parameters: Dict[str, Any],
+        **kwargs: Dict[str, Any],
     ) -> None:
         super().__init__()
         input_kwargs = self._input_kwargs

From 7515ef5408fe8742ee8d9a555dba75d73a307f42 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Fri, 26 May 2023 17:33:49 +0800
Subject: [PATCH 08/13] update

---
 python-package/xgboost/spark/estimator.py | 37 +++++++++++------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 9ee7fac51cbe..67df1d1751d2 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -114,22 +114,21 @@ class SparkXGBRegressor(_SparkXGBEstimator):
     pred_contrib_col:
         Contribution prediction column name.
     validation_indicator_col
-        For params related to `xgboost.XGBClassifier` training with
+        For params related to `xgboost.XGBRegressor` training with
         evaluation dataset's supervision,
-        set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
-        parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier`
+        set :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col`
+        parameter instead of setting the `eval_set` parameter in `xgboost.XGBRegressor`
         fit method.
     weight_col:
         To specify the weight of the training and validation dataset, set
-        :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
-        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
-        fit method.    base_margin_col
-        Base margin column name
+        :py:attr:`xgboost.spark.SparkXGBRegressor.weight_col` parameter instead of setting
+        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRegressor`
+        fit method.
     base_margin_col:
         To specify the base margins of the training and validation
-        dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
+        dataset, set :py:attr:`xgboost.spark.SparkXGBRegressor.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
+        `xgboost.XGBRegressor` fit method. Note: this isn't available for distributed
         training.
 
     num_workers:
@@ -288,8 +287,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         To specify the weight of the training and validation dataset, set
         :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
         `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
-        fit method.    base_margin_col
-        Base margin column name
+        fit method.
     base_margin_col:
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
@@ -448,22 +446,21 @@ class SparkXGBRanker(_SparkXGBEstimator):
     pred_contrib_col:
         Contribution prediction column name.
     validation_indicator_col
-        For params related to `xgboost.XGBClassifier` training with
+        For params related to `xgboost.XGBRanker` training with
         evaluation dataset's supervision,
-        set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
-        parameter instead of setting the `eval_set` parameter in :py:class:`xgboost.XGBClassifier`
+        set :py:attr:`xgboost.spark.SparkXGBRanker.validation_indicator_col`
+        parameter instead of setting the `eval_set` parameter in :py:class:`xgboost.XGBRanker`
         fit method.
     weight_col:
         To specify the weight of the training and validation dataset, set
-        :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
-        `sample_weight` and `sample_weight_eval_set` parameter in :py:class:`xgboost.XGBClassifier`
-        fit method.    base_margin_col
-        Base margin column name
+        :py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting
+        `sample_weight` and `sample_weight_eval_set` parameter in :py:class:`xgboost.XGBRanker`
+        fit method.
     base_margin_col:
         To specify the base margins of the training and validation
-        dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
+        dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        :py:class:`xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
+        :py:class:`xgboost.XGBRanker` fit method. Note: this isn't available for distributed
         training.
     qid_col:
         Query id column name.

From 8748ce381a7f3d8369e23e7752949072c896a6e1 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Fri, 26 May 2023 17:48:14 +0800
Subject: [PATCH 09/13] update

---
 python-package/xgboost/spark/estimator.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 67df1d1751d2..d7fb27874698 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -128,8 +128,7 @@ class SparkXGBRegressor(_SparkXGBEstimator):
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBRegressor.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBRegressor` fit method. Note: this isn't available for distributed
-        training.
+        `xgboost.XGBRegressor` fit method.
 
     num_workers:
         How many XGBoost workers to be used to train.
@@ -292,8 +291,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed
-        training.
+        `xgboost.XGBClassifier` fit method.
 
     num_workers:
         How many XGBoost workers to be used to train.
@@ -460,8 +458,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
         To specify the base margins of the training and validation
         dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.base_margin_col` parameter
         instead of setting `base_margin` and `base_margin_eval_set` in the
-        :py:class:`xgboost.XGBRanker` fit method. Note: this isn't available for distributed
-        training.
+        :py:class:`xgboost.XGBRanker` fit method.
     qid_col:
         Query id column name.
 

From 432bd2e4adee4b9540105223eb8ec14bf214bb39 Mon Sep 17 00:00:00 2001
From: Bobby Wang <bobwang@nvidia.com>
Date: Sun, 28 May 2023 05:52:25 +0800
Subject: [PATCH 10/13] Update python-package/xgboost/spark/estimator.py

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/spark/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index d7fb27874698..3ca81247d45f 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -270,7 +270,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         Prediction column name. Default to "prediction"
     probability_col:
         Column name for predicted class conditional probabilities. Default to probabilityCol
-    raw_prediction_col
+    raw_prediction_col:
         The `output_margin=True` is implicitly supported by the
         `rawPredictionCol` output column, which is always returned with the predicted margin
         values.

From 983128755aa3e99f6f366990715b1ccc46dd0ed1 Mon Sep 17 00:00:00 2001
From: Bobby Wang <bobwang@nvidia.com>
Date: Sun, 28 May 2023 05:52:39 +0800
Subject: [PATCH 11/13] Update python-package/xgboost/spark/estimator.py

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/spark/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 3ca81247d45f..b43c4045479a 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -276,7 +276,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         values.
     pred_contrib_col:
         Contribution prediction column name.
-    validation_indicator_col
+    validation_indicator_col:
         For params related to `xgboost.XGBClassifier` training with
         evaluation dataset's supervision,
         set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`

From 83a6f3f379b3423559af74db4dbda9536c74e664 Mon Sep 17 00:00:00 2001
From: Bobby Wang <bobwang@nvidia.com>
Date: Sun, 28 May 2023 05:52:45 +0800
Subject: [PATCH 12/13] Update python-package/xgboost/spark/estimator.py

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
---
 python-package/xgboost/spark/estimator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index b43c4045479a..6c70c50c30a3 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -302,7 +302,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
     force_repartition:
         Boolean value to specify if forcing the input dataset to be repartitioned
         before XGBoost training.
-    repartition_random_shuffle
+    repartition_random_shuffle:
         Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
     enable_sparse_data_optim:
         Boolean value to specify if enabling sparse data optimization, if True,

From 04eb0f4d9465b08614f10e6dcbfa20d7d892ce38 Mon Sep 17 00:00:00 2001
From: Bobby Wang <wbo4958@gmail.com>
Date: Sun, 28 May 2023 06:26:13 +0800
Subject: [PATCH 13/13] comments

---
 python-package/xgboost/spark/estimator.py | 39 +++++++++++++----------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
index 6c70c50c30a3..5054ef0ddb2e 100644
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@@ -86,8 +86,8 @@ class SparkXGBRegressor(_SparkXGBEstimator):
     :py:class:`~pyspark.ml.classification.OneVsRest`
 
     SparkXGBRegressor automatically supports most of the parameters in
-    `xgboost.XGBRegressor` constructor and most of the parameters used in
-    :py:class:`xgboost.XGBRegressor` fit and predict method.
+    :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in
+    :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` methods.
 
     SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`,
     see doc below for more details.
@@ -113,7 +113,7 @@ class SparkXGBRegressor(_SparkXGBEstimator):
         Prediction column name. Default to "prediction"
     pred_contrib_col:
         Contribution prediction column name.
-    validation_indicator_col
+    validation_indicator_col:
         For params related to `xgboost.XGBRegressor` training with
         evaluation dataset's supervision,
         set :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col`
@@ -139,7 +139,7 @@ class SparkXGBRegressor(_SparkXGBEstimator):
     force_repartition:
         Boolean value to specify if forcing the input dataset to be repartitioned
         before XGBoost training.
-    repartition_random_shuffle
+    repartition_random_shuffle:
         Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
     enable_sparse_data_optim:
         Boolean value to specify if enabling sparse data optimization, if True,
@@ -150,10 +150,14 @@ class SparkXGBRegressor(_SparkXGBEstimator):
         A dictionary of xgboost parameters, please refer to
         https://xgboost.readthedocs.io/en/stable/parameter.html
 
-    .. Note:: The Parameters chart above contains parameters that need special handling.
-        For a full list of parameters, see entries with `Param(parent=...` below.
+    Note
+    ----
+
+    The Parameters chart above contains parameters that need special handling.
+    For a full list of parameters, see entries with `Param(parent=...` below.
+
+    This API is experimental.
 
-    .. Note:: This API is experimental.
 
     Examples
     --------
@@ -240,8 +244,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
     :py:class:`~pyspark.ml.classification.OneVsRest`
 
     SparkXGBClassifier automatically supports most of the parameters in
-    `xgboost.XGBClassifier` constructor and most of the parameters used in
-    :py:class:`xgboost.XGBClassifier` fit and predict method.
+    :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in
+    :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` methods.
 
     SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`,
     see doc below for more details.
@@ -313,10 +317,13 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
         A dictionary of xgboost parameters, please refer to
         https://xgboost.readthedocs.io/en/stable/parameter.html
 
-    .. Note:: The Parameters chart above contains parameters that need special handling.
-        For a full list of parameters, see entries with `Param(parent=...` below.
+    Note
+    ----
 
-    .. Note:: This API is experimental.
+    The Parameters chart above contains parameters that need special handling.
+    For a full list of parameters, see entries with `Param(parent=...` below.
+
+    This API is experimental.
 
     Examples
     --------
@@ -413,8 +420,8 @@ class SparkXGBRanker(_SparkXGBEstimator):
     :py:class:`~pyspark.ml.classification.OneVsRest`
 
     SparkXGBRanker automatically supports most of the parameters in
-    `xgboost.XGBRanker` constructor and most of the parameters used in
-    :py:class:`xgboost.XGBRanker` fit and predict method.
+    :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in
+    :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` methods.
 
     SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
     see doc below for more details.
@@ -443,7 +450,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
         Prediction column name. Default to "prediction"
     pred_contrib_col:
         Contribution prediction column name.
-    validation_indicator_col
+    validation_indicator_col:
         For params related to `xgboost.XGBRanker` training with
         evaluation dataset's supervision,
         set :py:attr:`xgboost.spark.SparkXGBRanker.validation_indicator_col`
@@ -471,7 +478,7 @@ class SparkXGBRanker(_SparkXGBEstimator):
     force_repartition:
         Boolean value to specify if forcing the input dataset to be repartitioned
         before XGBoost training.
-    repartition_random_shuffle
+    repartition_random_shuffle:
         Boolean value to specify if randomly shuffling the dataset when repartitioning is required.
     enable_sparse_data_optim:
         Boolean value to specify if enabling sparse data optimization, if True,