From 1bfb98aca0a34dc43538cd362e2b4a76dd7f87f6 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 10:18:33 +0800 Subject: [PATCH 01/13] [pyspark] add parameters in the ctor of all estimators. --- python-package/xgboost/spark/core.py | 20 +- python-package/xgboost/spark/estimator.py | 252 ++++++++++++++++------ 2 files changed, 198 insertions(+), 74 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index d2eff943c23b..5ea003e79088 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -337,11 +337,9 @@ def _validate_params(self) -> None: if self.getOrDefault(self.features_cols): if not self.getOrDefault(self.use_gpu): - raise ValueError("features_cols param requires enabling use_gpu.") - - get_logger(self.__class__.__name__).warning( - "If features_cols param set, then features_col param is ignored." - ) + raise ValueError( + "features_col param with list value requires enabling use_gpu." + ) if self.getOrDefault("objective") is not None: if not isinstance(self.getOrDefault("objective"), str): @@ -547,6 +545,8 @@ def _get_unwrapped_vec_cols(feature_col: Column) -> List[Column]: class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): + _input_kwargs: Dict[str, Any] + def __init__(self) -> None: super().__init__() self._set_xgb_params_default() @@ -576,6 +576,11 @@ def setParams( raise ValueError("Invalid param name: 'arbitrary_params_dict'.") for k, v in kwargs.items(): + # We're not allowing user use features_cols directly. + if k == self.features_cols.name: + raise ValueError( + f"Unsupported param '{k}' please use features_col instead." + ) if k in _inverse_pyspark_param_alias_map: raise ValueError( f"Please use param name {_inverse_pyspark_param_alias_map[k]} instead." 
@@ -591,7 +596,10 @@ def setParams( k = real_k if self.hasParam(k): - self._set(**{str(k): v}) + if k == "features_col" and isinstance(v, list): + self._set({"features_cols": v}) + else: + self._set(**{str(k): v}) else: if ( k in _unsupported_xgb_params diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 67eea4ad2d28..3c4bd3f6372f 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -2,9 +2,10 @@ # pylint: disable=too-many-ancestors # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name -from typing import Any, Type +from typing import Any, Dict, List, Optional, Type, Union import numpy as np +from pyspark import keyword_only from pyspark.ml.param import Param, Params from pyspark.ml.param.shared import HasProbabilityCol, HasRawPredictionCol @@ -97,35 +98,57 @@ class SparkXGBRegressor(_SparkXGBEstimator): SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread` param for each xgboost worker will be set equal to `spark.task.cpus` config value. - callbacks: - The export and import of the callback functions are at best effort. - For details, see :py:attr:`xgboost.spark.SparkXGBRegressor.callbacks` param doc. + + Parameters + ---------- + + features_col: + When the value is string, it requires the features column name to be vector type. + When the value is a list of string, it requires all the feature columns to be numeric types. + label_col: + Label column name. Default to "label". + prediction_col: + Prediction column name. Default to "prediction" + pred_contrib_col: + Contribution prediction column name. 
validation_indicator_col - For params related to `xgboost.XGBRegressor` training - with evaluation dataset's supervision, set - :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col` - parameter instead of setting the `eval_set` parameter in `xgboost.XGBRegressor` + For params related to `xgboost.XGBClassifier` training with + evaluation dataset's supervision, + set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col` + parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier` fit method. weight_col: To specify the weight of the training and validation dataset, set - :py:attr:`xgboost.spark.SparkXGBRegressor.weight_col` parameter instead of setting - `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRegressor` - fit method. - xgb_model: - Set the value to be the instance returned by - :func:`xgboost.spark.SparkXGBRegressorModel.get_booster`. - num_workers: - Integer that specifies the number of XGBoost workers to use. - Each XGBoost worker corresponds to one spark task. - use_gpu: - Boolean that specifies whether the executors are running on GPU - instances. + :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting + `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier` + fit method. base_margin_col + Base margin column name base_margin_col: To specify the base margins of the training and validation - dataset, set :py:attr:`xgboost.spark.SparkXGBRegressor.base_margin_col` parameter + dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBRegressor` fit method. Note: this isn't available for distributed + `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed training. + qid_col" + Query id column name. + + num_workers: + How many XGBoost workers to be used to train. 
+ Each XGBoost worker corresponds to one spark task. + use_gpu: + Boolean value to specify whether the executors are running on GPU + instances. + force_repartition: + Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. + repartition_random_shuffle + Boolean value to specify if randomly shuffling the dataset when repartitioning is required. + enable_sparse_data_optim: + Boolean value to specify if enabling sparse data optimization, if True, + Xgboost DMatrix object will be constructed from sparse matrix instead of + dense matrix. + + xgboost_parameters: + A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html .. Note:: The Parameters chart above contains parameters that need special handling. For a full list of parameters, see entries with `Param(parent=...` below. @@ -155,9 +178,28 @@ class SparkXGBRegressor(_SparkXGBEstimator): """ - def __init__(self, **kwargs: Any) -> None: + @keyword_only + def __init__( + self, + *, + features_col: Union[str, List[str]] = "features", + label_col: str = "label", + prediction_col: str = "prediction", + pred_contrib_col: Optional[str] = None, + validation_indicator_col: Optional[str] = None, + weight_col: Optional[str] = None, + base_margin_col: Optional[str] = None, + qid_col: Optional[str] = None, + num_workers: int = 1, + use_gpu: bool = False, + force_repartition: bool = False, + repartition_random_shuffle: bool = False, + enable_sparse_data_optim: bool = False, + **xgboost_parameters: Dict[str, Any], + ) -> None: super().__init__() - self.setParams(**kwargs) + input_kwargs = self._input_kwargs + self.setParams(**input_kwargs) @classmethod def _xgb_cls(cls) -> Type[XGBRegressor]: @@ -220,14 +262,22 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction Parameters ---------- - callbacks: - The export and import of the callback functions are at best effort. 
For - details, see :py:attr:`xgboost.spark.SparkXGBClassifier.callbacks` param doc. - raw_prediction_col: + features_col: + When the value is string, it requires the features column name to be vector type. + When the value is a list of string, it requires all the feature columns to be numeric types. + label_col: + Label column name. Default to "label". + prediction_col: + Prediction column name. Default to "prediction" + probability_col: + Column name for predicted class conditional probabilities. Default to probabilityCol + raw_prediction_col The `output_margin=True` is implicitly supported by the `rawPredictionCol` output column, which is always returned with the predicted margin values. - validation_indicator_col: + pred_contrib_col: + Contribution prediction column name. + validation_indicator_col For params related to `xgboost.XGBClassifier` training with evaluation dataset's supervision, set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col` @@ -237,22 +287,34 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction To specify the weight of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier` - fit method. - xgb_model: - Set the value to be the instance returned by - :func:`xgboost.spark.SparkXGBClassifierModel.get_booster`. - num_workers: - Integer that specifies the number of XGBoost workers to use. - Each XGBoost worker corresponds to one spark task. - use_gpu: - Boolean that specifies whether the executors are running on GPU - instances. + fit method. base_margin_col + Base margin column name base_margin_col: To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the `xgboost.XGBClassifier` fit method. 
Note: this isn't available for distributed training. + qid_col" + Query id column name. + + num_workers: + How many XGBoost workers to be used to train. + Each XGBoost worker corresponds to one spark task. + use_gpu: + Boolean value to specify whether the executors are running on GPU + instances. + force_repartition: + Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. + repartition_random_shuffle + Boolean value to specify if randomly shuffling the dataset when repartitioning is required. + enable_sparse_data_optim: + Boolean value to specify if enabling sparse data optimization, if True, + Xgboost DMatrix object will be constructed from sparse matrix instead of + dense matrix. + + xgboost_parameters: + A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html .. Note:: The Parameters chart above contains parameters that need special handling. For a full list of parameters, see entries with `Param(parent=...` below. 
@@ -281,14 +343,35 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction """ - def __init__(self, **kwargs: Any) -> None: + @keyword_only + def __init__( + self, + *, + features_col: Union[str, List[str]] = "features", + label_col: str = "label", + prediction_col: str = "prediction", + probability_col: str = "probability", + raw_prediction_col: str = "rawPrediction", + pred_contrib_col: Optional[str] = None, + validation_indicator_col: Optional[str] = None, + weight_col: Optional[str] = None, + base_margin_col: Optional[str] = None, + qid_col: Optional[str] = None, + num_workers: int = 1, + use_gpu: bool = False, + force_repartition: bool = False, + repartition_random_shuffle: bool = False, + enable_sparse_data_optim: bool = False, + **xgboost_parameters: Dict[str, Any], + ) -> None: super().__init__() # The default 'objective' param value comes from sklearn `XGBClassifier` ctor, # but in pyspark we will automatically set objective param depending on # binary or multinomial input dataset, and we need to remove the fixed default # param value as well to avoid causing ambiguity. + input_kwargs = self._input_kwargs + self.setParams(**input_kwargs) self._setDefault(objective=None) - self.setParams(**kwargs) @classmethod def _xgb_cls(cls) -> Type[XGBClassifier]: @@ -355,39 +438,53 @@ class SparkXGBRanker(_SparkXGBEstimator): Parameters ---------- - callbacks: - The export and import of the callback functions are at best effort. For - details, see :py:attr:`xgboost.spark.SparkXGBRanker.callbacks` param doc. - validation_indicator_col: - For params related to `xgboost.XGBRanker` training with + features_col: + When the value is string, it requires the features column name to be vector type. + When the value is a list of string, it requires all the feature columns to be numeric types. + label_col: + Label column name. Default to "label". + prediction_col: + Prediction column name. 
Default to "prediction" + pred_contrib_col: + Contribution prediction column name. + validation_indicator_col + For params related to `xgboost.XGBClassifier` training with evaluation dataset's supervision, - set :py:attr:`xgboost.spark.XGBRanker.validation_indicator_col` - parameter instead of setting the `eval_set` parameter in `xgboost.XGBRanker` + set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col` + parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier` fit method. weight_col: To specify the weight of the training and validation dataset, set - :py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting - `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRanker` - fit method. - xgb_model: - Set the value to be the instance returned by - :func:`xgboost.spark.SparkXGBRankerModel.get_booster`. + :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting + `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier` + fit method. base_margin_col + Base margin column name + base_margin_col: + To specify the base margins of the training and validation + dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter + instead of setting `base_margin` and `base_margin_eval_set` in the + `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed + training. + qid_col" + Query id column name. + num_workers: - Integer that specifies the number of XGBoost workers to use. + How many XGBoost workers to be used to train. Each XGBoost worker corresponds to one spark task. use_gpu: - Boolean that specifies whether the executors are running on GPU + Boolean value to specify whether the executors are running on GPU instances. 
- base_margin_col: - To specify the base margins of the training and validation - dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.base_margin_col` parameter - instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBRanker` fit method. - qid_col: - To specify the qid of the training and validation - dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.qid_col` parameter - instead of setting `qid` / `group`, `eval_qid` / `eval_group` in the - `xgboost.XGBRanker` fit method. + force_repartition: + Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. + repartition_random_shuffle + Boolean value to specify if randomly shuffling the dataset when repartitioning is required. + enable_sparse_data_optim: + Boolean value to specify if enabling sparse data optimization, if True, + Xgboost DMatrix object will be constructed from sparse matrix instead of + dense matrix. + + xgboost_parameters: + A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html .. Note:: The Parameters chart above contains parameters that need special handling. For a full list of parameters, see entries with `Param(parent=...` below. 
@@ -426,9 +523,28 @@ class SparkXGBRanker(_SparkXGBEstimator): >>> model.transform(df_test).show() """ - def __init__(self, **kwargs: Any) -> None: + @keyword_only + def __init__( + self, + *, + features_col: Union[str, List[str]] = "features", + label_col: str = "label", + prediction_col: str = "prediction", + pred_contrib_col: Optional[str] = None, + validation_indicator_col: Optional[str] = None, + weight_col: Optional[str] = None, + base_margin_col: Optional[str] = None, + qid_col: Optional[str] = None, + num_workers: int = 1, + use_gpu: bool = False, + force_repartition: bool = False, + repartition_random_shuffle: bool = False, + enable_sparse_data_optim: bool = False, + **xgboost_parameters: Dict[str, Any], + ) -> None: super().__init__() - self.setParams(**kwargs) + input_kwargs = self._input_kwargs + self.setParams(**input_kwargs) @classmethod def _xgb_cls(cls) -> Type[XGBRanker]: From ee12a3d28a654dcd15ede83d125573a2ca4144ac Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 11:12:52 +0800 Subject: [PATCH 02/13] pylint --- python-package/xgboost/spark/core.py | 2 +- python-package/xgboost/spark/estimator.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 5ea003e79088..0181e678d201 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -597,7 +597,7 @@ def setParams( if self.hasParam(k): if k == "features_col" and isinstance(v, list): - self._set({"features_cols": v}) + self._set(**{"features_cols": v}) else: self._set(**{str(k): v}) else: diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 3c4bd3f6372f..1a614a3ddb20 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -1,6 +1,8 @@ """Xgboost pyspark integration submodule for estimator API.""" # pylint: 
disable=too-many-ancestors # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name +# pylint: disable=unused-argument, too-many-locals + from typing import Any, Dict, List, Optional, Type, Union @@ -139,7 +141,8 @@ class SparkXGBRegressor(_SparkXGBEstimator): Boolean value to specify whether the executors are running on GPU instances. force_repartition: - Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. + Boolean value to specify if forcing the input dataset to be repartitioned + before XGBoost training. repartition_random_shuffle Boolean value to specify if randomly shuffling the dataset when repartitioning is required. enable_sparse_data_optim: @@ -148,7 +151,8 @@ class SparkXGBRegressor(_SparkXGBEstimator): dense matrix. xgboost_parameters: - A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html + A dictionary of xgboost parameters, please refer to + https://xgboost.readthedocs.io/en/stable/parameter.html .. Note:: The Parameters chart above contains parameters that need special handling. For a full list of parameters, see entries with `Param(parent=...` below. @@ -305,7 +309,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction Boolean value to specify whether the executors are running on GPU instances. force_repartition: - Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. + Boolean value to specify if forcing the input dataset to be repartitioned + before XGBoost training. repartition_random_shuffle Boolean value to specify if randomly shuffling the dataset when repartitioning is required. enable_sparse_data_optim: @@ -314,7 +319,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction dense matrix. 
xgboost_parameters: - A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html + A dictionary of xgboost parameters, please refer to + https://xgboost.readthedocs.io/en/stable/parameter.html .. Note:: The Parameters chart above contains parameters that need special handling. For a full list of parameters, see entries with `Param(parent=...` below. @@ -475,7 +481,8 @@ class SparkXGBRanker(_SparkXGBEstimator): Boolean value to specify whether the executors are running on GPU instances. force_repartition: - Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. + Boolean value to specify if forcing the input dataset to be repartitioned + before XGBoost training. repartition_random_shuffle Boolean value to specify if randomly shuffling the dataset when repartitioning is required. enable_sparse_data_optim: @@ -484,7 +491,8 @@ class SparkXGBRanker(_SparkXGBEstimator): dense matrix. xgboost_parameters: - A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html + A dictionary of xgboost parameters, please refer to + https://xgboost.readthedocs.io/en/stable/parameter.html .. Note:: The Parameters chart above contains parameters that need special handling. For a full list of parameters, see entries with `Param(parent=...` below. 
From 06f52563869a31af32c0af3b7b3d7df491cd1541 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 17:04:41 +0800 Subject: [PATCH 03/13] Update python-package/xgboost/spark/estimator.py Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 1a614a3ddb20..7363685dc6e7 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -457,7 +457,7 @@ class SparkXGBRanker(_SparkXGBEstimator): For params related to `xgboost.XGBClassifier` training with evaluation dataset's supervision, set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col` - parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier` + parameter instead of setting the `eval_set` parameter in :py:class:`xgboost.XGBClassifier` fit method. weight_col: To specify the weight of the training and validation dataset, set From 5fe69a004eec24ad672aec6eeed69ed50f3220c7 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 17:04:47 +0800 Subject: [PATCH 04/13] Update python-package/xgboost/spark/estimator.py Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 7363685dc6e7..b7cb0a85f7bc 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -469,7 +469,7 @@ class SparkXGBRanker(_SparkXGBEstimator): To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBClassifier` fit method. 
Note: this isn't available for distributed + :py:class:`xgboost.XGBClassifier` fit method. Note: this isn't available for distributed training. qid_col" Query id column name. From 6ee5b7d15410b4c788fbb4ae3d55fb6379366759 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 17:04:57 +0800 Subject: [PATCH 05/13] Update python-package/xgboost/spark/estimator.py Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index b7cb0a85f7bc..a8825f3c90dd 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -462,7 +462,7 @@ class SparkXGBRanker(_SparkXGBEstimator): weight_col: To specify the weight of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting - `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier` + `sample_weight` and `sample_weight_eval_set` parameter in :py:class:`xgboost.XGBClassifier` fit method. base_margin_col Base margin column name base_margin_col: From 50a905ddaaa6678b2b72dd5fa3e2b8d6b946d1b3 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 17:05:04 +0800 Subject: [PATCH 06/13] Update python-package/xgboost/spark/estimator.py Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index a8825f3c90dd..4c59f98385ef 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -471,7 +471,7 @@ class SparkXGBRanker(_SparkXGBEstimator): instead of setting `base_margin` and `base_margin_eval_set` in the :py:class:`xgboost.XGBClassifier` fit method. Note: this isn't available for distributed training. 
- qid_col" + qid_col: Query id column name. num_workers: From 136085660ce7c59a86bd044e67cb640ab0bbb048 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 17:20:38 +0800 Subject: [PATCH 07/13] comments --- python-package/xgboost/spark/estimator.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 4c59f98385ef..9ee7fac51cbe 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -131,8 +131,6 @@ class SparkXGBRegressor(_SparkXGBEstimator): instead of setting `base_margin` and `base_margin_eval_set` in the `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed training. - qid_col" - Query id column name. num_workers: How many XGBoost workers to be used to train. @@ -150,7 +148,7 @@ class SparkXGBRegressor(_SparkXGBEstimator): Xgboost DMatrix object will be constructed from sparse matrix instead of dense matrix. - xgboost_parameters: + kwargs: A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html @@ -193,13 +191,12 @@ def __init__( validation_indicator_col: Optional[str] = None, weight_col: Optional[str] = None, base_margin_col: Optional[str] = None, - qid_col: Optional[str] = None, num_workers: int = 1, use_gpu: bool = False, force_repartition: bool = False, repartition_random_shuffle: bool = False, enable_sparse_data_optim: bool = False, - **xgboost_parameters: Dict[str, Any], + **kwargs: Dict[str, Any], ) -> None: super().__init__() input_kwargs = self._input_kwargs @@ -299,8 +296,6 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction instead of setting `base_margin` and `base_margin_eval_set` in the `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed training. - qid_col" - Query id column name. 
num_workers: How many XGBoost workers to be used to train. @@ -318,7 +313,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction Xgboost DMatrix object will be constructed from sparse matrix instead of dense matrix. - xgboost_parameters: + kwargs: A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html @@ -362,13 +357,12 @@ def __init__( validation_indicator_col: Optional[str] = None, weight_col: Optional[str] = None, base_margin_col: Optional[str] = None, - qid_col: Optional[str] = None, num_workers: int = 1, use_gpu: bool = False, force_repartition: bool = False, repartition_random_shuffle: bool = False, enable_sparse_data_optim: bool = False, - **xgboost_parameters: Dict[str, Any], + **kwargs: Dict[str, Any], ) -> None: super().__init__() # The default 'objective' param value comes from sklearn `XGBClassifier` ctor, @@ -490,7 +484,7 @@ class SparkXGBRanker(_SparkXGBEstimator): Xgboost DMatrix object will be constructed from sparse matrix instead of dense matrix. 
- xgboost_parameters: + kwargs: A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html @@ -548,7 +542,7 @@ def __init__( force_repartition: bool = False, repartition_random_shuffle: bool = False, enable_sparse_data_optim: bool = False, - **xgboost_parameters: Dict[str, Any], + **kwargs: Dict[str, Any], ) -> None: super().__init__() input_kwargs = self._input_kwargs From 7515ef5408fe8742ee8d9a555dba75d73a307f42 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 17:33:49 +0800 Subject: [PATCH 08/13] update --- python-package/xgboost/spark/estimator.py | 37 +++++++++++------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 9ee7fac51cbe..67df1d1751d2 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -114,22 +114,21 @@ class SparkXGBRegressor(_SparkXGBEstimator): pred_contrib_col: Contribution prediction column name. validation_indicator_col - For params related to `xgboost.XGBClassifier` training with + For params related to `xgboost.XGBRegressor` training with evaluation dataset's supervision, - set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col` - parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier` + set :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col` + parameter instead of setting the `eval_set` parameter in `xgboost.XGBRegressor` fit method. weight_col: To specify the weight of the training and validation dataset, set - :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting - `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier` - fit method. 
base_margin_col - Base margin column name + :py:attr:`xgboost.spark.SparkXGBRegressor.weight_col` parameter instead of setting + `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRegressor` + fit method. base_margin_col: To specify the base margins of the training and validation - dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter + dataset, set :py:attr:`xgboost.spark.SparkXGBRegressor.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed + `xgboost.XGBRegressor` fit method. Note: this isn't available for distributed training. num_workers: @@ -288,8 +287,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction To specify the weight of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier` - fit method. base_margin_col - Base margin column name + fit method. base_margin_col: To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter @@ -448,22 +446,21 @@ class SparkXGBRanker(_SparkXGBEstimator): pred_contrib_col: Contribution prediction column name. validation_indicator_col - For params related to `xgboost.XGBClassifier` training with + For params related to `xgboost.XGBRanker` training with evaluation dataset's supervision, - set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col` - parameter instead of setting the `eval_set` parameter in :py:class:`xgboost.XGBClassifier` + set :py:attr:`xgboost.spark.SparkXGBRanker.validation_indicator_col` + parameter instead of setting the `eval_set` parameter in :py:class:`xgboost.XGBRanker` fit method. 
weight_col: To specify the weight of the training and validation dataset, set - :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting - `sample_weight` and `sample_weight_eval_set` parameter in :py:class:`xgboost.XGBClassifier` - fit method. base_margin_col - Base margin column name + :py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting + `sample_weight` and `sample_weight_eval_set` parameter in :py:class:`xgboost.XGBRanker` + fit method. base_margin_col: To specify the base margins of the training and validation - dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter + dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - :py:class:`xgboost.XGBClassifier` fit method. Note: this isn't available for distributed + :py:class:`xgboost.XGBRanker` fit method. Note: this isn't available for distributed training. qid_col: Query id column name. From 8748ce381a7f3d8369e23e7752949072c896a6e1 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Fri, 26 May 2023 17:48:14 +0800 Subject: [PATCH 09/13] update --- python-package/xgboost/spark/estimator.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 67df1d1751d2..d7fb27874698 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -128,8 +128,7 @@ class SparkXGBRegressor(_SparkXGBEstimator): To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBRegressor.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBRegressor` fit method. Note: this isn't available for distributed - training. + `xgboost.XGBRegressor` fit method. num_workers: How many XGBoost workers to be used to train. 
@@ -292,8 +291,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBClassifier.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - `xgboost.XGBClassifier` fit method. Note: this isn't available for distributed - training. + `xgboost.XGBClassifier` fit method. num_workers: How many XGBoost workers to be used to train. @@ -460,8 +458,7 @@ class SparkXGBRanker(_SparkXGBEstimator): To specify the base margins of the training and validation dataset, set :py:attr:`xgboost.spark.SparkXGBRanker.base_margin_col` parameter instead of setting `base_margin` and `base_margin_eval_set` in the - :py:class:`xgboost.XGBRanker` fit method. Note: this isn't available for distributed - training. + :py:class:`xgboost.XGBRanker` fit method. qid_col: Query id column name. From 432bd2e4adee4b9540105223eb8ec14bf214bb39 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Sun, 28 May 2023 05:52:25 +0800 Subject: [PATCH 10/13] Update python-package/xgboost/spark/estimator.py Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index d7fb27874698..3ca81247d45f 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -270,7 +270,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction Prediction column name. Default to "prediction" probability_col: Column name for predicted class conditional probabilities. Default to probabilityCol - raw_prediction_col + raw_prediction_col: The `output_margin=True` is implicitly supported by the `rawPredictionCol` output column, which is always returned with the predicted margin values. 
From 983128755aa3e99f6f366990715b1ccc46dd0ed1 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Sun, 28 May 2023 05:52:39 +0800 Subject: [PATCH 11/13] Update python-package/xgboost/spark/estimator.py Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 3ca81247d45f..b43c4045479a 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -276,7 +276,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction values. pred_contrib_col: Contribution prediction column name. - validation_indicator_col + validation_indicator_col: For params related to `xgboost.XGBClassifier` training with evaluation dataset's supervision, set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col` From 83a6f3f379b3423559af74db4dbda9536c74e664 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Sun, 28 May 2023 05:52:45 +0800 Subject: [PATCH 12/13] Update python-package/xgboost/spark/estimator.py Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index b43c4045479a..6c70c50c30a3 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -302,7 +302,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. - repartition_random_shuffle + repartition_random_shuffle: Boolean value to specify if randomly shuffling the dataset when repartitioning is required. 
enable_sparse_data_optim: Boolean value to specify if enabling sparse data optimization, if True, From 04eb0f4d9465b08614f10e6dcbfa20d7d892ce38 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Sun, 28 May 2023 06:26:13 +0800 Subject: [PATCH 13/13] comments --- python-package/xgboost/spark/estimator.py | 39 +++++++++++++---------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 6c70c50c30a3..5054ef0ddb2e 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -86,8 +86,8 @@ class SparkXGBRegressor(_SparkXGBEstimator): :py:class:`~pyspark.ml.classification.OneVsRest` SparkXGBRegressor automatically supports most of the parameters in - `xgboost.XGBRegressor` constructor and most of the parameters used in - :py:class:`xgboost.XGBRegressor` fit and predict method. + :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in + :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method. SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`, see doc below for more details. @@ -113,7 +113,7 @@ class SparkXGBRegressor(_SparkXGBEstimator): Prediction column name. Default to "prediction" pred_contrib_col: Contribution prediction column name. - validation_indicator_col + validation_indicator_col: For params related to `xgboost.XGBRegressor` training with evaluation dataset's supervision, set :py:attr:`xgboost.spark.SparkXGBRegressor.validation_indicator_col` @@ -139,7 +139,7 @@ class SparkXGBRegressor(_SparkXGBEstimator): force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. - repartition_random_shuffle + repartition_random_shuffle: Boolean value to specify if randomly shuffling the dataset when repartitioning is required. 
enable_sparse_data_optim: Boolean value to specify if enabling sparse data optimization, if True, @@ -150,10 +150,14 @@ class SparkXGBRegressor(_SparkXGBEstimator): A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html - .. Note:: The Parameters chart above contains parameters that need special handling. - For a full list of parameters, see entries with `Param(parent=...` below. + Note + ---- + + The Parameters chart above contains parameters that need special handling. + For a full list of parameters, see entries with `Param(parent=...` below. + + This API is experimental. - .. Note:: This API is experimental. Examples -------- @@ -240,8 +244,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction :py:class:`~pyspark.ml.classification.OneVsRest` SparkXGBClassifier automatically supports most of the parameters in - `xgboost.XGBClassifier` constructor and most of the parameters used in - :py:class:`xgboost.XGBClassifier` fit and predict method. + :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in + :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method. SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`, see doc below for more details. @@ -313,10 +317,13 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction A dictionary of xgboost parameters, please refer to https://xgboost.readthedocs.io/en/stable/parameter.html - .. Note:: The Parameters chart above contains parameters that need special handling. - For a full list of parameters, see entries with `Param(parent=...` below. + Note + ---- - .. Note:: This API is experimental. + The Parameters chart above contains parameters that need special handling. + For a full list of parameters, see entries with `Param(parent=...` below. + + This API is experimental. 
Examples -------- @@ -413,8 +420,8 @@ class SparkXGBRanker(_SparkXGBEstimator): :py:class:`~pyspark.ml.classification.OneVsRest` SparkXGBRanker automatically supports most of the parameters in - `xgboost.XGBRanker` constructor and most of the parameters used in - :py:class:`xgboost.XGBRanker` fit and predict method. + :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in + :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method. SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`, see doc below for more details. @@ -443,7 +450,7 @@ class SparkXGBRanker(_SparkXGBEstimator): Prediction column name. Default to "prediction" pred_contrib_col: Contribution prediction column name. - validation_indicator_col + validation_indicator_col: For params related to `xgboost.XGBRanker` training with evaluation dataset's supervision, set :py:attr:`xgboost.spark.SparkXGBRanker.validation_indicator_col` @@ -471,7 +478,7 @@ class SparkXGBRanker(_SparkXGBEstimator): force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. - repartition_random_shuffle + repartition_random_shuffle: Boolean value to specify if randomly shuffling the dataset when repartitioning is required. enable_sparse_data_optim: Boolean value to specify if enabling sparse data optimization, if True,