From 2903ff20b0a9b763cf1b76974f76e2cb0fad0863 Mon Sep 17 00:00:00 2001
From: fis
Date: Fri, 14 Jan 2022 22:39:52 +0800
Subject: [PATCH] Update document for multi-output and categorical.

* Group together categorical-related parameters.
* Update documents about multioutput and categorical.
---
 demo/guide-python/custom_rmsle.py           |  2 +-
 demo/guide-python/multioutput_regression.py |  2 ++
 doc/tutorials/categorical.rst               |  2 +-
 doc/tutorials/multioutput.rst               | 17 +++++++-------
 python-package/xgboost/sklearn.py           | 26 +++++++++++----------
 5 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/demo/guide-python/custom_rmsle.py b/demo/guide-python/custom_rmsle.py
index 66fbd83a0b18..bc21f9022d81 100644
--- a/demo/guide-python/custom_rmsle.py
+++ b/demo/guide-python/custom_rmsle.py
@@ -7,7 +7,7 @@
 Error (SLE) objective and RMSLE metric as customized functions, then compare it with
 native implementation in XGBoost.
 
-See doc/tutorials/custom_metric_obj.rst for a step by step walkthrough, with other
+See :doc:`/tutorials/custom_metric_obj` for a step-by-step walkthrough, with other
 details.
 
 The `SLE` objective reduces impact of outliers in training dataset, hence here we also
diff --git a/demo/guide-python/multioutput_regression.py b/demo/guide-python/multioutput_regression.py
index a0d0998e6b88..f3f62609c4f4 100644
--- a/demo/guide-python/multioutput_regression.py
+++ b/demo/guide-python/multioutput_regression.py
@@ -5,6 +5,8 @@
 The demo is adopted from scikit-learn:
 
 https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
+
+See :doc:`/tutorials/multioutput` for more information.
 """
 import numpy as np
 import xgboost as xgb
diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst
index f302e5e47e79..c1d93fb45df4 100644
--- a/doc/tutorials/categorical.rst
+++ b/doc/tutorials/categorical.rst
@@ -113,7 +113,7 @@ Miscellaneous
 *************
 
 By default, XGBoost assumes input categories are integers starting from 0 till the number
-of categories :math:`[0, n_categories)`. However, user might provide inputs with invalid
+of categories :math:`[0, n\_categories)`. However, users might provide inputs with invalid
 values due to mistakes or missing values. It can be negative value, integer values that
 can not be accurately represented by 32-bit floating point, or values that are larger than
 actual number of unique categories. During training this is validated but for prediction
diff --git a/doc/tutorials/multioutput.rst b/doc/tutorials/multioutput.rst
index d9af9313e475..0be27ced0b9b 100644
--- a/doc/tutorials/multioutput.rst
+++ b/doc/tutorials/multioutput.rst
@@ -12,14 +12,15 @@
 terminologies related to different multi-output models please refer to the `scikit-learn
 user guide <https://scikit-learn.org/stable/modules/multiclass.html>`_.
 
 Internally, XGBoost builds one model for each target similar to sklearn meta estimators,
-with the added benefit of reusing data and custom objective support. For a worked example
-of regression, see :ref:`sphx_glr_python_examples_multioutput_regression.py`. For
-multi-label classification, the binary relevance strategy is used. Input ``y`` should be
-of shape ``(n_samples, n_classes)`` with each column having a value of 0 or 1 to specify
-whether the sample is labeled as positive for respective class. Given a sample with 3
-output classes and 2 labels, the corresponding `y` should be encoded as ``[1, 0, 1]`` with
-the second class labeled as negative and the rest labeled as positive. At the moment
-XGBoost supports only dense matrix for labels.
+with the added benefit of reusing data and other integrated features like SHAP. For a
+worked example of regression, see
+:ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label
+classification, the binary relevance strategy is used. Input ``y`` should be of shape
+``(n_samples, n_classes)``, with each column having a value of 0 or 1 to specify
+whether the sample is labeled as positive for the respective class. Given a sample
+with 3 output classes and 2 labels, the corresponding ``y`` should be encoded as
+``[1, 0, 1]``, with the second class labeled as negative and the rest labeled as
+positive. At the moment XGBoost supports only dense matrices for labels.
 
 .. code-block:: python
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 54970af6d07a..374958f750c4 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -197,6 +197,18 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]:
         Experimental support for categorical data.  Do not set to true unless you are
         interested in development. Only valid when `gpu_hist` and dataframe are used.
 
+    max_cat_to_onehot : Optional[int]
+
+        .. versionadded:: 1.6.0
+
+        .. note:: This parameter is experimental
+
+        A threshold for deciding whether XGBoost should use one-hot encoding based
+        splits for categorical data. When the number of categories is less than the
+        threshold, one-hot encoding is chosen; otherwise the categories are partitioned
+        into child nodes. Only relevant for regression and binary classification with
+        the `approx` tree method.
+
     eval_metric : Optional[Union[str, List[str], Callable]]
 
         .. versionadded:: 1.6.0
@@ -267,16 +279,6 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]:
             callbacks = [xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
                                                     save_best=True)]
 
-    max_cat_to_onehot : bool
-
-        .. versionadded:: 1.6.0
-
-        A threshold for deciding whether XGBoost should use one-hot encoding based split
-        for categorical data. When number of categories is lesser than the threshold then
-        one-hot encoding is chosen, otherwise the categories will be partitioned into
-        children nodes. Only relevant for regression and binary classification and
-        `approx` tree method.
-
     kwargs : dict, optional
         Keyword arguments for XGBoost Booster object.  Full documentation of parameters
         can be found :doc:`here <parameter>`.
@@ -490,10 +492,10 @@ def __init__(
         validate_parameters: Optional[bool] = None,
         predictor: Optional[str] = None,
         enable_categorical: bool = False,
+        max_cat_to_onehot: Optional[int] = None,
         eval_metric: Optional[Union[str, List[str], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
         callbacks: Optional[List[TrainingCallback]] = None,
-        max_cat_to_onehot: Optional[int] = None,
         **kwargs: Any
     ) -> None:
         if not SKLEARN_INSTALLED:
@@ -530,10 +532,10 @@ def __init__(
         self.validate_parameters = validate_parameters
         self.predictor = predictor
         self.enable_categorical = enable_categorical
+        self.max_cat_to_onehot = max_cat_to_onehot
         self.eval_metric = eval_metric
         self.early_stopping_rounds = early_stopping_rounds
         self.callbacks = callbacks
-        self.max_cat_to_onehot = max_cat_to_onehot
         if kwargs:
             self.kwargs = kwargs
 
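
A few illustrative sketches of the behavior documented above follow; they are not part
of the patch itself. First, for readers following the :doc:`/tutorials/custom_metric_obj`
link, a condensed sketch of the kind of custom objective ``custom_rmsle.py`` implements:
the gradient and hessian come from differentiating the Squared Log Error
``1/2 * (log1p(pred) - log1p(label)) ** 2``. The synthetic data and the clipping
constant are illustrative, not taken from the demo.

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(128, 4))
    y = np.exp(rng.normal(size=128))  # positive targets suit a log-scale error

    dtrain = xgb.DMatrix(X, label=y)

    def squared_log(predt, dtrain):
        # Gradient and hessian of 1/2 * (log1p(pred) - log1p(label)) ** 2.
        label = dtrain.get_label()
        predt[predt < -1] = -1 + 1e-6  # keep log1p(pred) defined
        grad = (np.log1p(predt) - np.log1p(label)) / (predt + 1)
        hess = (-np.log1p(predt) + np.log1p(label) + 1) / np.power(predt + 1, 2)
        return grad, hess

    booster = xgb.train(
        {"tree_method": "hist"}, dtrain, num_boost_round=10, obj=squared_log
    )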
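
The multi-output regression described in ``doc/tutorials/multioutput.rst`` can be
exercised through the sklearn wrapper. A minimal sketch, assuming XGBoost 1.6 and
synthetic data in place of the demo's dataset:

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(1)
    X = rng.normal(size=(100, 10))
    # Targets stacked column-wise: y has shape (n_samples, n_targets).
    y = np.stack([X[:, 0], X[:, 1] ** 2], axis=1) + rng.normal(scale=0.1, size=(100, 2))

    reg = xgb.XGBRegressor(tree_method="hist").fit(X, y)
    assert reg.predict(X).shape == (100, 2)  # one column of predictions per target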
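
Likewise for multi-label classification with the binary relevance strategy: ``y`` is the
dense 0/1 indicator matrix from the tutorial text, where a row like ``[1, 0, 1]`` marks
the first and third classes positive and the second negative. A sketch, assuming
scikit-learn is available to generate synthetic labels:

.. code-block:: python

    import xgboost as xgb
    from sklearn.datasets import make_multilabel_classification

    # Dense indicator labels of shape (n_samples, n_classes) with 0/1 entries.
    X, y = make_multilabel_classification(n_samples=200, n_classes=3, random_state=0)

    clf = xgb.XGBClassifier(tree_method="hist").fit(X, y)
    assert clf.predict(X).shape == y.shape  # one 0/1 column per class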
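
On the ``doc/tutorials/categorical.rst`` point about the valid range
:math:`[0, n\_categories)`: with pandas, converting a raw column to the ``category``
dtype yields codes in exactly that range, with ``-1`` reserved for missing values. A
small illustration on made-up data:

.. code-block:: python

    import pandas as pd

    raw = pd.Series(["b", "a", None, "b"])
    encoded = raw.astype("category")
    # Codes fall in [0, n_categories); missing values become -1.
    print(encoded.cat.codes.tolist())  # [1, 0, -1, 1]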
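
Finally, a sketch of the regrouped ``max_cat_to_onehot`` parameter in use. It assumes a
1.6 build in which the ``approx`` tree method accepts categorical inputs, per the
docstring above; the data and the threshold are made up:

.. code-block:: python

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.default_rng(7)
    X = pd.DataFrame(
        {
            "cat": pd.Categorical(rng.integers(0, 8, size=256)),  # codes in [0, 8)
            "num": rng.normal(size=256),
        }
    )
    y = rng.normal(size=256)

    # Features with fewer than `max_cat_to_onehot` categories get one-hot style
    # splits; larger category sets are partitioned between the two child nodes.
    reg = xgb.XGBRegressor(
        tree_method="approx",
        enable_categorical=True,
        max_cat_to_onehot=4,
    )
    reg.fit(X, y)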