-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add document for categorical data. (#7307)
- Loading branch information
1 parent
a7d0c66
commit 0bd8f21
Showing
4 changed files
with
239 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
"""A simple demo for categorical data support using dataset from Kaggle categorical data | ||
tutorial. | ||
The excellent tutorial is at: | ||
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques | ||
And the data can be found at: | ||
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data | ||
.. versionadded 1.6.0 | ||
""" | ||
|
||
from __future__ import annotations | ||
from time import time | ||
import os | ||
from tempfile import TemporaryDirectory | ||
|
||
import pandas as pd | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.metrics import roc_auc_score | ||
|
||
import xgboost as xgb | ||
|
||
|
||
def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]: | ||
"""Assuming you have already downloaded the data into `input` directory.""" | ||
|
||
df_train = pd.read_csv("./input/cat-in-the-dat/train.csv") | ||
|
||
print( | ||
"train data set has got {} rows and {} columns".format( | ||
df_train.shape[0], df_train.shape[1] | ||
) | ||
) | ||
X = df_train.drop(["target"], axis=1) | ||
y = df_train["target"] | ||
|
||
for i in range(0, 5): | ||
X["bin_" + str(i)] = X["bin_" + str(i)].astype("category") | ||
|
||
for i in range(0, 5): | ||
X["nom_" + str(i)] = X["nom_" + str(i)].astype("category") | ||
|
||
for i in range(5, 10): | ||
X["nom_" + str(i)] = X["nom_" + str(i)].apply(int, base=16) | ||
|
||
for i in range(0, 6): | ||
X["ord_" + str(i)] = X["ord_" + str(i)].astype("category") | ||
|
||
print(X.shape) | ||
|
||
print( | ||
"train data set has got {} rows and {} columns".format(X.shape[0], X.shape[1]) | ||
) | ||
return X, y | ||
|
||
|
||
params = {"tree_method": "gpu_hist", "use_label_encoder": False, "n_estimators": 32} | ||
|
||
|
||
def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None: | ||
"""Train using builtin categorical data support from XGBoost""" | ||
X_train, X_test, y_train, y_test = train_test_split( | ||
X, y, random_state=1994, test_size=0.2 | ||
) | ||
|
||
clf = xgb.XGBClassifier(**params, enable_categorical=True) | ||
clf.fit( | ||
X_train, | ||
y_train, | ||
eval_set=[(X_test, y_test), (X_train, y_train)], | ||
eval_metric="auc", | ||
) | ||
print(clf.n_classes_) | ||
clf.save_model(os.path.join(output_dir, "categorical.json")) | ||
|
||
y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples | ||
auc = roc_auc_score(y_test, y_score) | ||
print("AUC of using builtin categorical data support:", auc) | ||
|
||
|
||
def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None: | ||
"""Train using one-hot encoded data.""" | ||
|
||
X_train, X_test, y_train, y_test = train_test_split( | ||
X, y, random_state=42, test_size=0.2 | ||
) | ||
print(X_train.shape, y_train.shape) | ||
|
||
clf = xgb.XGBClassifier(**params, enable_categorical=False) | ||
clf.fit( | ||
X_train, | ||
y_train, | ||
eval_set=[(X_test, y_test), (X_train, y_train)], | ||
eval_metric="auc", | ||
) | ||
clf.save_model(os.path.join(output_dir, "one-hot.json")) | ||
|
||
y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples | ||
auc = roc_auc_score(y_test, y_score) | ||
print("AUC of using onehot encoding:", auc) | ||
|
||
|
||
if __name__ == "__main__": | ||
X, y = load_cat_in_the_dat() | ||
|
||
with TemporaryDirectory() as tmpdir: | ||
start = time() | ||
categorical_model(X, y, tmpdir) | ||
end = time() | ||
print("Duration:categorical", end - start) | ||
|
||
X = pd.get_dummies(X) | ||
start = time() | ||
onehot_encoding_model(X, y, tmpdir) | ||
end = time() | ||
print("Duration:onehot", end - start) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
################ | ||
Categorical Data | ||
################ | ||
|
||
Starting from version 1.5, XGBoost has experimental support for categorical data available | ||
for public testing. At the moment, the support is implemented as one-hot encoding based | ||
categorical tree splits. For numerical data, the split condition is defined as | ||
:math:`value < threshold`, while for categorical data the split is defined as :math:`value | ||
== category` and ``category`` is a discrete value. More advanced categorical split | ||
strategy is planned for future releases and this tutorial details how to inform XGBoost | ||
about the data type. Also, the current support for training is limited to ``gpu_hist`` | ||
tree method. | ||
|
||
************************************ | ||
Training with scikit-learn Interface | ||
************************************ | ||
|
||
The easiest way to pass categorical data into XGBoost is using dataframe and the | ||
``scikit-learn`` interface like :class:`XGBClassifier <xgboost.XGBClassifier>`. For | ||
preparing the data, users need to specify the data type of input predictor as | ||
``category``. For ``pandas/cudf Dataframe``, this can be achieved by | ||
|
||
.. code:: python | ||
X["cat_feature"].astype("category") | ||
for all columns that represent categorical features. After which, users can tell XGBoost | ||
to enable training with categorical data. Assuming that you are using the | ||
:class:`XGBClassifier <xgboost.XGBClassifier>` for classification problem, specify the | ||
parameter ``enable_categorical``: | ||
|
||
.. code:: python | ||
# Only gpu_hist is supported for categorical data as mentioned previously | ||
clf = xgb.XGBClassifier( | ||
tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False | ||
) | ||
# X is the dataframe we created in previous snippet | ||
clf.fit(X, y) | ||
# Must use JSON for serialization, otherwise the information is lost | ||
clf.save_model("categorical-model.json") | ||
Once training is finished, most of other features can utilize the model. For instance one | ||
can plot the model and calculate the global feature importance: | ||
|
||
|
||
.. code:: python | ||
# Get a graph | ||
graph = xgb.to_graphviz(clf, num_trees=1) | ||
# Or get a matplotlib axis | ||
ax = xgb.plot_tree(reg, num_trees=1) | ||
# Get feature importances | ||
clf.feature_importances_ | ||
The ``scikit-learn`` interface from dask is similar to single node version. The basic | ||
idea is create dataframe with category feature type, and tell XGBoost to use ``gpu_hist`` | ||
with parameter ``enable_categorical``. See `this demo | ||
<https://github.com/dmlc/xgboost/blob/master/demo/guide-python/categorical.py>`_ for a | ||
worked example using categorical data with ``scikit-learn`` interface. For using it with | ||
the Kaggle tutorial dataset, see `<this demo | ||
https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cat_in_the_dat.py>`_ | ||
|
||
|
||
********************** | ||
Using native interface | ||
********************** | ||
|
||
The ``scikit-learn`` interface is user friendly, but lacks some features that are only | ||
available in native interface. For instance users cannot compute SHAP value directly or | ||
use quantized ``DMatrix``. Also native interface supports data types other than | ||
dataframe, like ``numpy/cupy array``. To use the native interface with categorical data, | ||
we need to pass the similar parameter to ``DMatrix`` and the ``train`` function. For | ||
dataframe input: | ||
|
||
.. code:: python | ||
# X is a dataframe we created in previous snippet | ||
Xy = xgb.DMatrix(X, y, enable_categorical=True) | ||
booster = xgb.train({"tree_method": "gpu_hist"}, Xy) | ||
# Must use JSON for serialization, otherwise the information is lost | ||
booster.save_model("categorical-model.json") | ||
SHAP value computation: | ||
|
||
.. code:: python | ||
SHAP = booster.predict(Xy, pred_interactions=True) | ||
# categorical features are listed as "c" | ||
print(booster.feature_types) | ||
For other types of input, like ``numpy array``, we can tell XGBoost about the feature | ||
types by using the ``feature_types`` parameter in :class:`DMatrix <xgboost.DMatrix>`: | ||
|
||
.. code:: python | ||
# "q" is numerical feature, while "c" is categorical feature | ||
ft = ["q", "c", "c"] | ||
X: np.ndarray = load_my_data() | ||
assert X.shape[1] == 3 | ||
Xy = xgb.DMatrix(X, y, feature_types=ft, enable_categorical=True) | ||
For numerical data, the feature type can be ``"q"`` or ``"float"``, while for categorical | ||
feature it's specified as ``"c"``. The Dask module in XGBoost has the same interface so | ||
``dask.Array`` can also be used as categorical data. | ||
|
||
|
||
********** | ||
Next Steps | ||
********** | ||
|
||
As of XGBoost 1.5, the feature is highly experimental and have limited features like CPU | ||
training is not yet supported. Please see `<this issue> | ||
https://github.com/dmlc/xgboost/issues/6503`_ for progress. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters