Skip to content

Commit

Permalink
Support more input types for categorical data. (#7220)
Browse files Browse the repository at this point in the history
* Support more input types for categorical data.

* Shorten the type name from "categorical" to "c".
* Tests for np/cp array and scipy csr/csc/coo.
* Specify the type for feature info.
  • Loading branch information
trivialfis authored Sep 16, 2021
1 parent 2942dc6 commit 0ed979b
Show file tree
Hide file tree
Showing 11 changed files with 229 additions and 61 deletions.
3 changes: 2 additions & 1 deletion demo/guide-python/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def make_categorical(

def main() -> None:
# Use builtin categorical data support
# Must be pandas DataFrame or cudf DataFrame with categorical data
# For scikit-learn interface, the input data must be pandas DataFrame or cudf
# DataFrame with categorical features
X, y = make_categorical(100, 10, 4, False)
# Specify `enable_categorical` to True.
reg = xgb.XGBRegressor(tree_method="gpu_hist", enable_categorical=True)
Expand Down
2 changes: 1 addition & 1 deletion include/xgboost/feature_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class FeatureMap {
if (!strcmp("q", tname)) return kQuantitive;
if (!strcmp("int", tname)) return kInteger;
if (!strcmp("float", tname)) return kFloat;
if (!strcmp("categorical", tname)) return kCategorical;
if (!strcmp("c", tname)) return kCategorical;
LOG(FATAL) << "unknown feature type, use i for indicator and q for quantity";
return kIndicator;
}
Expand Down
47 changes: 29 additions & 18 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,8 +518,8 @@ def __init__(
base_margin=None,
missing: Optional[float] = None,
silent=False,
feature_names=None,
feature_types=None,
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
nthread: Optional[int] = None,
group=None,
qid=None,
Expand Down Expand Up @@ -558,8 +558,11 @@ def __init__(
Whether to print messages during construction
feature_names : list, optional
Set names for features.
feature_types : list, optional
Set types for features.
feature_types :
Set types for features. When `enable_categorical` is set to `True`, string
"c" represents categorical data type.
nthread : integer, optional
Number of threads to use for loading data when parallelization is
applicable. If -1, uses maximum threads available on the system.
Expand All @@ -577,11 +580,10 @@ def __init__(
.. versionadded:: 1.3.0
Experimental support of specializing for categorical features. Do
not set to True unless you are interested in development.
Currently it's only available for `gpu_hist` tree method with 1 vs
rest (one hot) categorical split. Also, JSON serialization format,
`gpu_predictor` and pandas input are required.
Experimental support of specializing for categorical features. Do not set to
True unless you are interested in development. Currently it's only available
for `gpu_hist` tree method with 1 vs rest (one hot) categorical split. Also,
JSON serialization format is required.
"""
if group is not None and qid is not None:
Expand Down Expand Up @@ -673,8 +675,8 @@ def set_info(
qid=None,
label_lower_bound=None,
label_upper_bound=None,
feature_names=None,
feature_types=None,
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
feature_weights=None
) -> None:
"""Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`."""
Expand Down Expand Up @@ -945,7 +947,7 @@ def slice(
return res

@property
def feature_names(self) -> List[str]:
def feature_names(self) -> Optional[List[str]]:
"""Get feature names (column labels).
Returns
Expand Down Expand Up @@ -1033,17 +1035,21 @@ def feature_types(self) -> Optional[List[str]]:
return res

@feature_types.setter
def feature_types(self, feature_types: Optional[Union[List[Any], Any]]) -> None:
def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
"""Set feature types (column types).
This is for displaying the results and unrelated
to the learning process.
This is for displaying the results and categorical data support. See doc string
of :py:obj:`xgboost.DMatrix` for details.
Parameters
----------
feature_types : list or None
Labels for features. None will reset existing feature types
"""
# For compatibility reason this function wraps single str input into a list. But
# we should not promote such usage since other than visualization, the field is
# also used for specifying categorical data type.
if feature_types is not None:
if not isinstance(feature_types, (list, str)):
raise TypeError(
Expand Down Expand Up @@ -2461,8 +2467,13 @@ def _validate_features(self, data: DMatrix):

raise ValueError(msg.format(self.feature_names, data.feature_names))

def get_split_value_histogram(self, feature, fmap='', bins=None,
as_pandas=True):
def get_split_value_histogram(
self,
feature: str,
fmap: Union[os.PathLike, str] = '',
bins: Optional[int] = None,
as_pandas: bool = True
):
"""Get split value histogram of a feature
Parameters
Expand Down Expand Up @@ -2510,7 +2521,7 @@ def get_split_value_histogram(self, feature, fmap='', bins=None,
except (ValueError, AttributeError, TypeError):
# None.index: attr err, None[0]: type err, fn.index(-1): value err
feature_t = None
if feature_t == "categorical":
if feature_t == "c": # categorical
raise ValueError(
"Split value historgam doesn't support categorical split."
)
Expand Down
Loading

0 comments on commit 0ed979b

Please sign in to comment.