Skip to content

Commit

Permalink
Fix mixed types with cuDF. (#8280)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Sep 28, 2022
1 parent f835368 commit 6925b22
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 44 deletions.
59 changes: 41 additions & 18 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,72 @@
# pylint: disable=too-many-arguments, too-many-branches, invalid-name
# pylint: disable=too-many-lines, too-many-locals
"""Core XGBoost Library."""
from abc import ABC, abstractmethod
from collections.abc import Mapping
import copy
from typing import List, Optional, Any, Union, Dict, TypeVar
from typing import Callable, Tuple, cast, Sequence, Type, Iterable
import ctypes
import json
import os
import re
import sys
import json
import warnings
from abc import ABC, abstractmethod
from collections.abc import Mapping
from functools import wraps
from inspect import signature, Parameter
from inspect import Parameter, signature
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Union,
cast,
overload,
)

import numpy as np
import scipy.sparse

from .compat import DataFrame, py_str, PANDAS_INSTALLED
from .libpath import find_lib_path
from ._typing import (
CStrPptr,
c_bst_ulong,
_T,
ArrayLike,
BoosterParam,
CFloatPtr,
CNumeric,
DataType,
CNumericPtr,
CStrPptr,
CStrPtr,
CTypeT,
ArrayLike,
CFloatPtr,
NumpyOrCupy,
CupyT,
DataType,
FeatureInfo,
FeatureTypes,
FeatureNames,
_T,
CupyT,
BoosterParam
FeatureTypes,
NumpyOrCupy,
c_bst_ulong,
)
from .compat import PANDAS_INSTALLED, DataFrame, py_str
from .libpath import find_lib_path


class XGBoostError(ValueError):
"""Error thrown by xgboost trainer."""


@overload
def from_pystr_to_cstr(data: str) -> bytes:
...


@overload
def from_pystr_to_cstr(data: List[str]) -> ctypes.Array:
...


def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array]:
"""Convert a Python str or list of Python str to C pointer
Expand Down
60 changes: 38 additions & 22 deletions python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,33 @@
'''Data dispatching for DMatrix.'''
import ctypes
import json
import warnings
import os
from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast
import warnings
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast

import numpy as np

from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix
from .compat import lazy_isinstance, DataFrame
from ._typing import (
c_bst_ulong,
CupyT,
DataType,
FeatureTypes,
FeatureNames,
FeatureTypes,
FloatCompatible,
NumpyDType,
CupyT,
FloatCompatible, PandasDType
PandasDType,
c_bst_ulong,
)
from .compat import DataFrame, lazy_isinstance
from .core import (
_LIB,
DataIter,
DMatrix,
_check_call,
_cuda_array_interface,
_ProxyDMatrix,
c_array,
c_str,
from_pystr_to_cstr,
)

DispatchedDataBackendReturnType = Tuple[
Expand Down Expand Up @@ -631,10 +640,10 @@ def _is_cudf_df(data: DataType) -> bool:


def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list of
data and a list of array interfaces. The data is list of categorical codes that
caller can safely ignore, but have to keep their reference alive until usage of array
interface is finished.
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list
of data and a list of array interfaces. The data is list of categorical codes that
caller can safely ignore, but have to keep their reference alive until usage of
array interface is finished.
"""
try:
Expand All @@ -643,25 +652,27 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
from cudf.utils.dtypes import is_categorical_dtype

interfaces = []

def append(interface: dict) -> None:
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface)

if _is_cudf_ser(data):
if is_categorical_dtype(data.dtype):
interface = cat_codes[0].__cuda_array_interface__
else:
interface = data.__cuda_array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface)
append(interface)
else:
for i, col in enumerate(data):
if is_categorical_dtype(data[col].dtype):
codes = cat_codes[i]
interface = codes.__cuda_array_interface__
else:
interface = data[col].__cuda_array_interface__
if "mask" in interface:
interface["mask"] = interface["mask"].__cuda_array_interface__
interfaces.append(interface)
interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
append(interface)
interfaces_str = from_pystr_to_cstr(json.dumps(interfaces))
return interfaces_str


Expand Down Expand Up @@ -722,9 +733,14 @@ def _transform_cudf_df(
cat_codes.append(codes)
else:
for col in data:
if is_categorical_dtype(data[col].dtype) and enable_categorical:
dtype = data[col].dtype
if is_categorical_dtype(dtype) and enable_categorical:
codes = data[col].cat.codes
cat_codes.append(codes)
elif is_categorical_dtype(dtype):
raise ValueError(_ENABLE_CAT_ERR)
else:
cat_codes.append([])

return data, cat_codes, feature_names, feature_types

Expand Down
28 changes: 24 additions & 4 deletions tests/python-gpu/test_from_cudf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
import sys

import numpy as np
import xgboost as xgb
import sys
import pytest

sys.path.append("tests/python")
Expand Down Expand Up @@ -176,20 +178,38 @@ def test_cudf_metainfo_device_dmatrix(self):
_test_cudf_metainfo(xgb.DeviceQuantileDMatrix)

@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_categorical(self):
def test_cudf_categorical(self) -> None:
import cudf
_X, _y = tm.make_categorical(100, 30, 17, False)
n_features = 30
_X, _y = tm.make_categorical(100, n_features, 17, False)
X = cudf.from_pandas(_X)
y = cudf.from_pandas(_y)

Xy = xgb.DMatrix(X, y, enable_categorical=True)
assert Xy.feature_types is not None
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "c" for t in Xy.feature_types)

Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
assert Xy.feature_types is not None
assert len(Xy.feature_types) == X.shape[1]
assert all(t == "c" for t in Xy.feature_types)

# mixed dtypes
X["1"] = X["1"].astype(np.int64)
X["3"] = X["3"].astype(np.int64)
df, cat_codes, _, _ = xgb.data._transform_cudf_df(
X, None, None, enable_categorical=True
)
assert X.shape[1] == n_features
assert len(cat_codes) == X.shape[1]
assert not cat_codes[0]
assert not cat_codes[2]

interfaces_str = xgb.data._cudf_array_interfaces(df, cat_codes)
interfaces = json.loads(interfaces_str)
assert len(interfaces) == X.shape[1]

# test missing value
X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
X["f0"] = X["f0"].astype("category")
Expand All @@ -206,7 +226,7 @@ def test_cudf_categorical(self):
assert Xy.num_row() == 3
assert Xy.num_col() == 1

with pytest.raises(ValueError):
with pytest.raises(ValueError, match="enable_categorical"):
xgb.DeviceQuantileDMatrix(X, y)

Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
Expand Down

0 comments on commit 6925b22

Please sign in to comment.