Skip to content

Commit

Permalink
Add support for cudf.Series (#4891)
Browse files Browse the repository at this point in the history
  • Loading branch information
VibhuJawa authored and trivialfis committed Sep 26, 2019
1 parent 82ee231 commit 2fa8b35
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 12 deletions.
2 changes: 2 additions & 0 deletions python-package/xgboost/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,11 @@ class DataTable(object):

try:
from cudf import DataFrame as CUDF_DataFrame
from cudf import Series as CUDF_Series
CUDF_INSTALLED = True
except ImportError:
CUDF_DataFrame = object
CUDF_Series = object
CUDF_INSTALLED = False

# sklearn
Expand Down
29 changes: 19 additions & 10 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
PANDAS_INSTALLED, DataTable,
CUDF_INSTALLED, CUDF_DataFrame,
CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series,
os_fspath, os_PathLike)
from .libpath import find_lib_path

Expand Down Expand Up @@ -243,26 +243,35 @@ def c_array(ctype, values):

def _use_columnar_initializer(data):
'''Whether should we use columnar format initializer (pass data in as
json string). Currently cudf is the only valid option.'''
if CUDF_INSTALLED and isinstance(data, CUDF_DataFrame):
json string). Currently cudf is the only valid option.'''
if CUDF_INSTALLED and (isinstance(data, (CUDF_DataFrame, CUDF_Series))):
return True
return False


def _extract_interface_from_cudf_series(data):
    """Return the CUDA array interface of a cudf series.

    When the series has a null mask, the ``mask`` entry of its interface is
    an object that itself exposes ``__cuda_array_interface__``; that entry
    is replaced by the nested interface.  This function should be upstreamed
    to cudf.

    Parameters
    ----------
    data :
        Object providing ``__cuda_array_interface__`` and ``has_null_mask``
        (a cudf series).

    Returns
    -------
    interface : dict
        Shallow copy of the series' array interface, with ``mask`` resolved
        to the mask's own interface when ``has_null_mask`` is true.
    """
    # Work on a copy: writing into the mapping handed out by ``data`` would
    # modify the caller's object, and a provider that reuses one mapping
    # would then fail on the next call (the mask would already be a dict).
    interface = dict(data.__cuda_array_interface__)
    if data.has_null_mask:
        interface['mask'] = interface['mask'].__cuda_array_interface__
    return interface


def _extract_interface_from_cudf(df, is_info):
'''This function should be upstreamed to cudf.'''
"""This function should be upstreamed to cudf."""
if not _use_columnar_initializer(df):
raise ValueError('Only cudf is supported for initializing as json ' +
'columnar format. For other libraries please ' +
'refer to specific API.')

array_interfaces = []
for col in df.columns:
data = df[col]
interface = data.__cuda_array_interface__
if data.has_null_mask:
interface['mask'] = interface['mask'].__cuda_array_interface__
array_interfaces.append(interface)
if isinstance(df, CUDF_DataFrame):
for col in df.columns:
array_interfaces.append(
_extract_interface_from_cudf_series(df[col]))
else:
array_interfaces.append(_extract_interface_from_cudf_series(df))

if is_info:
array_interfaces = array_interfaces[0]
Expand Down
4 changes: 2 additions & 2 deletions tests/python-gpu/test_from_columnar.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ def dmatrix_from_cudf(input_type, missing=np.NAN):
np_label = np.random.randn(kRows).astype(input_type)
pa_label = pd.DataFrame(np_label)

cd: cudf.DataFrame = cudf.from_pandas(pa)
cd_label: cudf.DataFrame = cudf.from_pandas(pa_label)
cd = cudf.from_pandas(pa)
cd_label = cudf.from_pandas(pa_label).iloc[:, 0]

dtrain = xgb.DMatrix(cd, missing=missing, label=cd_label)
assert dtrain.num_col() == kCols
Expand Down

0 comments on commit 2fa8b35

Please sign in to comment.