diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
index 25edc2435125..b9975736b626 100644
--- a/python-package/xgboost/compat.py
+++ b/python-package/xgboost/compat.py
@@ -129,9 +129,11 @@ class DataTable(object):
 
 try:
     from cudf import DataFrame as CUDF_DataFrame
+    from cudf import Series as CUDF_Series
     CUDF_INSTALLED = True
 except ImportError:
     CUDF_DataFrame = object
+    CUDF_Series = object
     CUDF_INSTALLED = False
 
 # sklearn
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index d2c62b71d3ee..62c356de8187 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -19,7 +19,7 @@
 
 from .compat import (STRING_TYPES, PY3, DataFrame, MultiIndex, py_str,
                      PANDAS_INSTALLED, DataTable,
-                     CUDF_INSTALLED, CUDF_DataFrame,
+                     CUDF_INSTALLED, CUDF_DataFrame, CUDF_Series,
                      os_fspath, os_PathLike)
 from .libpath import find_lib_path
 
@@ -221,26 +221,35 @@ def c_array(ctype, values):
 
 def _use_columnar_initializer(data):
     '''Whether should we use columnar format initializer (pass data in as
-json string). Currently cudf is the only valid option.'''
-    if CUDF_INSTALLED and isinstance(data, CUDF_DataFrame):
+    json string). Currently cudf is the only valid option.'''
+    if CUDF_INSTALLED and (isinstance(data, (CUDF_DataFrame, CUDF_Series))):
         return True
     return False
 
 
+def _extract_interface_from_cudf_series(data):
+    """This returns the array interface from the cudf series. This function should
+    be upstreamed to cudf."""
+    interface = data.__cuda_array_interface__
+    if data.has_null_mask:
+        interface['mask'] = interface['mask'].__cuda_array_interface__
+    return interface
+
+
 def _extract_interface_from_cudf(df, is_info):
-    '''This function should be upstreamed to cudf.'''
+    """This function should be upstreamed to cudf."""
     if not _use_columnar_initializer(df):
         raise ValueError('Only cudf is supported for initializing as json ' +
                          'columnar format. For other libraries please ' +
                          'refer to specific API.')
 
     array_interfaces = []
-    for col in df.columns:
-        data = df[col]
-        interface = data.__cuda_array_interface__
-        if data.has_null_mask:
-            interface['mask'] = interface['mask'].__cuda_array_interface__
-        array_interfaces.append(interface)
+    if isinstance(df, CUDF_DataFrame):
+        for col in df.columns:
+            array_interfaces.append(
+                _extract_interface_from_cudf_series(df[col]))
+    else:
+        array_interfaces.append(_extract_interface_from_cudf_series(df))
 
     if is_info:
         array_interfaces = array_interfaces[0]
diff --git a/tests/python-gpu/test_from_columnar.py b/tests/python-gpu/test_from_columnar.py
index b942446f9ab7..fe993533eec7 100644
--- a/tests/python-gpu/test_from_columnar.py
+++ b/tests/python-gpu/test_from_columnar.py
@@ -27,8 +27,8 @@ def dmatrix_from_cudf(input_type, missing=np.NAN):
     np_label = np.random.randn(kRows).astype(input_type)
     pa_label = pd.DataFrame(np_label)
 
-    cd: cudf.DataFrame = cudf.from_pandas(pa)
-    cd_label: cudf.DataFrame = cudf.from_pandas(pa_label)
+    cd = cudf.from_pandas(pa)
+    cd_label = cudf.from_pandas(pa_label).iloc[:, 0]
 
     dtrain = xgb.DMatrix(cd, missing=missing, label=cd_label)
     assert dtrain.num_col() == kCols