Skip to content

Commit

Permalink
Add changes for early experimental support for dataframe interchange protocol API (#5591)
Browse files: browse the repository at this point in the history

This PR adds a few small changes that let us experiment with the `__dataframe__` interchange protocol API, with the goal of improving support for dataframe objects and potentially simplifying how we manage them in a future version. The changes should not affect existing code paths for cuDF and pandas objects in release 23.10.

Authors:
  - Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
  - Simon Adorf (https://github.com/csadorf)

URL: #5591
  • Loading branch information
dantegd authored Oct 3, 2023
1 parent 3c4ceb9 commit 83e6f4c
Showing 1 changed file with 46 additions and 12 deletions.
58 changes: 46 additions & 12 deletions python/cuml/internals/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,19 @@
"numba.cuda", "is_cuda_array", alt=return_false
)

# Names pulled from cupy, numpy and numba.cuda through the guarded import
# helpers.
# NOTE(review): presumably each helper yields a placeholder when the
# corresponding GPU/CPU package is unavailable -- confirm against the helper
# definitions, which are not visible in this excerpt.
cp_ndarray = gpu_only_import_from("cupy", "ndarray")
np_ndarray = cpu_only_import_from("numpy", "ndarray")
numba_devicearray = gpu_only_import_from("numba.cuda", "devicearray")

# Owner types that the host-output path of `to_output` checks with
# isinstance(): when the owner is one of these, the array wrapper itself
# (rather than its owner) is passed to cp.asarray.
_specific_supported_types = (
np_ndarray,
cp_ndarray,
CudfSeries,
CudfDataFrame,
PandasSeries,
PandasDataFrame,
)


def _order_to_strides(order, shape, dtype):
"""
Expand Down Expand Up @@ -219,23 +232,21 @@ def __init__(
mem_type = MemoryType.from_str(mem_type)
self._mem_type = mem_type

# Coerce data into an array interface and determine mem_type and owner
# if necessary
try:
if hasattr(data, "__cuda_array_interface__"):
self._array_interface = data.__cuda_array_interface__
if mem_type in (None, MemoryType.mirror):
self._mem_type = MemoryType.device
self._owner = data
except AttributeError: # Not a Cuda array object
try:
else: # Not a CUDA array object
if hasattr(data, "__array_interface__"):
self._array_interface = data.__array_interface__
self._mem_type = MemoryType.host
self._owner = data
except AttributeError: # Must construct array interface
else: # Must construct array interface
if dtype is None:
try:
if hasattr(data, "dtype"):
dtype = data.dtype
except AttributeError:
else:
raise ValueError(
"Must specify dtype when data is passed as a"
" {}".format(type(data))
Expand Down Expand Up @@ -618,8 +629,23 @@ def to_output(
return np.asarray(
self, dtype=output_dtype, order=self.order
)
if isinstance(
self._owner, _specific_supported_types
) or "cuml" in str(type(self._owner)):
cp_arr = cp.asarray(
self, dtype=output_dtype, order=self.order
)
else:
if self._owner is not None:
cp_arr = cp.asarray(
self._owner, dtype=output_dtype, order=self.order
)
else:
cp_arr = cp.asarray(
self, dtype=output_dtype, order=self.order
)
return cp.asnumpy(
cp.asarray(self, dtype=output_dtype, order=self.order),
cp_arr,
order=self.order,
)
return output_mem_type.xpy.asarray(
Expand Down Expand Up @@ -1057,7 +1083,10 @@ def from_input(
elif convert_to_mem_type is MemoryType.device and isinstance(
index, PandasIndex
):
index = CudfIndex.from_pandas(index)
try:
index = CudfIndex.from_pandas(index)
except TypeError:
index = CudfIndex(index)

if isinstance(X, CudfSeries):
if X.null_count != 0:
Expand All @@ -1066,10 +1095,15 @@ def from_input(
"which are not supported by cuML."
)

if isinstance(X, (PandasDataFrame, PandasSeries)):
X = X.to_numpy(copy=False)
if isinstance(X, CudfDataFrame):
X = X.to_cupy(copy=False)
elif isinstance(X, (PandasDataFrame, PandasSeries)):
X = X.to_numpy(copy=False)
elif hasattr(X, "__dataframe__"):
# temporarily use this codepath to avoid errors, substitute
# usage of dataframe interchange protocol once ready.
X = X.to_numpy()
deepcopy = False

requested_order = (order, None)[fail_on_order]
arr = cls(X, index=index, order=requested_order, validate=False)
Expand Down

0 comments on commit 83e6f4c

Please sign in to comment.