-
Notifications
You must be signed in to change notification settings - Fork 916
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Remove cudf._lib.interop in favor of inlining pylibcudf #17555
Changes from 3 commits
a5923d7
70379d2
dada097
add48f0
f706244
cbba8bd
deed917
0852374
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,6 @@ | |
copying, | ||
csv, | ||
groupby, | ||
interop, | ||
nvtext, | ||
parquet, | ||
reduce, | ||
|
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -282,6 +282,7 @@ def dropna(self) -> Self: | |
else: | ||
return self.copy() | ||
|
||
@acquire_spill_lock() | ||
def to_arrow(self) -> pa.Array: | ||
"""Convert to PyArrow Array | ||
|
||
|
@@ -298,9 +299,7 @@ def to_arrow(self) -> pa.Array: | |
4 | ||
] | ||
""" | ||
return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ | ||
"None" | ||
].chunk(0) | ||
return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0) | ||
|
||
@classmethod | ||
def from_arrow(cls, array: pa.Array) -> ColumnBase: | ||
|
@@ -333,30 +332,20 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: | |
elif isinstance(array.type, ArrowIntervalType): | ||
return cudf.core.column.IntervalColumn.from_arrow(array) | ||
|
||
data = pa.table([array], [None]) | ||
if isinstance(array, pa.ChunkedArray): | ||
array = array.combine_chunks() | ||
|
||
if isinstance(array.type, pa.DictionaryType): | ||
indices_table = pa.table( | ||
{ | ||
"None": pa.chunked_array( | ||
[chunk.indices for chunk in data["None"].chunks], | ||
type=array.type.index_type, | ||
) | ||
} | ||
) | ||
dictionaries_table = pa.table( | ||
{ | ||
"None": pa.chunked_array( | ||
[chunk.dictionary for chunk in data["None"].chunks], | ||
type=array.type.value_type, | ||
) | ||
} | ||
) | ||
|
||
codes = libcudf.interop.from_arrow(indices_table)[0] | ||
categories = libcudf.interop.from_arrow(dictionaries_table)[0] | ||
with acquire_spill_lock(): | ||
codes = cls.from_pylibcudf( | ||
plc.interop.from_arrow(array.indices) | ||
) | ||
categories = cls.from_pylibcudf( | ||
plc.interop.from_arrow(array.dictionary) | ||
) | ||
codes = cudf.core.column.categorical.as_unsigned_codes( | ||
len(categories), codes | ||
len(categories), | ||
codes, # type: ignore[arg-type] | ||
) | ||
return cudf.core.column.CategoricalColumn( | ||
data=None, | ||
|
@@ -367,10 +356,12 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: | |
mask=codes.base_mask, | ||
children=(codes,), | ||
) | ||
|
||
result = libcudf.interop.from_arrow(data)[0] | ||
|
||
return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) | ||
else: | ||
result = cls.from_pylibcudf(plc.interop.from_arrow(array)) | ||
# TODO: cudf_dtype_from_pa_type may be less necessary for some types | ||
return result._with_type_metadata( | ||
cudf_dtype_from_pa_type(array.type) | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the kind of thing I was thinking of in the other PR. Having a standardized entrypoint of some sort (maybe per-class?) into pylibcudf from cudf Python would help us collect common functionality like |
||
|
||
@acquire_spill_lock() | ||
def _get_mask_as_column(self) -> ColumnBase: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we always have to combine chunks? IIRC the existing implementation works without combining in most cases, and I don't think combining is free performance-wise so we should avoid it if we can. I could be wrong though, or misremembering an earlier state of the code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah right. Yeah this will make a copy on the CPU side.
I see now in libcudf side we only support returning tables (and not columns) from an arrow stream. I was hoping to avoid the dance of putting the chunked array in a pyarrow table but I think the dance is worth avoiding a CPU copy
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks yeah I think this is the right call for now. We could generalize the libcudf APIs in the future if that helps.