Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create pylibcudf.Column from a column_view and an arbitrary owning object #17543

Draft
wants to merge 18 commits into
base: branch-25.02
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 114 additions & 6 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ from pylibcudf.libcudf.column.column_factories cimport (
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count
from pylibcudf.libcudf.scalar.scalar cimport scalar
from pylibcudf.column cimport Column as plc_Column

from cudf._lib.scalar cimport DeviceScalar

Expand Down Expand Up @@ -639,15 +640,122 @@ cdef class Column:

dtype = dtype_from_pylibcudf_column(col)

data=as_buffer(
col.data().obj, exposed=data_ptr_exposed
) if col.data() is not None else None
mask=as_buffer(
col.null_mask().obj, exposed=data_ptr_exposed
) if col.null_mask() is not None else None

if not mask and not data and (
hasattr(col.data(), "obj") and hasattr(col.data().obj, "owner")
):
size = col.size()
offset = col.offset()
dtype_itemsize = getattr(dtype, "itemsize", 1)

data_ptr = col.data().obj.__cuda_array_interface__["data"][0]
mask_ptr = col.null_mask().obj.__cuda_array_interface__["data"][0]
data = None
base_size = size + offset
data_owner = col.data().obj.owner
mask_owner = col.null_mask().obj.owner
base_nbytes = base_size * dtype_itemsize

is_string_column = (col.type().id() == libcudf_types.type_id.STRING)
if is_string_column:
if col.num_children() == 0:
base_nbytes = 0
else:
# get the size from offset child column (device to host copy)
offsets_column_index = 0
offset_child_column = <plc_Column>col.child(offsets_column_index)
if offset_child_column.size() == 0:
base_nbytes = 0
else:
chars_size = get_element(
offset_child_column.view(),
offset_child_column.size()-1
).value
base_nbytes = chars_size

if (isinstance(data_owner, ExposureTrackedBuffer)):
data = as_buffer(
data=data_ptr,
size=base_nbytes,
owner=data_owner,
exposed=False,
)
elif (
# This is an optimization of the most common case where
# from_column_view creates a "view" that is identical to
# the owner.
isinstance(data_owner, SpillableBuffer) and
# We check that `data_owner` is spill locked (not spillable)
# and that it points to the same memory as `data_ptr`.
not data_owner.spillable and
data_owner.memory_info() == (data_ptr, base_nbytes, "gpu")
):
data = data_owner
else:
# At this point we don't know the relationship between data_ptr
# and data_owner thus we mark both of them exposed.
# TODO: try to discover their relationship and create a
# SpillableBufferSlice instead.
data = as_buffer(
data=data_ptr,
size=base_nbytes,
owner=data_owner,
exposed=True,
)
if isinstance(data_owner, ExposureTrackedBuffer):
# accessing the pointer marks it exposed permanently.
data_owner.mark_exposed()
elif isinstance(data_owner, SpillableBuffer):
if data_owner.is_spilled:
raise ValueError(
f"{data_owner} is spilled, which invalidates "
f"the exposed data_ptr ({hex(data_ptr)})"
)
# accessing the pointer marks it exposed permanently.
data_owner.mark_exposed()

if mask_owner is None:
# if we reached here, it means `owner` is a `Column`
# that does not have a null mask, but `cv` thinks it
# should have a null mask. This can happen in the
# following sequence of events:
#
# 1) `cv` is constructed as a view into a
# `cudf::column` that is nullable (i.e., it has
# a null mask), but contains no nulls.
# 2) `owner`, a `Column`, is constructed from the
# same `cudf::column`. Because `cudf::column`
# is memory owning, `owner` takes ownership of
# the memory owned by the
# `cudf::column`. Because the column has a null
# count of 0, it may choose to discard the null
# mask.
# 3) Now, `cv` points to a discarded null mask.
#
# TL;DR: we should not include a null mask in the
# result:
mask = None
else:
mask = as_buffer(
data=mask_ptr,
size=pylibcudf.null_mask.bitmask_allocation_size_bytes(
base_size
),
owner=mask_owner,
exposed=True
)

return cudf.core.column.build_column(
data=as_buffer(
col.data().obj, exposed=data_ptr_exposed
) if col.data() is not None else None,
data=data,
dtype=dtype,
size=col.size(),
mask=as_buffer(
col.null_mask().obj, exposed=data_ptr_exposed
) if col.null_mask() is not None else None,
mask=mask,
offset=col.offset(),
null_count=col.null_count(),
children=tuple([
Expand Down
22 changes: 16 additions & 6 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -344,8 +344,11 @@ cdef columns_from_table_view(
"""

return [
Column.from_column_view(
tv.column(i), owners[i] if isinstance(owners, list) else None
Column.from_pylibcudf(
plc_Column.from_column_view(
tv.column(i),
owners[i] if isinstance(owners, list) else None
)
) for i in range(tv.num_columns())
]

Expand Down Expand Up @@ -376,9 +379,11 @@ cdef data_from_table_view(
if table_owner:
column_owner = owner._index._columns[column_idx]
index_columns.append(
Column.from_column_view(
tv.column(column_idx),
column_owner
Column.from_pylibcudf(
plc_Column.from_column_view(
tv.column(column_idx),
column_owner
)
)
)
column_idx += 1
Expand All @@ -393,7 +398,12 @@ cdef data_from_table_view(
if table_owner:
column_owner = owner._columns[source_column_idx]
data_columns.append(
Column.from_column_view(tv.column(column_idx), column_owner)
Column.from_pylibcudf(
plc_Column.from_column_view(
tv.column(column_idx),
column_owner
)
)
)
column_idx += 1
source_column_idx += 1
Expand Down
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)

set_source_files_properties(contiguous_split.cxx PROPERTIES COMPILE_OPTIONS "-g;-O0")
target_include_directories(pylibcudf_interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")

include(${rapids-cmake-dir}/export/find_package_root.cmake)
Expand Down
5 changes: 4 additions & 1 deletion python/pylibcudf/pylibcudf/column.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ from pylibcudf.libcudf.types cimport bitmask_type, size_type
from .gpumemoryview cimport gpumemoryview
from .types cimport DataType

ctypedef fused ColumnOrObject:
Column
object
Comment on lines +16 to +18
Copy link
Contributor Author

@Matt711 Matt711 Dec 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can default a fused typed argument to None

Suggested change
ctypedef fused ColumnOrObject:
Column
object
ctypedef fused OwningTypes:
Column
PackedColumns
object


cdef class Column:
# TODO: Should we document these attributes? Should we mark them readonly?
Expand All @@ -35,7 +38,7 @@ cdef class Column:
cdef Column from_libcudf(unique_ptr[column] libcudf_col)

@staticmethod
cdef Column from_column_view(const column_view& libcudf_col, Column owner)
cdef Column from_column_view(const column_view& libcudf_col, object owner)

cpdef DataType type(self)
cpdef Column child(self, size_type index)
Expand Down
Loading
Loading