Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create pylibcudf.Column from a column_view and an arbitrary owning object #17543

Draft
wants to merge 18 commits into
base: branch-25.02
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 112 additions & 6 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ from pylibcudf.libcudf.column.column_factories cimport (
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count
from pylibcudf.libcudf.scalar.scalar cimport scalar
from pylibcudf.column cimport Column as plc_Column

from cudf._lib.scalar cimport DeviceScalar

Expand Down Expand Up @@ -639,15 +640,120 @@ cdef class Column:

dtype = dtype_from_pylibcudf_column(col)

data=as_buffer(
col.data().obj, exposed=data_ptr_exposed
) if col.data() is not None else None
mask=as_buffer(
col.null_mask().obj, exposed=data_ptr_exposed
) if col.null_mask() is not None else None

if hasattr(col.data().obj, "owner"):
size = col.size()
offset = col.offset()
dtype_itemsize = getattr(dtype, "itemsize", 1)

data_ptr = col.data().obj.ptr
mask_ptr = col.null_mask().obj.ptr
Matt711 marked this conversation as resolved.
Show resolved Hide resolved
data = None
base_size = size + offset
data_owner = col.data().obj.owner
mask_owner = col.null_mask().obj.owner
base_nbytes = base_size * dtype_itemsize

is_string_column = (col.type().id() == libcudf_types.type_id.STRING)
if is_string_column:
if col.num_children() == 0:
base_nbytes = 0
else:
# get the size from offset child column (device to host copy)
offsets_column_index = 0
offset_child_column = <plc_Column>col.child(offsets_column_index)
if offset_child_column.size() == 0:
base_nbytes = 0
else:
chars_size = get_element(
offset_child_column.view(),
offset_child_column.size()-1
).value
base_nbytes = chars_size

if (isinstance(data_owner, ExposureTrackedBuffer)):
data = as_buffer(
data=data_ptr,
size=base_nbytes,
owner=data_owner,
exposed=False,
)
elif (
# This is an optimization of the most common case where
# from_column_view creates a "view" that is identical to
# the owner.
isinstance(data_owner, SpillableBuffer) and
# We check that `data_owner` is spill locked (not spillable)
# and that it points to the same memory as `data_ptr`.
not data_owner.spillable and
data_owner.memory_info() == (data_ptr, base_nbytes, "gpu")
):
data = data_owner
else:
# At this point we don't know the relationship between data_ptr
# and data_owner thus we mark both of them exposed.
# TODO: try to discover their relationship and create a
# SpillableBufferSlice instead.
data = as_buffer(
data=data_ptr,
size=base_nbytes,
owner=data_owner,
exposed=True,
)
if isinstance(data_owner, ExposureTrackedBuffer):
# accessing the pointer marks it exposed permanently.
data_owner.mark_exposed()
elif isinstance(data_owner, SpillableBuffer):
if data_owner.is_spilled:
raise ValueError(
f"{data_owner} is spilled, which invalidates "
f"the exposed data_ptr ({hex(data_ptr)})"
)
# accessing the pointer marks it exposed permanently.
data_owner.mark_exposed()

if mask_owner is None:
# if we reached here, it means `owner` is a `Column`
# that does not have a null mask, but `cv` thinks it
# should have a null mask. This can happen in the
# following sequence of events:
#
# 1) `cv` is constructed as a view into a
# `cudf::column` that is nullable (i.e., it has
# a null mask), but contains no nulls.
# 2) `owner`, a `Column`, is constructed from the
# same `cudf::column`. Because `cudf::column`
# is memory owning, `owner` takes ownership of
# the memory owned by the
# `cudf::column`. Because the column has a null
# count of 0, it may choose to discard the null
# mask.
# 3) Now, `cv` points to a discarded null mask.
#
# TL;DR: we should not include a null mask in the
# result:
mask = None
else:
mask = as_buffer(
data=mask_ptr,
size=pylibcudf.null_mask.bitmask_allocation_size_bytes(
base_size
),
owner=mask_owner,
exposed=True
)

return cudf.core.column.build_column(
data=as_buffer(
col.data().obj, exposed=data_ptr_exposed
) if col.data() is not None else None,
data=data,
dtype=dtype,
size=col.size(),
mask=as_buffer(
col.null_mask().obj, exposed=data_ptr_exposed
) if col.null_mask() is not None else None,
mask=mask,
offset=col.offset(),
null_count=col.null_count(),
children=tuple([
Expand Down
5 changes: 4 additions & 1 deletion python/pylibcudf/pylibcudf/column.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ from pylibcudf.libcudf.types cimport bitmask_type, size_type
from .gpumemoryview cimport gpumemoryview
from .types cimport DataType

# Fused type whose specializations are ``Column`` and a generic Python ``object``.
ctypedef fused ColumnOrObject:
Column
object
Comment on lines +16 to +18
Copy link
Contributor Author

@Matt711 Matt711 Dec 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can default a fused typed argument to None

Suggested change
ctypedef fused ColumnOrObject:
Column
object
ctypedef fused OwningTypes:
Column
PackedColumns
object


cdef class Column:
# TODO: Should we document these attributes? Should we mark them readonly?
Expand All @@ -35,7 +38,7 @@ cdef class Column:
cdef Column from_libcudf(unique_ptr[column] libcudf_col)

@staticmethod
cdef Column from_column_view(const column_view& libcudf_col, Column owner)
cdef Column from_column_view(const column_view& libcudf_col, object owner)

cpdef DataType type(self)
cpdef Column child(self, size_type index)
Expand Down
166 changes: 138 additions & 28 deletions python/pylibcudf/pylibcudf/column.pyx
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp.memory cimport make_unique, unique_ptr
from libc.stdint cimport uintptr_t
from libcpp.memory cimport make_unique, unique_ptr, make_shared, shared_ptr
from libcpp.utility cimport move
from pylibcudf.libcudf.column.column cimport column, column_contents
from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar
from pylibcudf.libcudf.scalar.scalar cimport scalar
from pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
from pylibcudf.libcudf.copying cimport get_element as cpp_get_element
from pylibcudf.null_mask cimport bitmask_allocation_size_bytes

from rmm.pylibrmm.device_buffer cimport DeviceBuffer

Expand Down Expand Up @@ -200,37 +204,96 @@ cdef class Column:
)

@staticmethod
cdef Column from_column_view(const column_view& cv, Column owner):
"""Create a Column from a libcudf column_view.

This method accepts shared ownership of the underlying data from the
owner and relies on the offset from the view.

This method is for pylibcudf's functions to use to ingest outputs of
calling libcudf algorithms, and should generally not be needed by users
(even direct pylibcudf Cython users).
"""
cdef Column from_column_view(const column_view& cv, object owner):
column_owner = isinstance(owner, Column)
cdef DataType dtype = DataType.from_libcudf(cv.type())
cdef size_type size = cv.size()
cdef size_type null_count = cv.null_count()

children = []
if cv.num_children() != 0:
for i in range(cv.num_children()):
children.append(
Column.from_column_view(cv.child(i), owner.child(i))
cdef size_type offset = cv.offset()
cdef size_type base_size = size + offset
cdef size_type dtype_itemsize = dtype_itemsize_from_column_view(cv)
cdef base_nbytes = base_size * dtype_itemsize
cdef string_scalar* str_ptr
cdef const scalar* ptr

is_string_column = (cv.type().id() == type_id.STRING)
if is_string_column:
if cv.num_children() == 0:
base_nbytes = 0
else:
# get the size from offset child column (device to host copy)
offsets_column_index = 0
offset_child_column = cv.child(offsets_column_index)
if offset_child_column.size() == 0:
base_nbytes = 0
else:
ptr = (<Scalar> get_element(
offset_child_column, offset_child_column.size()-1
)).get()
str_ptr = <string_scalar*>ptr
base_nbytes = dereference(str_ptr).size()

if column_owner:
children = []
if cv.num_children() != 0:
for i in range(cv.num_children()):
children.append(
Column.from_column_view(cv.child(i), owner.child(i))
)
return Column(
dtype,
size,
(<Column>owner)._data,
(<Column>owner)._mask,
null_count,
offset,
children,
)
else:
if owner is not None:
try:
owner = Column.from_cuda_array_interface_obj(owner)
return Column.from_column_view(cv, owner)
except Exception as e:
raise AttributeError(
"Argument 'owner' must have attribute __cuda_array_interface__"
) from e
else:
data_ptr = <uintptr_t>(cv.head[void]())
mask_ptr = <uintptr_t>(cv.null_mask())
mask = None
if data_ptr:
buffer_size = (
base_nbytes
if is_string_column
else ((size + offset) * dtype_itemsize)
)
data = DeviceBuffer(ptr=data_ptr, size=buffer_size)
else:
data = DeviceBuffer(ptr=data_ptr, size=0)
if mask_ptr:
mask = DeviceBuffer(
ptr=mask_ptr,
size=bitmask_allocation_size_bytes(
base_size
)
)
children = []
if cv.num_children() != 0:
for i in range(cv.num_children()):
children.append(
Column.from_column_view(cv.child(i), None)
)
return Column(
dtype,
size,
data,
mask,
null_count,
offset,
children,
)

return Column(
dtype,
size,
owner._data,
owner._mask,
null_count,
cv.offset(),
children,
)

@staticmethod
def from_scalar(Scalar slr, size_type size):
"""Create a Column from a Scalar.
Expand Down Expand Up @@ -407,6 +470,53 @@ cdef class ListColumnView:
return lists_column_view(self._column.view())


cdef get_element(column_view cv, size_type index):
    """Return the element at position ``index`` of ``cv`` wrapped as a Scalar.

    The libcudf ``get_element`` call runs with the GIL released; the
    resulting ``unique_ptr[scalar]`` is then handed to
    ``Scalar.from_libcudf``.
    """
    cdef unique_ptr[scalar] element
    with nogil:
        element = move(cpp_get_element(cv, index))
    return Scalar.from_libcudf(move(element))


cdef size_type dtype_itemsize_from_lists_column_view(const column_view cv):
    """Return the item size derived from the child column of a list view.

    ``cv`` is wrapped in a ``lists_column_view`` to reach its child column.
    A LIST child is descended into recursively, an EMPTY child yields 1,
    and any other child type is delegated to
    ``dtype_itemsize_from_column_view``.
    """
    # lists_column_view has no default constructor and Cython requires one
    # for stack-allocated C++ objects, so the view is heap allocated and
    # held through a shared_ptr instead.
    cdef shared_ptr[lists_column_view] list_view = (
        make_shared[lists_column_view](cv)
    )
    cdef column_view element_col = dereference(list_view).child()
    cdef type_id element_tid = element_col.type().id()

    if element_tid == type_id.EMPTY:
        return 1
    if element_tid == type_id.LIST:
        return dtype_itemsize_from_lists_column_view(element_col)
    return dtype_itemsize_from_column_view(element_col)


cdef size_type dtype_itemsize_from_column_view(const column_view& cv):
    """Return the per-element size in bytes used to size ``cv``'s data buffer.

    * LIST columns defer to ``dtype_itemsize_from_lists_column_view``.
    * STRUCT columns return the sum of their children's item sizes
      (0 when there are no children).
    * Fixed-width types return their storage width in bytes.
    * Everything else (8-bit types, STRING, EMPTY, DICTIONARY32, ...)
      returns 1.

    Previously only the decimal types reported their real width and every
    other fixed-width type (e.g. INT32, FLOAT64, timestamps) fell through to
    1, which made ``(size + offset) * itemsize`` under-count the bytes of the
    data buffer for those columns.
    """
    cdef type_id tid = cv.type().id()
    cdef size_type total = 0
    cdef size_type i

    if tid == type_id.LIST:
        return dtype_itemsize_from_lists_column_view(cv)

    if tid == type_id.STRUCT:
        for i in range(cv.num_children()):
            total += dtype_itemsize_from_column_view(cv.child(i))
        return total

    # 16-bit storage
    if tid == type_id.INT16 or tid == type_id.UINT16:
        return 2

    # 32-bit storage (day-resolution timestamps/durations are int32-backed)
    if (
        tid == type_id.INT32
        or tid == type_id.UINT32
        or tid == type_id.FLOAT32
        or tid == type_id.TIMESTAMP_DAYS
        or tid == type_id.DURATION_DAYS
        or tid == type_id.DECIMAL32
    ):
        return 4

    # 64-bit storage
    if (
        tid == type_id.INT64
        or tid == type_id.UINT64
        or tid == type_id.FLOAT64
        or tid == type_id.TIMESTAMP_SECONDS
        or tid == type_id.TIMESTAMP_MILLISECONDS
        or tid == type_id.TIMESTAMP_MICROSECONDS
        or tid == type_id.TIMESTAMP_NANOSECONDS
        or tid == type_id.DURATION_SECONDS
        or tid == type_id.DURATION_MILLISECONDS
        or tid == type_id.DURATION_MICROSECONDS
        or tid == type_id.DURATION_NANOSECONDS
        or tid == type_id.DECIMAL64
    ):
        return 8

    # 128-bit storage
    if tid == type_id.DECIMAL128:
        return 16

    # INT8 / UINT8 / BOOL8 are one byte wide; the remaining type ids keep
    # the original fallback of 1.
    return 1
Matt711 marked this conversation as resolved.
Show resolved Hide resolved


@functools.cache
def _datatype_from_dtype_desc(desc):
mapping = {
Expand Down
3 changes: 2 additions & 1 deletion python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type
from pylibcudf.libcudf.table.table_view cimport table_view
from pylibcudf.libcudf.types cimport data_type
from pylibcudf.libcudf.types cimport data_type, size_type


cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
Expand Down Expand Up @@ -54,6 +54,7 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
string_scalar(string st, bool is_valid) except +libcudf_exception_handler
string_scalar(string_scalar other) except +libcudf_exception_handler
string to_string() except +libcudf_exception_handler
size_type size() except +libcudf_exception_handler

cdef cppclass fixed_point_scalar[T](scalar):
fixed_point_scalar() except +libcudf_exception_handler
Expand Down
Loading