rapidsai · Matt711 · Dec 6, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
@@ -47,6 +47,7 @@ from pylibcudf.libcudf.column.column_factories cimport (
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count
 from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.column cimport Column as plc_Column
 
 from cudf._lib.scalar cimport DeviceScalar
 
@@ -639,15 +640,122 @@ cdef class Column:
 
         dtype = dtype_from_pylibcudf_column(col)
 
+        data=as_buffer(
+            col.data().obj, exposed=data_ptr_exposed
+        ) if col.data() is not None else None
+        mask=as_buffer(
+            col.null_mask().obj, exposed=data_ptr_exposed
+        ) if col.null_mask() is not None else None
+
+        if not mask and not data and (
+            hasattr(col.data(), "obj") and hasattr(col.data().obj, "owner")
+        ):
+            size = col.size()
+            offset = col.offset()
+            dtype_itemsize = getattr(dtype, "itemsize", 1)
+
+            data_ptr = col.data().obj.__cuda_array_interface__["data"][0]
+            mask_ptr = col.null_mask().obj.__cuda_array_interface__["data"][0]
+            data = None
+            base_size = size + offset
+            data_owner = col.data().obj.owner
+            mask_owner = col.null_mask().obj.owner
+            base_nbytes = base_size * dtype_itemsize
+
+            is_string_column = (col.type().id() == libcudf_types.type_id.STRING)
+            if is_string_column:
+                if col.num_children() == 0:
+                    base_nbytes = 0
+                else:
+                    # get the size from offset child column (device to host copy)
+                    offsets_column_index = 0
+                    offset_child_column = <plc_Column>col.child(offsets_column_index)
+                    if offset_child_column.size() == 0:
+                        base_nbytes = 0
+                    else:
+                        chars_size = get_element(
+                            offset_child_column.view(),
+                            offset_child_column.size()-1
+                        ).value
+                        base_nbytes = chars_size
+
+            if (isinstance(data_owner, ExposureTrackedBuffer)):
+                data = as_buffer(
+                    data=data_ptr,
+                    size=base_nbytes,
+                    owner=data_owner,
+                    exposed=False,
+                )
+            elif (
+                # This is an optimization of the most common case where
+                # from_column_view creates a "view" that is identical to
+                # the owner.
+                isinstance(data_owner, SpillableBuffer) and
+                # We check that `data_owner` is spill locked (not spillable)
+                # and that it points to the same memory as `data_ptr`.
+                not data_owner.spillable and
+                data_owner.memory_info() == (data_ptr, base_nbytes, "gpu")
+            ):
+                data = data_owner
+            else:
+                # At this point we don't know the relationship between data_ptr
+                # and data_owner thus we mark both of them exposed.
+                # TODO: try to discover their relationship and create a
+                #       SpillableBufferSlice instead.
+                data = as_buffer(
+                    data=data_ptr,
+                    size=base_nbytes,
+                    owner=data_owner,
+                    exposed=True,
+                )
+                if isinstance(data_owner, ExposureTrackedBuffer):
+                    # accessing the pointer marks it exposed permanently.
+                    data_owner.mark_exposed()
+                elif isinstance(data_owner, SpillableBuffer):
+                    if data_owner.is_spilled:
+                        raise ValueError(
+                            f"{data_owner} is spilled, which invalidates "
+                            f"the exposed data_ptr ({hex(data_ptr)})"
+                        )
+                    # accessing the pointer marks it exposed permanently.
+                    data_owner.mark_exposed()
+
+            if mask_owner is None:
+                # if we reached here, it means `owner` is a `Column`
+                # that does not have a null mask, but `cv` thinks it
+                # should have a null mask. This can happen in the
+                # following sequence of events:
+                #
+                # 1) `cv` is constructed as a view into a
+                #    `cudf::column` that is nullable (i.e., it has
+                #    a null mask), but contains no nulls.
+                # 2) `owner`, a `Column`, is constructed from the
+                #    same `cudf::column`. Because `cudf::column`
+                #    is memory owning, `owner` takes ownership of
+                #    the memory owned by the
+                #    `cudf::column`. Because the column has a null
+                #    count of 0, it may choose to discard the null
+                #    mask.
+                # 3) Now, `cv` points to a discarded null mask.
+                #
+                # TL;DR: we should not include a null mask in the
+                # result:
+                mask = None
+            else:
+                mask = as_buffer(
+                    data=mask_ptr,
+                    size=pylibcudf.null_mask.bitmask_allocation_size_bytes(
+                        base_size
+                    ),
+                    owner=mask_owner,
+                    exposed=True
+                )
+
         return cudf.core.column.build_column(
-            data=as_buffer(
-                col.data().obj, exposed=data_ptr_exposed
-            ) if col.data() is not None else None,
+            data=data,
             dtype=dtype,
             size=col.size(),
-            mask=as_buffer(
-                col.null_mask().obj, exposed=data_ptr_exposed
-            ) if col.null_mask() is not None else None,
+            mask=mask,
             offset=col.offset(),
             null_count=col.null_count(),
             children=tuple([

@@ -344,8 +344,11 @@ cdef columns_from_table_view(
     """
 
     return [
-        Column.from_column_view(
-            tv.column(i), owners[i] if isinstance(owners, list) else None
+        Column.from_pylibcudf(
+            plc_Column.from_column_view(
+                tv.column(i),
+                owners[i] if isinstance(owners, list) else None
+            )
         ) for i in range(tv.num_columns())
     ]
 
@@ -376,9 +379,11 @@ cdef data_from_table_view(
             if table_owner:
                 column_owner = owner._index._columns[column_idx]
             index_columns.append(
-                Column.from_column_view(
-                    tv.column(column_idx),
-                    column_owner
+                Column.from_pylibcudf(
+                    plc_Column.from_column_view(
+                        tv.column(column_idx),
+                        column_owner
+                    )
                 )
             )
             column_idx += 1
@@ -393,7 +398,12 @@ cdef data_from_table_view(
         if table_owner:
             column_owner = owner._columns[source_column_idx]
         data_columns.append(
-            Column.from_column_view(tv.column(column_idx), column_owner)
+            Column.from_pylibcudf(
+                plc_Column.from_column_view(
+                        tv.column(column_idx),
+                        column_owner
+                    )
+            )
         )
         column_idx += 1
         source_column_idx += 1

@@ -59,7 +59,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
 )
-
+set_source_files_properties(contiguous_split.cxx PROPERTIES COMPILE_OPTIONS "-g;-O0")
 target_include_directories(pylibcudf_interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 
 include(${rapids-cmake-dir}/export/find_package_root.cmake)

@@ -13,6 +13,9 @@ from pylibcudf.libcudf.types cimport bitmask_type, size_type
 from .gpumemoryview cimport gpumemoryview
 from .types cimport DataType
 
+ctypedef fused ColumnOrObject:
+    Column
+    object
-ctypedef fused ColumnOrObject:
-    Column
-    object
+ctypedef fused OwningTypes:
+    Column
+    PackedColumns
+    object
-ctypedef fused ColumnOrObject:
-    Column
-    object
+ctypedef fused OwningTypes:
+    Column
+    PackedColumns
+    object
 
 cdef class Column:
     # TODO: Should we document these attributes? Should we mark them readonly?
@@ -35,7 +38,7 @@ cdef class Column:
     cdef Column from_libcudf(unique_ptr[column] libcudf_col)
 
     @staticmethod
-    cdef Column from_column_view(const column_view& libcudf_col, Column owner)
+    cdef Column from_column_view(const column_view& libcudf_col, object owner)
 
     cpdef DataType type(self)
     cpdef Column child(self, size_type index)