From 11dfd16e988044df9f58e1408e8e81c4064defe8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Sep 2024 17:37:12 -0700 Subject: [PATCH 1/4] Add labeling APIs to pylibcudf --- python/cudf/cudf/_lib/labeling.pyx | 40 ++++--------- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 1 + python/pylibcudf/pylibcudf/__init__.py | 3 + python/pylibcudf/pylibcudf/labeling.pxd | 13 ++++ python/pylibcudf/pylibcudf/labeling.pyx | 59 +++++++++++++++++++ .../pylibcudf/pylibcudf/libcudf/labeling.pxd | 8 +-- .../pylibcudf/tests/test_labeling.py | 20 +++++++ 8 files changed, 111 insertions(+), 34 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/labeling.pxd create mode 100644 python/pylibcudf/pylibcudf/labeling.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_labeling.py diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx index 2e1959a348d..3966cce8981 100644 --- a/python/cudf/cudf/_lib/labeling.pyx +++ b/python/cudf/cudf/_lib/labeling.pyx @@ -1,16 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - from libcpp cimport bool as cbool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.labeling cimport inclusive, label_bins as cpp_label_bins +import pylibcudf as plc from cudf._lib.column cimport Column +from cudf.core.buffer import acquire_spill_lock # Note that the parameter input shadows a Python built-in in the local scope, @@ -19,26 +14,11 @@ from cudf._lib.column cimport Column @acquire_spill_lock() def label_bins(Column input, Column left_edges, cbool left_inclusive, Column right_edges, cbool right_inclusive): - cdef inclusive c_left_inclusive = \ - inclusive.YES if left_inclusive else inclusive.NO - cdef inclusive c_right_inclusive = \ - inclusive.YES if right_inclusive else inclusive.NO - - cdef column_view input_view = input.view() - cdef column_view left_edges_view = left_edges.view() - cdef column_view right_edges_view = right_edges.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_label_bins( - input_view, - left_edges_view, - c_left_inclusive, - right_edges_view, - c_right_inclusive, - ) - ) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.labeling.label_bins( + input.to_pylibcudf(mode="read"), + left_edges.to_pylibcudf(mode="read"), + left_inclusive, + right_edges.to_pylibcudf(mode="read"), + right_inclusive + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a4f17344cb0..f07c8897e34 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -27,6 +27,7 @@ set(cython_sources groupby.pyx interop.pyx join.pyx + labeling.pyx lists.pyx merge.pyx null_mask.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index 841efa59bda..b7cf6413c05 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -13,6 +13,7 @@ from . cimport ( filling, groupby, join, + labeling, lists, merge, null_mask, diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index d3878a89a6a..84b1c29f791 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -24,6 +24,7 @@ interop, io, join, + labeling, lists, merge, null_mask, @@ -67,7 +68,9 @@ "gpumemoryview", "groupby", "interop", + "io", "join", + "labeling", "lists", "merge", "null_mask", diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd new file mode 100644 index 00000000000..0415178b3a9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/labeling.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool + +from .column cimport Column + + +cpdef Column label_bins( + Column input, + Column left_edges, + bool left_inclusive, + Column right_edges, + bool right_inclusive +) diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx new file mode 100644 index 00000000000..8c269b51ca4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -0,0 +1,59 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf cimport labeling as cpp_labeling +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.labeling cimport inclusive + +from .column cimport Column + + +cpdef Column label_bins( + Column input, + Column left_edges, + bool left_inclusive, + Column right_edges, + bool right_inclusive +): + """Labels elements based on membership in the specified bins. + + Parameters + ---------- + input : Column + Column of input elements to label according to the specified bins. + left_edges : Column + Column of the left edge of each bin. + left_inclusive : bool + Whether or not the left edge is inclusive. + right_edges : Column + Column of the right edge of each bin. + right_inclusive : bool + Whether or not the right edge is inclusive. + + Returns + ------- + Column + Column of integer labels of the elements in `input` + according to the specified bins. + """ + cdef unique_ptr[column] c_result + cdef inclusive c_left_inclusive = ( + inclusive.YES if left_inclusive else inclusive.NO + ) + cdef inclusive c_right_inclusive = ( + inclusive.YES if right_inclusive else inclusive.NO + ) + + with nogil: + c_result = move( + cpp_labeling.label_bins( + input.view(), + left_edges.view(), + c_left_inclusive, + right_edges.view(), + c_right_inclusive, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd index ec6ef6b2a41..026ea0df9ef 100644 --- a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd @@ -1,14 +1,14 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - +from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil: - ctypedef enum inclusive: - YES "cudf::inclusive::YES" - NO "cudf::inclusive::NO" + cpdef enum class inclusive(bool): + YES + NO cdef unique_ptr[column] label_bins ( const column_view &input, diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py new file mode 100644 index 00000000000..0a956eab70b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest + + +@pytest.mark.parametrize("left_inclusive", [True, False]) +@pytest.mark.parametrize("right_inclusive", [True, False]) +def test_label_bins(left_inclusive, right_inclusive): + in_col = plc.interop.from_arrow(pa.array([1, 2, 3])) + left_edges = plc.interop.from_arrow(pa.array([0, 5])) + right_edges = plc.interop.from_arrow(pa.array([4, 6])) + result = plc.interop.to_arrow( + plc.labeling.label_bins( + in_col, left_edges, left_inclusive, right_edges, right_inclusive + ) + ) + expected = pa.chunked_array([[0, 0, 0]], type=pa.int32()) + assert result.equals(expected) From 26a2ac5057db95092b3c51145c0543e81cf090cc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Sep 2024 14:21:01 -0700 Subject: [PATCH 2/4] int instead of bool --- python/pylibcudf/pylibcudf/libcudf/labeling.pxd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd index 026ea0df9ef..400c4282f7a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd @@ -1,12 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp cimport bool +from libcpp cimport int from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil: - cpdef enum class inclusive(bool): + cpdef enum class inclusive(int): YES NO From 22b9e547edc046a848a8631b33403e835e8989ea Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:32:04 -0700 Subject: [PATCH 3/4] Add Inclusive enum, update enum docs --- docs/cudf/source/developer_guide/pylibcudf.md | 17 +++++++++-------- python/pylibcudf/pylibcudf/labeling.pxd | 1 + python/pylibcudf/pylibcudf/labeling.pyx | 10 ++++++++-- .../pylibcudf/pylibcudf/libcudf/CMakeLists.txt | 4 ++-- python/pylibcudf/pylibcudf/libcudf/labeling.pyx | 0 .../pylibcudf/pylibcudf/tests/test_labeling.py | 5 +++++ 6 files changed, 25 insertions(+), 12 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/libcudf/labeling.pyx diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md index 4e10459fe2b..39840e72e21 100644 --- a/docs/cudf/source/developer_guide/pylibcudf.md +++ b/docs/cudf/source/developer_guide/pylibcudf.md @@ -186,7 +186,7 @@ Here is an example of appropriate enum usage. ```cython -# cpp/copying.pxd +# pylibcudf/libcudf/copying.pxd cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: # cpdef here so that we export both a cdef enum class and a Python enum.Enum. cpdef enum class out_of_bounds_policy(bool): @@ -194,8 +194,9 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: DONT_CHECK -# cpp/copying.pyx -# This file is empty, but is required to compile the Python enum in cpp/copying.pxd +# pylibcudf/libcudf/copying.pyx +# This file is empty, but is required to compile the Python enum in pylibcudf/libcudf/copying.pxd +# Ensure this file is included in pylibcudf/libcudf/CMakeLists.txt # pylibcudf/copying.pxd @@ -203,21 +204,21 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: # cimport the enum using the exact name # Once https://github.com/cython/cython/issues/5609 is resolved, # this import should instead be -# from cudf._lib.cpp.copying cimport out_of_bounds_policy as OutOfBoundsPolicy -from cudf._lib.cpp.copying cimport out_of_bounds_policy +# from pylibcudf.libcudf.copying cimport out_of_bounds_policy as OutOfBoundsPolicy +from pylibcudf.libcudf.copying cimport out_of_bounds_policy # pylibcudf/copying.pyx # Access cpp.copying members that aren't part of this module's public API via # this module alias -from cudf._lib.cpp cimport copying as cpp_copying -from cudf._lib.cpp.copying cimport out_of_bounds_policy +from pylibcudf.libcudf cimport copying as cpp_copying +from pylibcudf.libcudf.copying cimport out_of_bounds_policy # This import exposes the enum in the public API of this module. # It requires a no-cython-lint tag because it will be unused: all typing of # parameters etc will need to use the Cython name `out_of_bounds_policy` until # the Cython bug is resolved. -from cudf._lib.cpp.copying import \ +from pylibcudf.libcudf.copying import \ out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint ``` diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd index 0415178b3a9..6f8797ae7d3 100644 --- a/python/pylibcudf/pylibcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/labeling.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp cimport bool +from pylibcudf.libcudf.labeling cimport inclusive from .column cimport Column diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index 8c269b51ca4..b5a7445df36 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -6,6 +6,8 @@ from pylibcudf.libcudf cimport labeling as cpp_labeling from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.labeling cimport inclusive +from pylibcudf.libcudf.labeling import inclusive as Inclusive # no-cython-lint + from .column cimport Column @@ -39,10 +41,14 @@ cpdef Column label_bins( """ cdef unique_ptr[column] c_result cdef inclusive c_left_inclusive = ( - inclusive.YES if left_inclusive else inclusive.NO + inclusive.YES + if left_inclusive + else inclusive.NO ) cdef inclusive c_right_inclusive = ( - inclusive.YES if right_inclusive else inclusive.NO + inclusive.YES + if right_inclusive + else inclusive.NO ) with nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index b04e94f1546..2167616690f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx reduce.pyx replace.pyx - round.pyx stream_compaction.pyx types.pyx unary.pyx +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx labeling.pyx reduce.pyx + replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/labeling.pyx b/python/pylibcudf/pylibcudf/libcudf/labeling.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index 0a956eab70b..35f33a46837 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -18,3 +18,8 @@ def test_label_bins(left_inclusive, right_inclusive): ) expected = pa.chunked_array([[0, 0, 0]], type=pa.int32()) assert result.equals(expected) + + +def test_Inclusive_enum(): + assert plc.labeling.Inclusive.YES == 0 + assert plc.labeling.Inclusive.NO == 1 From 0892610e4311f0aaf13787cc2343799e9542d13d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Sep 2024 09:57:19 -0700 Subject: [PATCH 4/4] Update python/pylibcudf/pylibcudf/tests/test_labeling.py Co-authored-by: Lawrence Mitchell --- python/pylibcudf/pylibcudf/tests/test_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index 35f33a46837..f7fb7463b50 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -20,6 +20,6 @@ def test_label_bins(left_inclusive, right_inclusive): assert result.equals(expected) -def test_Inclusive_enum(): +def test_inclusive_enum(): assert plc.labeling.Inclusive.YES == 0 assert plc.labeling.Inclusive.NO == 1