From 84e7c4eedfd086ac15857de5055b1049dd08c6db Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:15:51 +0000 Subject: [PATCH 01/21] Handle case of broadcasting empty list of columns --- python/cudf_polars/cudf_polars/dsl/ir.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9b3096becd4..31a0be004ea 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -96,6 +96,8 @@ def broadcast( ``target_length`` is provided and not all columns are length-1 (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``. """ + if len(columns) == 0: + return [] lengths: set[int] = {column.obj.size() for column in columns} if lengths == {1}: if target_length is None: From 27ac28abb7b9bca67f12bfebaec0bc74a891d3b2 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:16:54 +0000 Subject: [PATCH 02/21] Create regex program during StringFunction init This way, we raise early if we can't support the pattern. --- python/cudf_polars/cudf_polars/dsl/expr.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fe859c8d958..b8fb200b8df 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -679,7 +679,7 @@ def do_evaluate( class StringFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options", "children", "regex_program") _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
@@ -716,6 +716,13 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + try: + self.regex_program = plc.strings.regex_program.RegexProgram.create( + self.children[1].value.as_py(), + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except Exception as e: + raise NotImplementedError("Unsupported regex pattern.") from e def do_evaluate( self, @@ -739,11 +746,9 @@ def do_evaluate( ) return Column(plc.strings.find.contains(column.obj, pattern)) assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + return Column( + plc.strings.contains.contains_re(column.obj, self.regex_program) ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children From a4dbd0d158c4db5edd4aa989847c4e325f72d18b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:18:32 +0000 Subject: [PATCH 03/21] Match ordering requirements of polars for left join --- python/cudf_polars/cudf_polars/dsl/ir.py | 61 ++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 31a0be004ea..2fc7820b40b 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -575,6 +575,62 @@ def _joiners( else: assert_never(how) + def _reorder_maps( + self, + left_rows: int, + lg: plc.Column, + left_policy: plc.copying.OutOfBoundsPolicy, + right_rows: int, + rg: plc.Column, + right_policy: plc.copying.OutOfBoundsPolicy, + ) -> tuple[plc.Column, plc.Column]: + """ + Reorder gather maps to satisfy polars join order restrictions. 
+
+        Parameters
+        ----------
+        left_rows
+            Number of rows in left table
+        lg
+            Left gather map
+        left_policy
+            Nullify policy for left map
+        right_rows
+            Number of rows in right table
+        rg
+            Right gather map
+        right_policy
+            Nullify policy for right map
+
+        Returns
+        -------
+        tuple of reordered left and right gather maps.
+
+        Notes
+        -----
+        For a left join, the polars result preserves the order of the
+        left keys, and is stable wrt the right keys. For all other
+        joins, there is no order obligation.
+        """
+        # TODO: size_type
+        dt = plc.interop.to_arrow(plc.DataType(plc.TypeId.INT32))
+        init = plc.interop.from_arrow(pa.scalar(0, type=dt))
+        step = plc.interop.from_arrow(pa.scalar(1, type=dt))
+        left_order = plc.copying.gather(
+            plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy
+        )
+        right_order = plc.copying.gather(
+            plc.Table([plc.filling.sequence(right_rows, init, step)]), rg, right_policy
+        )
+        return tuple(
+            plc.sorting.stable_sort_by_key(
+                plc.Table([lg, rg]),
+                plc.Table([*left_order.columns(), *right_order.columns()]),
+                [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING],
+                [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
+            ).columns()
+        )
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         left = self.left.evaluate(cache=cache)
@@ -617,6 +673,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         lg, rg = join_fn(left_on.table, right_on.table, null_equality)
         if coalesce and how == "inner":
             right = right.discard_columns(right_on.column_names_set)
+        if how == "left":
+            # Order of left table is preserved
+            lg, rg = self._reorder_maps(
+                left.num_rows, lg, left_policy, right.num_rows, rg, right_policy
+            )
         left = DataFrame.from_table(
             plc.copying.gather(left.table, lg, left_policy), left.column_names
         )

From 2a628dac9c8e85036e6ec291e63eedd5ee610a8b Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Mon, 1 Jul 2024 17:19:09 +0000
Subject: [PATCH 04/21] Allow specifying exceptions to catch in execute_with_cudf Default to Exception so we can catch errors from arrow and other third-party libraries. --- python/cudf_polars/cudf_polars/callback.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 979087d5273..764cdd3b3ca 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -34,7 +34,12 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf( + nt: NodeTraverser, + *, + raise_on_fail: bool = False, + exception: type[Exception] | tuple[type[Exception], ...] = Exception, +) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None Should conversion raise an exception rather than continuing without setting a callback. + exception + Optional exception, or tuple of exceptions, to catch during + translation. Defaults to ``Exception``. + The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except NotImplementedError: + except exception: if raise_on_fail: raise From 917b5a75409c9bc466905c320696e08688498a0c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:20:41 +0000 Subject: [PATCH 05/21] Fix bug in documented behaviour of with_columns We were not previously discarding overlapping column names. 
--- python/cudf_polars/cudf_polars/containers/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ec8d00c3123..d86656578d7 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,6 +5,7 @@ from __future__ import annotations +import itertools from functools import cached_property from typing import TYPE_CHECKING, cast @@ -160,7 +161,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns]) + columns = list( + {c.name: c for c in itertools.chain(self.columns, columns)}.values() + ) + return type(self)(columns) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" From 672b356d6fa440e17a848eccee4bf54824ded30b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:21:19 +0000 Subject: [PATCH 06/21] Allow dataframe to have overlapping names Only complain when trying to select columns. This is a bit of a band-aid, should fix closer to the source: intermediate results don't necessarily need names (e.g. join keys). 
--- python/cudf_polars/cudf_polars/containers/dataframe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index d86656578d7..cf7c2ab7fa0 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -173,6 +173,8 @@ def discard_columns(self, names: Set[str]) -> Self: def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" want = set(names) + if len(self.columns) != len(self._column_map): + raise ValueError("Lost some columns with overlapping names") if not want.issubset(self.column_names_set): raise ValueError("Can't select missing names") return type(self)([self._column_map[name] for name in names]) From 9465011165649d2223db11f379e5fa58d4b0d840 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:24:27 +0000 Subject: [PATCH 07/21] Cast count aggs to correct dtype in translation --- python/cudf_polars/cudf_polars/dsl/translate.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a2fdb3c3d79..fc350048082 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -443,12 +443,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Agg( + value = expr.Agg( dtype, node.name, node.options, *(translate_expr(visitor, n=n) for n in node.arguments), ) + if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: + return expr.Cast(value.dtype, value) + return value @_translate_expr.register @@ -475,7 +478,10 @@ def _( @_translate_expr.register def _(node: pl_expr.Len, visitor: 
NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Len(dtype) + value = expr.Len(dtype) + if dtype.id() != plc.TypeId.INT32: + return expr.Cast(dtype, value) + return value def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: From 124dc5c5f6676dfe0f110ad267b66780bb11ce73 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:24:57 +0000 Subject: [PATCH 08/21] Raise on unsupported nested types --- python/cudf_polars/cudf_polars/utils/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 507acb5d33a..da9bb99a6d1 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -167,6 +167,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: return plc.DataType(plc.TypeId.EMPTY) elif isinstance(dtype, pl.List): # TODO: This doesn't consider the value type. + # Recurse to catch unsupported inner types + _ = from_polars(dtype.inner) return plc.DataType(plc.TypeId.LIST) else: raise NotImplementedError(f"{dtype=} conversion not supported") From 72c7883fdc5ef94db2f966ddbac26270bfccaea2 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:25:14 +0000 Subject: [PATCH 09/21] Handle empty column name in conversion to polars --- python/cudf_polars/cudf_polars/containers/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index cf7c2ab7fa0..7270b99e61f 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -51,7 +51,11 @@ def to_polars(self) -> pl.DataFrame: [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], ) - return cast(pl.DataFrame, pl.from_arrow(table)) + result: pl.DataFrame = cast(pl.DataFrame, 
pl.from_arrow(table)) + if list(result.schema.keys()) != self.column_names: + # Empty column name gets translated in from_arrow + return result.rename(dict(zip(result.schema.keys(), self.column_names))) + return result @cached_property def column_names_set(self) -> frozenset[str]: From dc0cb3f82564bc51b1c653987e0f504f13188df1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:26:01 +0000 Subject: [PATCH 10/21] Raise for unsupported cast to/from strings --- python/cudf_polars/cudf_polars/dsl/expr.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index b8fb200b8df..fdb302e73b6 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -960,6 +960,13 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) + if ( + self.dtype.id() == plc.TypeId.STRING + or value.dtype.id() == plc.TypeId.STRING + ): + raise NotImplementedError( + "Need to implement cast to/from string separately." + ) def do_evaluate( self, From 16d7b86b281fbf9e296d7884c0b867a06e250369 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:26:21 +0000 Subject: [PATCH 11/21] WIP: Fix bug in HConcat Shorter tables must be extended with nulls before concatenation. 
--- python/cudf_polars/cudf_polars/dsl/ir.py | 33 ++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 2fc7820b40b..1f531af1673 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1017,6 +1017,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ).slice(self.zlice) +def empty_column( + nrows: int, dtype: plc.DataType, mask_state: plc.MaskState +) -> plc.Column: + if dtype.id() in (plc.TypeId.LIST, plc.TypeId.STRING): + raise NotImplementedError("Empty list/string column") + return plc.column_factories.make_fixed_width_column(dtype, nrows, mask_state) + + @dataclasses.dataclass class HConcat(IR): """Concatenate dataframes horizontally.""" @@ -1027,6 +1035,31 @@ class HConcat(IR): def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] + max_rows = max(df.num_rows for df in dfs) + # Horizontal concatenation extends shorter tables with nulls + dfs = [ + df + if df.num_rows == max_rows + else DataFrame.from_table( + plc.concatenate.concatenate( + [ + df.table, + plc.Table( + [ + empty_column( + max_rows - df.num_rows, + c.obj.type(), + plc.MaskState.ALL_NULL, + ) + for c in df.columns + ] + ), + ] + ), + df.column_names, + ) + for df in dfs + ] return DataFrame( list(itertools.chain.from_iterable(df.columns for df in dfs)), ) From cbc493e8ae29625b75b3afa8c1413035eaaf726e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:27:01 +0000 Subject: [PATCH 12/21] Translate BinOp.Add between strings to ConcatHorizontal This immediately raises, but previously we had a runtime error. 
--- python/cudf_polars/cudf_polars/dsl/translate.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index fc350048082..b1abaaa0f8c 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -468,12 +468,13 @@ def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> exp def _( node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType ) -> expr.Expr: - return expr.BinOp( - dtype, - expr.BinOp._MAPPING[node.op], - translate_expr(visitor, n=node.left), - translate_expr(visitor, n=node.right), - ) + left = translate_expr(visitor, n=node.left) + right = translate_expr(visitor, n=node.right) + if dtype.id() == plc.TypeId.STRING and node.op == pl_expr.Operator.PLUS: + return expr.StringFunction( + dtype, pl_expr.StringFunction.ConcatHorizontal, (), left, right + ) + return expr.BinOp(dtype, expr.BinOp._MAPPING[node.op], left, right) @_translate_expr.register From ffb000839b98afcc8c3e86a619c646d4fb84a611 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 2 Jul 2024 11:29:56 +0000 Subject: [PATCH 13/21] Only produce row_index if the schema demands it in Scan --- python/cudf_polars/cudf_polars/dsl/ir.py | 32 +++++++++++++----------- python/cudf_polars/tests/test_scan.py | 11 ++++++++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1f531af1673..60fe3252700 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -233,21 +233,23 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index - dtype = self.schema[name] - step = plc.interop.from_arrow( - pa.scalar(1, 
type=plc.interop.to_arrow(dtype)) - ) - init = plc.interop.from_arrow( - pa.scalar(offset, type=plc.interop.to_arrow(dtype)) - ) - index = NamedColumn( - plc.filling.sequence(df.num_rows, init, step), - name, - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.AFTER, - ) - df = DataFrame([index, *df.columns]) + # Only make row index if the schema demands it + if name in self.schema: + dtype = self.schema[name] + step = plc.interop.from_arrow( + pa.scalar(1, type=plc.interop.to_arrow(dtype)) + ) + init = plc.interop.from_arrow( + pa.scalar(offset, type=plc.interop.to_arrow(dtype)) + ) + index = NamedColumn( + plc.filling.sequence(df.num_rows, init, step), + name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.AFTER, + ) + df = DataFrame([index, *df.columns]) # TODO: should be true, but not the case until we get # cudf-classic out of the loop for IO since it converts date32 # to datetime. 
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index f129cc7ca32..af0baacd5a8 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -97,3 +97,14 @@ def test_scan_unsupported_raises(tmp_path): df.write_ndjson(tmp_path / "df.json") q = pl.scan_ndjson(tmp_path / "df.json") assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/pull/17363") +def test_scan_row_index_projected_out(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.pq") + + q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a")) + + assert_gpu_result_equal(q) From a4ee0803552b3154efe207170e3d535cb04d0d25 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 13:38:03 +0000 Subject: [PATCH 14/21] Expose type traits in pylibcudf --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 4 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 + .../pylibcudf/libcudf/utilities/traits.pxd | 27 +++ python/cudf/cudf/_lib/pylibcudf/traits.pxd | 25 +++ python/cudf/cudf/_lib/pylibcudf/traits.pyx | 187 ++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/common/utils.py | 39 ---- .../cudf/cudf/pylibcudf_tests/test_copying.py | 47 +++-- .../cudf/cudf/pylibcudf_tests/test_traits.py | 110 +++++++++++ 9 files changed, 386 insertions(+), 58 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/traits.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/traits.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_traits.py diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0a198f431a7..c0146f3ff50 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt 
@@ -39,6 +39,7 @@ set(cython_sources sorting.pyx table.pyx types.pyx + traits.pyx unary.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5131df9a5cd..2376b0b0582 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -23,6 +23,7 @@ from . cimport ( sorting, stream_compaction, strings, + traits, types, unary, ) @@ -37,6 +38,7 @@ from .types cimport DataType, type_id __all__ = [ "Column", "DataType", + "MaskState", "Scalar", "Table", "aggregation", @@ -54,6 +56,7 @@ __all__ = [ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", @@ -61,5 +64,6 @@ __all__ = [ "strings", "sorting", "types", + "traits", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 43a9e2aca31..8610bf175d0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -23,6 +23,7 @@ sorting, stream_compaction, strings, + traits, types, unary, ) @@ -35,6 +36,7 @@ __all__ = [ "Column", "DataType", + "MaskState", "Scalar", "Table", "TypeId", @@ -53,6 +55,7 @@ "merge", "quantiles", "reduce", + "reshape", "replace", "rolling", "round", @@ -60,6 +63,7 @@ "stream_compaction", "strings", "sorting", + "traits", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd new file mode 100644 index 00000000000..0cc58af735b --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport data_type + + +cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: + cdef bool is_relationally_comparable(data_type) + cdef bool is_equality_comparable(data_type) + cdef bool is_numeric(data_type) + cdef bool is_index_type(data_type) + cdef bool is_unsigned(data_type) + cdef bool is_integral(data_type) + cdef bool is_integral_not_bool(data_type) + cdef bool is_floating_point(data_type) + cdef bool is_boolean(data_type) + cdef bool is_timestamp(data_type) + cdef bool is_fixed_point(data_type) + cdef bool is_duration(data_type) + cdef bool is_chrono(data_type) + cdef bool is_dictionary(data_type) + cdef bool is_fixed_width(data_type) + cdef bool is_compound(data_type) + cdef bool is_nested(data_type) + cdef bool is_bit_castable(data_type, data_type) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd new file mode 100644 index 00000000000..668fa775202 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ) +cpdef bool is_equality_comparable(DataType typ) +cpdef bool is_numeric(DataType typ) +cpdef bool is_index_type(DataType typ) +cpdef bool is_unsigned(DataType typ) +cpdef bool is_integral(DataType typ) +cpdef bool is_integral_not_bool(DataType typ) +cpdef bool is_floating_point(DataType typ) +cpdef bool is_boolean(DataType typ) +cpdef bool is_timestamp(DataType typ) +cpdef bool is_fixed_point(DataType typ) +cpdef bool is_duration(DataType typ) +cpdef bool is_chrono(DataType typ) +cpdef bool is_dictionary(DataType typ) +cpdef bool is_fixed_width(DataType typ) +cpdef bool is_compound(DataType typ) +cpdef bool is_nested(DataType typ) +cpdef bool is_bit_castable(DataType source, DataType target) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx new file mode 100644 index 00000000000..fdd8f1e5057 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -0,0 +1,187 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from cudf._lib.pylibcudf.libcudf.utilities cimport traits + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ): + """Checks if the given data type supports relational comparisons. + + See Also + -------- + :cpp:func:`is_relationally_comparable` + """ + return traits.is_relationally_comparable(typ.c_obj) + + +cpdef bool is_equality_comparable(DataType typ): + """Checks if the given data type supports equality comparisons. + + See Also + -------- + :cpp:func:`is_equality_comparable` + """ + return traits.is_equality_comparable(typ.c_obj) + + +cpdef bool is_numeric(DataType typ): + """Checks if the given data type is numeric. + + See Also + -------- + :cpp:func:`is_numeric` + """ + return traits.is_numeric(typ.c_obj) + + +cpdef bool is_index_type(DataType typ): + """Checks if the given data type is an index type. 
+ + See Also + -------- + :cpp:func:`is_index_type` + """ + return traits.is_index_type(typ.c_obj) + + +cpdef bool is_unsigned(DataType typ): + """Checks if the given data type is an unsigned type. + + See Also + -------- + :cpp:func:`is_unsigned` + """ + return traits.is_unsigned(typ.c_obj) + + +cpdef bool is_integral(DataType typ): + """Checks if the given data type is an integral type. + + See Also + -------- + :cpp:func:`is_integral` + """ + return traits.is_integral(typ.c_obj) + + +cpdef bool is_integral_not_bool(DataType typ): + """Checks if the given data type is an integral type excluding booleans. + + See Also + -------- + :cpp:func:`is_integral_not_bool` + """ + return traits.is_integral_not_bool(typ.c_obj) + + +cpdef bool is_floating_point(DataType typ): + """Checks if the given data type is a floating point type. + + See Also + -------- + :cpp:func:`is_floating_point` + """ + return traits.is_floating_point(typ.c_obj) + + +cpdef bool is_boolean(DataType typ): + """Checks if the given data type is a boolean type. + + See Also + -------- + :cpp:func:`is_boolean` + """ + return traits.is_boolean(typ.c_obj) + + +cpdef bool is_timestamp(DataType typ): + """Checks if the given data type is a timestamp type. + + See Also + -------- + :cpp:func:`is_timestamp` + """ + return traits.is_timestamp(typ.c_obj) + + +cpdef bool is_fixed_point(DataType typ): + """Checks if the given data type is a fixed point type. + + See Also + -------- + :cpp:func:`is_fixed_point` + """ + return traits.is_fixed_point(typ.c_obj) + + +cpdef bool is_duration(DataType typ): + """Checks if the given data type is a duration type. + + See Also + -------- + :cpp:func:`is_duration` + """ + return traits.is_duration(typ.c_obj) + + +cpdef bool is_chrono(DataType typ): + """Checks if the given data type is a chrono type. 
+ + See Also + -------- + :cpp:func:`is_chrono` + """ + return traits.is_chrono(typ.c_obj) + + +cpdef bool is_dictionary(DataType typ): + """Checks if the given data type is a dictionary type. + + See Also + -------- + :cpp:func:`is_dictionary` + """ + return traits.is_dictionary(typ.c_obj) + + +cpdef bool is_fixed_width(DataType typ): + """Checks if the given data type is a fixed width type. + + See Also + -------- + :cpp:func:`is_fixed_width` + """ + return traits.is_fixed_width(typ.c_obj) + + +cpdef bool is_compound(DataType typ): + """Checks if the given data type is a compound type. + + See Also + -------- + :cpp:func:`is_compound` + """ + return traits.is_compound(typ.c_obj) + + +cpdef bool is_nested(DataType typ): + """Checks if the given data type is a nested type. + + See Also + -------- + :cpp:func:`is_nested` + """ + return traits.is_nested(typ.c_obj) + + +cpdef bool is_bit_castable(DataType source, DataType target): + """Checks if the source type is bit-castable to the target type. + + See Also + -------- + :cpp:func:`is_bit_castable` + """ + return traits.is_bit_castable(source.c_obj, target.c_obj) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index f8bfe340ae5..d41e6c720bf 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -102,49 +102,10 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs): return pytest.raises(expected_exception, *args, **kwargs) -# TODO: Consider moving these type utilities into pylibcudf.types itself. 
-def is_signed_integer(plc_dtype: plc.DataType): - return ( - plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value - ) - - -def is_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.INT8, - plc.TypeId.INT16, - plc.TypeId.INT32, - plc.TypeId.INT64, - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - - -def is_floating(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.FLOAT32, - plc.TypeId.FLOAT64, - ) - - -def is_boolean(plc_dtype: plc.DataType): - return plc_dtype.id() == plc.TypeId.BOOL8 - - def is_string(plc_dtype: plc.DataType): return plc_dtype.id() == plc.TypeId.STRING -def is_fixed_width(plc_dtype: plc.DataType): - return ( - is_integer(plc_dtype) - or is_floating(plc_dtype) - or is_boolean(plc_dtype) - ) - - def nesting_level(typ) -> tuple[int, int]: """Return list and struct nesting of a pyarrow type.""" if isinstance(typ, pa.ListType): diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index 0a6df198d46..f27fe4e942e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -9,9 +9,6 @@ assert_column_eq, assert_table_eq, cudf_raises, - is_fixed_width, - is_floating, - is_integer, is_nested_list, is_nested_struct, is_string, @@ -359,9 +356,9 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): pa_array = pa.array([True] * plc_source_table.num_rows()) else: pa_array = pa.array([1] * plc_source_table.num_rows()) @@ -428,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table): _, plc_index_column = index_column _, 
plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))] @@ -458,7 +455,7 @@ def test_empty_like_table(source_table): @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): _, plc_input_column = input_column - if is_fixed_width(plc_input_column.type()): + if plc.traits.is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, @@ -484,7 +481,7 @@ def test_copy_range_in_place( pa_target_column, _ = target_column - if not is_fixed_width(mutable_target_column.type()): + if not plc.traits.is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( plc_input_column, @@ -516,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds( ): _, plc_input_column = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( plc_input_column, @@ -528,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds( def test_copy_range_in_place_different_types(mutable_target_column): - if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := mutable_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -548,7 +547,7 @@ def test_copy_range_in_place_null_mismatch( ): pa_input_column, _ = input_column - if is_fixed_width(mutable_target_column.type()): + if 
plc.traits.is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), pa_input_column, @@ -568,7 +567,9 @@ def test_copy_range_in_place_null_mismatch( def test_copy_range(input_column, target_column): pa_input_column, plc_input_column = input_column pa_target_column, plc_target_column = target_column - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.copy_range( plc_input_column, plc_target_column, @@ -610,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column): def test_copy_range_different_types(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -629,7 +632,9 @@ def test_shift(target_column, source_scalar): pa_source_scalar, plc_source_scalar = source_scalar pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] @@ -642,7 +647,9 @@ def test_shift(target_column, source_scalar): def test_shift_type_mismatch(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): fill_value = 
plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) @@ -747,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): def test_copy_if_else_wrong_type(target_column, mask): _, plc_target_column = target_column _, plc_mask = mask - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow( pa.array(["a"] * plc_target_column.size()) ) @@ -951,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): _, plc_target_table = target_table _, plc_mask = mask - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py new file mode 100644 index 00000000000..6c22cb02f21 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cudf._lib import pylibcudf as plc + + +def test_is_relationally_comparable(): + assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_relationally_comparable( + plc.DataType(plc.TypeId.LIST) + ) + + +def test_is_equality_comparable(): + assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST)) + + +def test_is_numeric(): + assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) + + +def test_is_index_type(): + assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_unsigned(): + assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8)) + assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8)) + + +def test_is_integral(): + assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32)) + + +def test_is_integral_not_bool(): + assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_floating_point(): + assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_boolean(): + assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_timestamp(): + assert plc.traits.is_timestamp( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_timestamp( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + + +def test_is_fixed_point(): + assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128)) + assert not 
plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32)) + + +def test_is_duration(): + assert plc.traits.is_duration( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + assert not plc.traits.is_duration( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + + +def test_is_chrono(): + assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS)) + assert plc.traits.is_chrono( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_dictionary(): + assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32)) + assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_fixed_width(): + assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING)) + + +def test_is_compound(): + assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_nested(): + assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING)) + + +def test_is_bit_castable(): + assert plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8) + ) + assert not plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16) + ) From 4fb4ea9cd1d0d49bc457f903d320a44147f01d47 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 13:43:13 +0000 Subject: [PATCH 15/21] Use new pylibcudf type traits in polars interpreter --- python/cudf_polars/cudf_polars/dsl/expr.py | 3 ++- python/cudf_polars/cudf_polars/utils/dtypes.py | 13 ------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fe859c8d958..b8c3e312ac8 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py 
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1182,7 +1182,8 @@ def __init__( self.children = (left, right) if ( op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES)) + and plc.traits.is_chrono(left.dtype) + and plc.traits.is_chrono(right.dtype) and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) ): raise NotImplementedError("Casting rules for timelike types") diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 507acb5d33a..918cd024fa2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -17,19 +17,6 @@ __all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] -TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset( - [ - plc.TypeId.TIMESTAMP_MILLISECONDS, - plc.TypeId.TIMESTAMP_MICROSECONDS, - plc.TypeId.TIMESTAMP_NANOSECONDS, - plc.TypeId.TIMESTAMP_DAYS, - plc.TypeId.DURATION_MILLISECONDS, - plc.TypeId.DURATION_MICROSECONDS, - plc.TypeId.DURATION_NANOSECONDS, - ] -) - - def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): """ Do two datetime typeids have matching resolution for a binop. 
From 460439b808753b498b0870e8ab6bad2a200f9ac0 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 14:32:21 +0000 Subject: [PATCH 16/21] Alphabetise --- python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index c0146f3ff50..d22096081af 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -38,8 +38,8 @@ set(cython_sources stream_compaction.pyx sorting.pyx table.pyx - types.pyx traits.pyx + types.pyx unary.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 2376b0b0582..61517aa737f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -63,7 +63,7 @@ __all__ = [ "stream_compaction", "strings", "sorting", - "types", "traits", + "types", "unary", ] From afd59416e8748bae48d3e7040f96ffec728d31e7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 14:33:25 +0000 Subject: [PATCH 17/21] MaskState not in all --- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 61517aa737f..d4d615cde34 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -38,7 +38,6 @@ from .types cimport DataType, type_id __all__ = [ "Column", "DataType", - "MaskState", "Scalar", "Table", "aggregation", From 23f93cb280389d267713858338959c5a7ef5c344 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 14:34:32 +0000 Subject: [PATCH 18/21] So bad at the alphabet --- python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 8610bf175d0..91f8acaf682 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -55,8 +55,8 @@ "merge", "quantiles", "reduce", - "reshape", "replace", + "reshape", "rolling", "round", "search", From 53ca18ca2aef774a315a2ebde267d4ab1ff6bba7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Jul 2024 09:55:14 +0000 Subject: [PATCH 19/21] Link into docs and fix sphinx errors Impossible to use See Also sections with cross-linking to C++ function names. --- .../user_guide/api_docs/pylibcudf/index.rst | 7 +- .../user_guide/api_docs/pylibcudf/traits.rst | 6 ++ python/cudf/cudf/_lib/pylibcudf/traits.pyx | 72 +++++-------------- 3 files changed, 28 insertions(+), 57 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e9dad705cbf..bd6f0f77357 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -18,22 +18,22 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby - io/index.rst interop join lists merge quantiles reduce + replace reshape rolling round scalar search - stream_compaction sorting - replace + stream_compaction table + traits types unary @@ -41,4 +41,5 @@ This page provides API documentation for pylibcudf. :maxdepth: 2 :caption: Subpackages + io/index.rst strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst new file mode 100644 index 00000000000..294ca8dc78c --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst @@ -0,0 +1,6 @@ +====== +traits +====== + +.. 
automodule:: cudf._lib.pylibcudf.traits + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx index fdd8f1e5057..d2370f8d641 100644 --- a/python/cudf/cudf/_lib/pylibcudf/traits.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -10,9 +10,7 @@ from .types cimport DataType cpdef bool is_relationally_comparable(DataType typ): """Checks if the given data type supports relational comparisons. - See Also - -------- - :cpp:func:`is_relationally_comparable` + For details, see :cpp:func:`is_relationally_comparable`. """ return traits.is_relationally_comparable(typ.c_obj) @@ -20,9 +18,7 @@ cpdef bool is_relationally_comparable(DataType typ): cpdef bool is_equality_comparable(DataType typ): """Checks if the given data type supports equality comparisons. - See Also - -------- - :cpp:func:`is_equality_comparable` + For details, see :cpp:func:`is_equality_comparable`. """ return traits.is_equality_comparable(typ.c_obj) @@ -30,9 +26,7 @@ cpdef bool is_equality_comparable(DataType typ): cpdef bool is_numeric(DataType typ): """Checks if the given data type is numeric. - See Also - -------- - :cpp:func:`is_numeric` + For details, see :cpp:func:`is_numeric`. """ return traits.is_numeric(typ.c_obj) @@ -40,9 +34,7 @@ cpdef bool is_numeric(DataType typ): cpdef bool is_index_type(DataType typ): """Checks if the given data type is an index type. - See Also - -------- - :cpp:func:`is_index_type` + For details, see :cpp:func:`is_index_type`. """ return traits.is_index_type(typ.c_obj) @@ -50,9 +42,7 @@ cpdef bool is_index_type(DataType typ): cpdef bool is_unsigned(DataType typ): """Checks if the given data type is an unsigned type. - See Also - -------- - :cpp:func:`is_unsigned` + For details, see :cpp:func:`is_unsigned`. """ return traits.is_unsigned(typ.c_obj) @@ -60,9 +50,7 @@ cpdef bool is_unsigned(DataType typ): cpdef bool is_integral(DataType typ): """Checks if the given data type is an integral type. 
- See Also - -------- - :cpp:func:`is_integral` + For details, see :cpp:func:`is_integral`. """ return traits.is_integral(typ.c_obj) @@ -70,9 +58,7 @@ cpdef bool is_integral(DataType typ): cpdef bool is_integral_not_bool(DataType typ): """Checks if the given data type is an integral type excluding booleans. - See Also - -------- - :cpp:func:`is_integral_not_bool` + For details, see :cpp:func:`is_integral_not_bool`. """ return traits.is_integral_not_bool(typ.c_obj) @@ -80,9 +66,7 @@ cpdef bool is_integral_not_bool(DataType typ): cpdef bool is_floating_point(DataType typ): """Checks if the given data type is a floating point type. - See Also - -------- - :cpp:func:`is_floating_point` + For details, see :cpp:func:`is_floating_point`. """ return traits.is_floating_point(typ.c_obj) @@ -90,9 +74,7 @@ cpdef bool is_floating_point(DataType typ): cpdef bool is_boolean(DataType typ): """Checks if the given data type is a boolean type. - See Also - -------- - :cpp:func:`is_boolean` + For details, see :cpp:func:`is_boolean`. """ return traits.is_boolean(typ.c_obj) @@ -100,9 +82,7 @@ cpdef bool is_boolean(DataType typ): cpdef bool is_timestamp(DataType typ): """Checks if the given data type is a timestamp type. - See Also - -------- - :cpp:func:`is_timestamp` + For details, see :cpp:func:`is_timestamp`. """ return traits.is_timestamp(typ.c_obj) @@ -110,9 +90,7 @@ cpdef bool is_timestamp(DataType typ): cpdef bool is_fixed_point(DataType typ): """Checks if the given data type is a fixed point type. - See Also - -------- - :cpp:func:`is_fixed_point` + For details, see :cpp:func:`is_fixed_point`. """ return traits.is_fixed_point(typ.c_obj) @@ -120,9 +98,7 @@ cpdef bool is_fixed_point(DataType typ): cpdef bool is_duration(DataType typ): """Checks if the given data type is a duration type. - See Also - -------- - :cpp:func:`is_duration` + For details, see :cpp:func:`is_duration`. 
""" return traits.is_duration(typ.c_obj) @@ -130,9 +106,7 @@ cpdef bool is_duration(DataType typ): cpdef bool is_chrono(DataType typ): """Checks if the given data type is a chrono type. - See Also - -------- - :cpp:func:`is_chrono` + For details, see :cpp:func:`is_chrono`. """ return traits.is_chrono(typ.c_obj) @@ -140,9 +114,7 @@ cpdef bool is_chrono(DataType typ): cpdef bool is_dictionary(DataType typ): """Checks if the given data type is a dictionary type. - See Also - -------- - :cpp:func:`is_dictionary` + For details, see :cpp:func:`is_dictionary`. """ return traits.is_dictionary(typ.c_obj) @@ -150,9 +122,7 @@ cpdef bool is_dictionary(DataType typ): cpdef bool is_fixed_width(DataType typ): """Checks if the given data type is a fixed width type. - See Also - -------- - :cpp:func:`is_fixed_width` + For details, see :cpp:func:`is_fixed_width`. """ return traits.is_fixed_width(typ.c_obj) @@ -160,9 +130,7 @@ cpdef bool is_fixed_width(DataType typ): cpdef bool is_compound(DataType typ): """Checks if the given data type is a compound type. - See Also - -------- - :cpp:func:`is_compound` + For details, see :cpp:func:`is_compound`. """ return traits.is_compound(typ.c_obj) @@ -170,9 +138,7 @@ cpdef bool is_compound(DataType typ): cpdef bool is_nested(DataType typ): """Checks if the given data type is a nested type. - See Also - -------- - :cpp:func:`is_nested` + For details, see :cpp:func:`is_nested`. """ return traits.is_nested(typ.c_obj) @@ -180,8 +146,6 @@ cpdef bool is_nested(DataType typ): cpdef bool is_bit_castable(DataType source, DataType target): """Checks if the source type is bit-castable to the target type. - See Also - -------- - :cpp:func:`is_bit_castable` + For details, see :cpp:func:`is_bit_castable`. 
""" return traits.is_bit_castable(source.c_obj, target.c_obj) From 964ff7272a6e2e7899743906f706fb65d8131795 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Jul 2024 17:26:51 +0000 Subject: [PATCH 20/21] Avoid out of bounds access in read_csv If one passes an empty list for the names, things should work. --- python/cudf/cudf/_lib/csv.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index c706351a683..9fecff5f5f6 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -450,7 +450,7 @@ def read_csv( col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) - if names is not None and isinstance(names[0], (int)): + if names is not None and len(names) and isinstance(names[0], (int)): df.columns = [int(x) for x in df._data] # Set index if the index_col parameter is passed From 79a260502691d6733145957cadf4e5784ce75532 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Jul 2024 17:33:05 +0000 Subject: [PATCH 21/21] WIP: Handling csv reader options Mostly there, just need to flesh out tests for coverage. --- python/cudf_polars/cudf_polars/dsl/ir.py | 99 ++++++++++++++++--- .../cudf_polars/cudf_polars/dsl/translate.py | 9 +- python/cudf_polars/tests/test_scan.py | 17 ++++ 3 files changed, 112 insertions(+), 13 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 60fe3252700..056f3c76f78 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,9 +15,9 @@ import dataclasses import itertools -import json import types from functools import cache +from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, ClassVar import pyarrow as pa @@ -185,8 +185,10 @@ class Scan(IR): typ: str """What type of file are we reading? Parquet, CSV, etc...""" - options: tuple[Any, ...] 
- """Type specific options, as json-encoded strings.""" + reader_options: dict[str, Any] + """Reader-specific options, as dictionary.""" + cloud_options: dict[str, Any] | None + """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" file_options: Any @@ -206,9 +208,27 @@ def __post_init__(self) -> None: if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): - raise NotImplementedError( - f"Unhandled scan type: {self.typ}" - ) # pragma: no cover; polars raises on the rust side for now + raise NotImplementedError(f"Unhandled scan type: {self.typ}") + if self.cloud_options is not None and any( + self.cloud_options[k] is not None for k in ("aws", "azure", "gcp") + ): + raise NotImplementedError("Read from cloud storage") + if self.typ == "csv": + if self.reader_options["skip_rows_after_header"] != 0: + raise NotImplementedError("Skipping rows after header in CSV reader") + parse_options = self.reader_options["parse_options"] + if ( + null_values := parse_options["null_values"] + ) is not None and "Named" in null_values: + raise NotImplementedError( + "Per column null value specification not supported for CSV reader" + ) + if ( + comment := parse_options["comment_prefix"] + ) is not None and "Multi" in comment: + raise NotImplementedError( + "Multi-character comment prefix not supported for CSV reader" + ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -216,14 +236,69 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": - opts, cloud_opts = map(json.loads, self.options) - df = DataFrame.from_cudf( - cudf.concat( - [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + dtype_map = { + name: cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()] + 
for name, typ in self.schema.items() + } + parse_options = self.reader_options["parse_options"] + sep = chr(parse_options["separator"]) + quote = chr(parse_options["quote_char"]) + eol = chr(parse_options["eol_char"]) + if self.reader_options["schema"] is not None: + # Reader schema provides names + column_names = list(self.reader_options["schema"]["inner"].keys()) + else: + # file provides column names + column_names = None + usecols = with_columns + header = 0 if self.reader_options["has_header"] else None + + # polars defaults to no null recognition + null_values = [""] + if parse_options["null_values"] is not None: + ((typ, nulls),) = parse_options["null_values"].items() + if typ == "AllColumnsSingle": + # Single value + null_values.append(nulls) + else: + # List of values + null_values.extend(nulls) + if parse_options["comment_prefix"] is not None: + comment = parse_options["comment_prefix"]["Single"] + else: + comment = None + decimal = "," if parse_options["decimal_comma"] else "." + + # polars skips blank lines at the beginning of the file + pieces = [] + for p in self.paths: + skiprows = self.reader_options["skip_rows"] + # TODO: read_csv expands globs which we should not do, + # because polars will already have handled them. 
+ path = Path(p) + with path.open() as f: + while f.readline() == "\n": + skiprows += 1 + pieces.append( + cudf.read_csv( + path, + sep=sep, + quotechar=quote, + lineterminator=eol, + names=column_names, + header=header, + usecols=usecols, + na_filter=True, + na_values=null_values, + keep_default_na=False, + skiprows=skiprows, + comment=comment, + decimal=decimal, + dtype=dtype_map, + ) ) - ) + df = DataFrame.from_cudf(cudf.concat(pieces)) elif self.typ == "parquet": - opts, cloud_opts = map(json.loads, self.options) cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) df = DataFrame.from_cudf(cdf) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b1abaaa0f8c..11092abae59 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,6 +5,7 @@ from __future__ import annotations +import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch from typing import Any @@ -88,10 +89,16 @@ def _( node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type + if typ == "ndjson": + (reader_options,) = map(json.loads, options) + cloud_options = None + else: + reader_options, cloud_options = map(json.loads, options) return ir.Scan( schema, typ, - tuple(options), + reader_options, + cloud_options, node.paths, node.file_options, translate_named_expr(visitor, n=node.predicate) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index af0baacd5a8..2bfd2162300 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -108,3 +108,20 @@ def test_scan_row_index_projected_out(tmp_path): q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a")) assert_gpu_result_equal(q) + + +def 
test_scan_csv_column_renames_projection_schema(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2,\n3,4,5""") + + q = pl.scan_csv( + tmp_path / "test.csv", + with_column_names=lambda names: [f"{n}_suffix" for n in names], + schema_overrides={ + "foo_suffix": pl.String(), + "bar_suffix": pl.Int8(), + "baz_suffix": pl.UInt16(), + }, + ) + + assert_gpu_result_equal(q)