From 84e7c4eedfd086ac15857de5055b1049dd08c6db Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:15:51 +0000 Subject: [PATCH 01/21] Handle case of broadcasting empty list of columns --- python/cudf_polars/cudf_polars/dsl/ir.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9b3096becd4..31a0be004ea 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -96,6 +96,8 @@ def broadcast( ``target_length`` is provided and not all columns are length-1 (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``. """ + if len(columns) == 0: + return [] lengths: set[int] = {column.obj.size() for column in columns} if lengths == {1}: if target_length is None: From 27ac28abb7b9bca67f12bfebaec0bc74a891d3b2 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:16:54 +0000 Subject: [PATCH 02/21] Create regex program during StringFunction init This way, we raise early if we can't support the pattern. --- python/cudf_polars/cudf_polars/dsl/expr.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fe859c8d958..b8fb200b8df 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -679,7 +679,7 @@ def do_evaluate( class StringFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options", "children", "regex_program") _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
@@ -716,6 +716,13 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + try: + self.regex_program = plc.strings.regex_program.RegexProgram.create( + self.children[1].value.as_py(), + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except Exception as e: + raise NotImplementedError("Unsupported regex pattern.") from e def do_evaluate( self, @@ -739,11 +746,9 @@ def do_evaluate( ) return Column(plc.strings.find.contains(column.obj, pattern)) assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + return Column( + plc.strings.contains.contains_re(column.obj, self.regex_program) ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children From a4dbd0d158c4db5edd4aa989847c4e325f72d18b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:18:32 +0000 Subject: [PATCH 03/21] Match ordering requirements of polars for left join --- python/cudf_polars/cudf_polars/dsl/ir.py | 61 ++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 31a0be004ea..2fc7820b40b 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -575,6 +575,62 @@ def _joiners( else: assert_never(how) + def _reorder_maps( + self, + left_rows: int, + lg: plc.Column, + left_policy: plc.copying.OutOfBoundsPolicy, + right_rows: int, + rg: plc.Column, + right_policy: plc.copying.OutOfBoundsPolicy, + ) -> tuple[plc.Column, plc.Column]: + """ + Reorder gather maps to satisfy polars join order restrictions. 
+
+        Parameters
+        ----------
+        left_rows
+            Number of rows in left table
+        lg
+            Left gather map
+        left_policy
+            Nullify policy for left map
+        right_rows
+            Number of rows in right table
+        rg
+            Right gather map
+        right_policy
+            Nullify policy for right map
+
+        Returns
+        -------
+        tuple of reordered left and right gather maps.
+
+        Notes
+        -----
+        For a left join, the polars result preserves the order of the
+        left keys, and is stable wrt the right keys. For all other
+        joins, there is no order obligation.
+        """
+        # TODO: size_type
+        dt = plc.interop.to_arrow(plc.DataType(plc.TypeId.INT32))
+        init = plc.interop.from_arrow(pa.scalar(0, type=dt))
+        step = plc.interop.from_arrow(pa.scalar(1, type=dt))
+        left_order = plc.copying.gather(
+            plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy
+        )
+        right_order = plc.copying.gather(
+            plc.Table([plc.filling.sequence(right_rows, init, step)]), rg, right_policy
+        )
+        return tuple(
+            plc.sorting.stable_sort_by_key(
+                plc.Table([lg, rg]),
+                plc.Table([*left_order.columns(), *right_order.columns()]),
+                [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING],
+                [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
+            ).columns()
+        )
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         left = self.left.evaluate(cache=cache)
@@ -617,6 +673,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         lg, rg = join_fn(left_on.table, right_on.table, null_equality)
         if coalesce and how == "inner":
             right = right.discard_columns(right_on.column_names_set)
+        if how == "left":
+            # Order of left table is preserved
+            lg, rg = self._reorder_maps(
+                left.num_rows, lg, left_policy, right.num_rows, rg, right_policy
+            )
         left = DataFrame.from_table(
             plc.copying.gather(left.table, lg, left_policy), left.column_names
         )

From 2a628dac9c8e85036e6ec291e63eedd5ee610a8b Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Mon, 1 Jul 2024 17:19:09 +0000
Subject: [PATCH 04/21] Allow specifying exceptions to catch in execute_with_cudf Default to Exception so we can catch errors from arrow and other third-party libraries. --- python/cudf_polars/cudf_polars/callback.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 979087d5273..764cdd3b3ca 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -34,7 +34,12 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf( + nt: NodeTraverser, + *, + raise_on_fail: bool = False, + exception: type[Exception] | tuple[type[Exception], ...] = Exception, +) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None Should conversion raise an exception rather than continuing without setting a callback. + exception + Optional exception, or tuple of exceptions, to catch during + translation. Defaults to ``Exception``. + The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except NotImplementedError: + except exception: if raise_on_fail: raise From 917b5a75409c9bc466905c320696e08688498a0c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:20:41 +0000 Subject: [PATCH 05/21] Fix bug in documented behaviour of with_columns We were not previously discarding overlapping column names. 
--- python/cudf_polars/cudf_polars/containers/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ec8d00c3123..d86656578d7 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,6 +5,7 @@ from __future__ import annotations +import itertools from functools import cached_property from typing import TYPE_CHECKING, cast @@ -160,7 +161,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns]) + columns = list( + {c.name: c for c in itertools.chain(self.columns, columns)}.values() + ) + return type(self)(columns) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" From 672b356d6fa440e17a848eccee4bf54824ded30b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:21:19 +0000 Subject: [PATCH 06/21] Allow dataframe to have overlapping names Only complain when trying to select columns. This is a bit of a band-aid, should fix closer to the source: intermediate results don't necessarily need names (e.g. join keys). 
--- python/cudf_polars/cudf_polars/containers/dataframe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index d86656578d7..cf7c2ab7fa0 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -173,6 +173,8 @@ def discard_columns(self, names: Set[str]) -> Self: def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" want = set(names) + if len(self.columns) != len(self._column_map): + raise ValueError("Lost some columns with overlapping names") if not want.issubset(self.column_names_set): raise ValueError("Can't select missing names") return type(self)([self._column_map[name] for name in names]) From 9465011165649d2223db11f379e5fa58d4b0d840 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:24:27 +0000 Subject: [PATCH 07/21] Cast count aggs to correct dtype in translation --- python/cudf_polars/cudf_polars/dsl/translate.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a2fdb3c3d79..fc350048082 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -443,12 +443,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Agg( + value = expr.Agg( dtype, node.name, node.options, *(translate_expr(visitor, n=n) for n in node.arguments), ) + if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: + return expr.Cast(value.dtype, value) + return value @_translate_expr.register @@ -475,7 +478,10 @@ def _( @_translate_expr.register def _(node: pl_expr.Len, visitor: 
NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Len(dtype) + value = expr.Len(dtype) + if dtype.id() != plc.TypeId.INT32: + return expr.Cast(dtype, value) + return value def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: From 124dc5c5f6676dfe0f110ad267b66780bb11ce73 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:24:57 +0000 Subject: [PATCH 08/21] Raise on unsupported nested types --- python/cudf_polars/cudf_polars/utils/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 507acb5d33a..da9bb99a6d1 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -167,6 +167,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: return plc.DataType(plc.TypeId.EMPTY) elif isinstance(dtype, pl.List): # TODO: This doesn't consider the value type. + # Recurse to catch unsupported inner types + _ = from_polars(dtype.inner) return plc.DataType(plc.TypeId.LIST) else: raise NotImplementedError(f"{dtype=} conversion not supported") From 72c7883fdc5ef94db2f966ddbac26270bfccaea2 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:25:14 +0000 Subject: [PATCH 09/21] Handle empty column name in conversion to polars --- python/cudf_polars/cudf_polars/containers/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index cf7c2ab7fa0..7270b99e61f 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -51,7 +51,11 @@ def to_polars(self) -> pl.DataFrame: [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], ) - return cast(pl.DataFrame, pl.from_arrow(table)) + result: pl.DataFrame = cast(pl.DataFrame, 
pl.from_arrow(table)) + if list(result.schema.keys()) != self.column_names: + # Empty column name gets translated in from_arrow + return result.rename(dict(zip(result.schema.keys(), self.column_names))) + return result @cached_property def column_names_set(self) -> frozenset[str]: From dc0cb3f82564bc51b1c653987e0f504f13188df1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:26:01 +0000 Subject: [PATCH 10/21] Raise for unsupported cast to/from strings --- python/cudf_polars/cudf_polars/dsl/expr.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index b8fb200b8df..fdb302e73b6 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -960,6 +960,13 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) + if ( + self.dtype.id() == plc.TypeId.STRING + or value.dtype.id() == plc.TypeId.STRING + ): + raise NotImplementedError( + "Need to implement cast to/from string separately." + ) def do_evaluate( self, From 16d7b86b281fbf9e296d7884c0b867a06e250369 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:26:21 +0000 Subject: [PATCH 11/21] WIP: Fix bug in HConcat Shorter tables must be extended with nulls before concatenation. 
--- python/cudf_polars/cudf_polars/dsl/ir.py | 33 ++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 2fc7820b40b..1f531af1673 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1017,6 +1017,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ).slice(self.zlice) +def empty_column( + nrows: int, dtype: plc.DataType, mask_state: plc.MaskState +) -> plc.Column: + if dtype.id() in (plc.TypeId.LIST, plc.TypeId.STRING): + raise NotImplementedError("Empty list/string column") + return plc.column_factories.make_fixed_width_column(dtype, nrows, mask_state) + + @dataclasses.dataclass class HConcat(IR): """Concatenate dataframes horizontally.""" @@ -1027,6 +1035,31 @@ class HConcat(IR): def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] + max_rows = max(df.num_rows for df in dfs) + # Horizontal concatenation extends shorter tables with nulls + dfs = [ + df + if df.num_rows == max_rows + else DataFrame.from_table( + plc.concatenate.concatenate( + [ + df.table, + plc.Table( + [ + empty_column( + max_rows - df.num_rows, + c.obj.type(), + plc.MaskState.ALL_NULL, + ) + for c in df.columns + ] + ), + ] + ), + df.column_names, + ) + for df in dfs + ] return DataFrame( list(itertools.chain.from_iterable(df.columns for df in dfs)), ) From cbc493e8ae29625b75b3afa8c1413035eaaf726e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 1 Jul 2024 17:27:01 +0000 Subject: [PATCH 12/21] Translate BinOp.Add between strings to ConcatHorizontal This immediately raises, but previously we had a runtime error. 
--- python/cudf_polars/cudf_polars/dsl/translate.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index fc350048082..b1abaaa0f8c 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -468,12 +468,13 @@ def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> exp def _( node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType ) -> expr.Expr: - return expr.BinOp( - dtype, - expr.BinOp._MAPPING[node.op], - translate_expr(visitor, n=node.left), - translate_expr(visitor, n=node.right), - ) + left = translate_expr(visitor, n=node.left) + right = translate_expr(visitor, n=node.right) + if dtype.id() == plc.TypeId.STRING and node.op == pl_expr.Operator.PLUS: + return expr.StringFunction( + dtype, pl_expr.StringFunction.ConcatHorizontal, (), left, right + ) + return expr.BinOp(dtype, expr.BinOp._MAPPING[node.op], left, right) @_translate_expr.register From ffb000839b98afcc8c3e86a619c646d4fb84a611 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 2 Jul 2024 11:29:56 +0000 Subject: [PATCH 13/21] Only produce row_index if the schema demands it in Scan --- python/cudf_polars/cudf_polars/dsl/ir.py | 32 +++++++++++++----------- python/cudf_polars/tests/test_scan.py | 11 ++++++++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1f531af1673..60fe3252700 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -233,21 +233,23 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index - dtype = self.schema[name] - step = plc.interop.from_arrow( - pa.scalar(1, 
type=plc.interop.to_arrow(dtype)) - ) - init = plc.interop.from_arrow( - pa.scalar(offset, type=plc.interop.to_arrow(dtype)) - ) - index = NamedColumn( - plc.filling.sequence(df.num_rows, init, step), - name, - is_sorted=plc.types.Sorted.YES, - order=plc.types.Order.ASCENDING, - null_order=plc.types.NullOrder.AFTER, - ) - df = DataFrame([index, *df.columns]) + # Only make row index if the schema demands it + if name in self.schema: + dtype = self.schema[name] + step = plc.interop.from_arrow( + pa.scalar(1, type=plc.interop.to_arrow(dtype)) + ) + init = plc.interop.from_arrow( + pa.scalar(offset, type=plc.interop.to_arrow(dtype)) + ) + index = NamedColumn( + plc.filling.sequence(df.num_rows, init, step), + name, + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.NullOrder.AFTER, + ) + df = DataFrame([index, *df.columns]) # TODO: should be true, but not the case until we get # cudf-classic out of the loop for IO since it converts date32 # to datetime. 
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index f129cc7ca32..af0baacd5a8 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -97,3 +97,14 @@ def test_scan_unsupported_raises(tmp_path): df.write_ndjson(tmp_path / "df.json") q = pl.scan_ndjson(tmp_path / "df.json") assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/pull/17363") +def test_scan_row_index_projected_out(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.pq") + + q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a")) + + assert_gpu_result_equal(q) From a4ee0803552b3154efe207170e3d535cb04d0d25 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 13:38:03 +0000 Subject: [PATCH 14/21] Expose type traits in pylibcudf --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 4 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 + .../pylibcudf/libcudf/utilities/traits.pxd | 27 +++ python/cudf/cudf/_lib/pylibcudf/traits.pxd | 25 +++ python/cudf/cudf/_lib/pylibcudf/traits.pyx | 187 ++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/common/utils.py | 39 ---- .../cudf/cudf/pylibcudf_tests/test_copying.py | 47 +++-- .../cudf/cudf/pylibcudf_tests/test_traits.py | 110 +++++++++++ 9 files changed, 386 insertions(+), 58 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/traits.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/traits.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_traits.py diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0a198f431a7..c0146f3ff50 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt 
@@ -39,6 +39,7 @@ set(cython_sources sorting.pyx table.pyx types.pyx + traits.pyx unary.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5131df9a5cd..2376b0b0582 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -23,6 +23,7 @@ from . cimport ( sorting, stream_compaction, strings, + traits, types, unary, ) @@ -37,6 +38,7 @@ from .types cimport DataType, type_id __all__ = [ "Column", "DataType", + "MaskState", "Scalar", "Table", "aggregation", @@ -54,6 +56,7 @@ __all__ = [ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", @@ -61,5 +64,6 @@ __all__ = [ "strings", "sorting", "types", + "traits", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 43a9e2aca31..8610bf175d0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -23,6 +23,7 @@ sorting, stream_compaction, strings, + traits, types, unary, ) @@ -35,6 +36,7 @@ __all__ = [ "Column", "DataType", + "MaskState", "Scalar", "Table", "TypeId", @@ -53,6 +55,7 @@ "merge", "quantiles", "reduce", + "reshape", "replace", "rolling", "round", @@ -60,6 +63,7 @@ "stream_compaction", "strings", "sorting", + "traits", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd new file mode 100644 index 00000000000..0cc58af735b --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport data_type + + +cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: + cdef bool is_relationally_comparable(data_type) + cdef bool is_equality_comparable(data_type) + cdef bool is_numeric(data_type) + cdef bool is_index_type(data_type) + cdef bool is_unsigned(data_type) + cdef bool is_integral(data_type) + cdef bool is_integral_not_bool(data_type) + cdef bool is_floating_point(data_type) + cdef bool is_boolean(data_type) + cdef bool is_timestamp(data_type) + cdef bool is_fixed_point(data_type) + cdef bool is_duration(data_type) + cdef bool is_chrono(data_type) + cdef bool is_dictionary(data_type) + cdef bool is_fixed_width(data_type) + cdef bool is_compound(data_type) + cdef bool is_nested(data_type) + cdef bool is_bit_castable(data_type, data_type) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd new file mode 100644 index 00000000000..668fa775202 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ) +cpdef bool is_equality_comparable(DataType typ) +cpdef bool is_numeric(DataType typ) +cpdef bool is_index_type(DataType typ) +cpdef bool is_unsigned(DataType typ) +cpdef bool is_integral(DataType typ) +cpdef bool is_integral_not_bool(DataType typ) +cpdef bool is_floating_point(DataType typ) +cpdef bool is_boolean(DataType typ) +cpdef bool is_timestamp(DataType typ) +cpdef bool is_fixed_point(DataType typ) +cpdef bool is_duration(DataType typ) +cpdef bool is_chrono(DataType typ) +cpdef bool is_dictionary(DataType typ) +cpdef bool is_fixed_width(DataType typ) +cpdef bool is_compound(DataType typ) +cpdef bool is_nested(DataType typ) +cpdef bool is_bit_castable(DataType source, DataType target) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx new file mode 100644 index 00000000000..fdd8f1e5057 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -0,0 +1,187 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from cudf._lib.pylibcudf.libcudf.utilities cimport traits + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ): + """Checks if the given data type supports relational comparisons. + + See Also + -------- + :cpp:func:`is_relationally_comparable` + """ + return traits.is_relationally_comparable(typ.c_obj) + + +cpdef bool is_equality_comparable(DataType typ): + """Checks if the given data type supports equality comparisons. + + See Also + -------- + :cpp:func:`is_equality_comparable` + """ + return traits.is_equality_comparable(typ.c_obj) + + +cpdef bool is_numeric(DataType typ): + """Checks if the given data type is numeric. + + See Also + -------- + :cpp:func:`is_numeric` + """ + return traits.is_numeric(typ.c_obj) + + +cpdef bool is_index_type(DataType typ): + """Checks if the given data type is an index type. 
+ + See Also + -------- + :cpp:func:`is_index_type` + """ + return traits.is_index_type(typ.c_obj) + + +cpdef bool is_unsigned(DataType typ): + """Checks if the given data type is an unsigned type. + + See Also + -------- + :cpp:func:`is_unsigned` + """ + return traits.is_unsigned(typ.c_obj) + + +cpdef bool is_integral(DataType typ): + """Checks if the given data type is an integral type. + + See Also + -------- + :cpp:func:`is_integral` + """ + return traits.is_integral(typ.c_obj) + + +cpdef bool is_integral_not_bool(DataType typ): + """Checks if the given data type is an integral type excluding booleans. + + See Also + -------- + :cpp:func:`is_integral_not_bool` + """ + return traits.is_integral_not_bool(typ.c_obj) + + +cpdef bool is_floating_point(DataType typ): + """Checks if the given data type is a floating point type. + + See Also + -------- + :cpp:func:`is_floating_point` + """ + return traits.is_floating_point(typ.c_obj) + + +cpdef bool is_boolean(DataType typ): + """Checks if the given data type is a boolean type. + + See Also + -------- + :cpp:func:`is_boolean` + """ + return traits.is_boolean(typ.c_obj) + + +cpdef bool is_timestamp(DataType typ): + """Checks if the given data type is a timestamp type. + + See Also + -------- + :cpp:func:`is_timestamp` + """ + return traits.is_timestamp(typ.c_obj) + + +cpdef bool is_fixed_point(DataType typ): + """Checks if the given data type is a fixed point type. + + See Also + -------- + :cpp:func:`is_fixed_point` + """ + return traits.is_fixed_point(typ.c_obj) + + +cpdef bool is_duration(DataType typ): + """Checks if the given data type is a duration type. + + See Also + -------- + :cpp:func:`is_duration` + """ + return traits.is_duration(typ.c_obj) + + +cpdef bool is_chrono(DataType typ): + """Checks if the given data type is a chrono type. 
+ + See Also + -------- + :cpp:func:`is_chrono` + """ + return traits.is_chrono(typ.c_obj) + + +cpdef bool is_dictionary(DataType typ): + """Checks if the given data type is a dictionary type. + + See Also + -------- + :cpp:func:`is_dictionary` + """ + return traits.is_dictionary(typ.c_obj) + + +cpdef bool is_fixed_width(DataType typ): + """Checks if the given data type is a fixed width type. + + See Also + -------- + :cpp:func:`is_fixed_width` + """ + return traits.is_fixed_width(typ.c_obj) + + +cpdef bool is_compound(DataType typ): + """Checks if the given data type is a compound type. + + See Also + -------- + :cpp:func:`is_compound` + """ + return traits.is_compound(typ.c_obj) + + +cpdef bool is_nested(DataType typ): + """Checks if the given data type is a nested type. + + See Also + -------- + :cpp:func:`is_nested` + """ + return traits.is_nested(typ.c_obj) + + +cpdef bool is_bit_castable(DataType source, DataType target): + """Checks if the source type is bit-castable to the target type. + + See Also + -------- + :cpp:func:`is_bit_castable` + """ + return traits.is_bit_castable(source.c_obj, target.c_obj) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index f8bfe340ae5..d41e6c720bf 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -102,49 +102,10 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs): return pytest.raises(expected_exception, *args, **kwargs) -# TODO: Consider moving these type utilities into pylibcudf.types itself. 
-def is_signed_integer(plc_dtype: plc.DataType): - return ( - plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value - ) - - -def is_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.INT8, - plc.TypeId.INT16, - plc.TypeId.INT32, - plc.TypeId.INT64, - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - - -def is_floating(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.FLOAT32, - plc.TypeId.FLOAT64, - ) - - -def is_boolean(plc_dtype: plc.DataType): - return plc_dtype.id() == plc.TypeId.BOOL8 - - def is_string(plc_dtype: plc.DataType): return plc_dtype.id() == plc.TypeId.STRING -def is_fixed_width(plc_dtype: plc.DataType): - return ( - is_integer(plc_dtype) - or is_floating(plc_dtype) - or is_boolean(plc_dtype) - ) - - def nesting_level(typ) -> tuple[int, int]: """Return list and struct nesting of a pyarrow type.""" if isinstance(typ, pa.ListType): diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index 0a6df198d46..f27fe4e942e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -9,9 +9,6 @@ assert_column_eq, assert_table_eq, cudf_raises, - is_fixed_width, - is_floating, - is_integer, is_nested_list, is_nested_struct, is_string, @@ -359,9 +356,9 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): pa_array = pa.array([True] * plc_source_table.num_rows()) else: pa_array = pa.array([1] * plc_source_table.num_rows()) @@ -428,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table): _, plc_index_column = index_column _, 
plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))] @@ -458,7 +455,7 @@ def test_empty_like_table(source_table): @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): _, plc_input_column = input_column - if is_fixed_width(plc_input_column.type()): + if plc.traits.is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, @@ -484,7 +481,7 @@ def test_copy_range_in_place( pa_target_column, _ = target_column - if not is_fixed_width(mutable_target_column.type()): + if not plc.traits.is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( plc_input_column, @@ -516,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds( ): _, plc_input_column = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( plc_input_column, @@ -528,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds( def test_copy_range_in_place_different_types(mutable_target_column): - if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := mutable_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -548,7 +547,7 @@ def test_copy_range_in_place_null_mismatch( ): pa_input_column, _ = input_column - if is_fixed_width(mutable_target_column.type()): + if 
plc.traits.is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), pa_input_column, @@ -568,7 +567,9 @@ def test_copy_range_in_place_null_mismatch( def test_copy_range(input_column, target_column): pa_input_column, plc_input_column = input_column pa_target_column, plc_target_column = target_column - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.copy_range( plc_input_column, plc_target_column, @@ -610,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column): def test_copy_range_different_types(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -629,7 +632,9 @@ def test_shift(target_column, source_scalar): pa_source_scalar, plc_source_scalar = source_scalar pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] @@ -642,7 +647,9 @@ def test_shift(target_column, source_scalar): def test_shift_type_mismatch(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): fill_value = 
plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) @@ -747,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): def test_copy_if_else_wrong_type(target_column, mask): _, plc_target_column = target_column _, plc_mask = mask - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow( pa.array(["a"] * plc_target_column.size()) ) @@ -951,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): _, plc_target_table = target_table _, plc_mask = mask - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py new file mode 100644 index 00000000000..6c22cb02f21 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cudf._lib import pylibcudf as plc + + +def test_is_relationally_comparable(): + assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_relationally_comparable( + plc.DataType(plc.TypeId.LIST) + ) + + +def test_is_equality_comparable(): + assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST)) + + +def test_is_numeric(): + assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) + + +def test_is_index_type(): + assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_unsigned(): + assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8)) + assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8)) + + +def test_is_integral(): + assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32)) + + +def test_is_integral_not_bool(): + assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_floating_point(): + assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_boolean(): + assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_timestamp(): + assert plc.traits.is_timestamp( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_timestamp( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + + +def test_is_fixed_point(): + assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128)) + assert not 
plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32)) + + +def test_is_duration(): + assert plc.traits.is_duration( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + assert not plc.traits.is_duration( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + + +def test_is_chrono(): + assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS)) + assert plc.traits.is_chrono( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_dictionary(): + assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32)) + assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_fixed_width(): + assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING)) + + +def test_is_compound(): + assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_nested(): + assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING)) + + +def test_is_bit_castable(): + assert plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8) + ) + assert not plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16) + ) From 4fb4ea9cd1d0d49bc457f903d320a44147f01d47 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 13:43:13 +0000 Subject: [PATCH 15/21] Use new pylibcudf type traits in polars interpreter --- python/cudf_polars/cudf_polars/dsl/expr.py | 3 ++- python/cudf_polars/cudf_polars/utils/dtypes.py | 13 ------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fe859c8d958..b8c3e312ac8 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py 
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1182,7 +1182,8 @@ def __init__( self.children = (left, right) if ( op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES)) + and plc.traits.is_chrono(left.dtype) + and plc.traits.is_chrono(right.dtype) and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) ): raise NotImplementedError("Casting rules for timelike types") diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 507acb5d33a..918cd024fa2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -17,19 +17,6 @@ __all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] -TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset( - [ - plc.TypeId.TIMESTAMP_MILLISECONDS, - plc.TypeId.TIMESTAMP_MICROSECONDS, - plc.TypeId.TIMESTAMP_NANOSECONDS, - plc.TypeId.TIMESTAMP_DAYS, - plc.TypeId.DURATION_MILLISECONDS, - plc.TypeId.DURATION_MICROSECONDS, - plc.TypeId.DURATION_NANOSECONDS, - ] -) - - def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): """ Do two datetime typeids have matching resolution for a binop. 
From 460439b808753b498b0870e8ab6bad2a200f9ac0 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 14:32:21 +0000 Subject: [PATCH 16/21] Alphabetise --- python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index c0146f3ff50..d22096081af 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -38,8 +38,8 @@ set(cython_sources stream_compaction.pyx sorting.pyx table.pyx - types.pyx traits.pyx + types.pyx unary.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 2376b0b0582..61517aa737f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -63,7 +63,7 @@ __all__ = [ "stream_compaction", "strings", "sorting", - "types", "traits", + "types", "unary", ] From afd59416e8748bae48d3e7040f96ffec728d31e7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 14:33:25 +0000 Subject: [PATCH 17/21] MaskState not in all --- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 61517aa737f..d4d615cde34 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -38,7 +38,6 @@ from .types cimport DataType, type_id __all__ = [ "Column", "DataType", - "MaskState", "Scalar", "Table", "aggregation", From 23f93cb280389d267713858338959c5a7ef5c344 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 4 Jul 2024 14:34:32 +0000 Subject: [PATCH 18/21] So bad at the alphabet --- python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 8610bf175d0..91f8acaf682 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -55,8 +55,8 @@ "merge", "quantiles", "reduce", - "reshape", "replace", + "reshape", "rolling", "round", "search", From 53ca18ca2aef774a315a2ebde267d4ab1ff6bba7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Jul 2024 09:55:14 +0000 Subject: [PATCH 19/21] Link into docs and fix sphinx errors Impossible to use See Also sections with cross-linking to C++ function names. --- .../user_guide/api_docs/pylibcudf/index.rst | 7 +- .../user_guide/api_docs/pylibcudf/traits.rst | 6 ++ python/cudf/cudf/_lib/pylibcudf/traits.pyx | 72 +++++-------------- 3 files changed, 28 insertions(+), 57 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e9dad705cbf..bd6f0f77357 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -18,22 +18,22 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby - io/index.rst interop join lists merge quantiles reduce + replace reshape rolling round scalar search - stream_compaction sorting - replace + stream_compaction table + traits types unary @@ -41,4 +41,5 @@ This page provides API documentation for pylibcudf. :maxdepth: 2 :caption: Subpackages + io/index.rst strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst new file mode 100644 index 00000000000..294ca8dc78c --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst @@ -0,0 +1,6 @@ +====== +traits +====== + +.. 
automodule:: cudf._lib.pylibcudf.traits + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx index fdd8f1e5057..d2370f8d641 100644 --- a/python/cudf/cudf/_lib/pylibcudf/traits.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -10,9 +10,7 @@ from .types cimport DataType cpdef bool is_relationally_comparable(DataType typ): """Checks if the given data type supports relational comparisons. - See Also - -------- - :cpp:func:`is_relationally_comparable` + For details, see :cpp:func:`is_relationally_comparable`. """ return traits.is_relationally_comparable(typ.c_obj) @@ -20,9 +18,7 @@ cpdef bool is_relationally_comparable(DataType typ): cpdef bool is_equality_comparable(DataType typ): """Checks if the given data type supports equality comparisons. - See Also - -------- - :cpp:func:`is_equality_comparable` + For details, see :cpp:func:`is_equality_comparable`. """ return traits.is_equality_comparable(typ.c_obj) @@ -30,9 +26,7 @@ cpdef bool is_equality_comparable(DataType typ): cpdef bool is_numeric(DataType typ): """Checks if the given data type is numeric. - See Also - -------- - :cpp:func:`is_numeric` + For details, see :cpp:func:`is_numeric`. """ return traits.is_numeric(typ.c_obj) @@ -40,9 +34,7 @@ cpdef bool is_numeric(DataType typ): cpdef bool is_index_type(DataType typ): """Checks if the given data type is an index type. - See Also - -------- - :cpp:func:`is_index_type` + For details, see :cpp:func:`is_index_type`. """ return traits.is_index_type(typ.c_obj) @@ -50,9 +42,7 @@ cpdef bool is_index_type(DataType typ): cpdef bool is_unsigned(DataType typ): """Checks if the given data type is an unsigned type. - See Also - -------- - :cpp:func:`is_unsigned` + For details, see :cpp:func:`is_unsigned`. """ return traits.is_unsigned(typ.c_obj) @@ -60,9 +50,7 @@ cpdef bool is_unsigned(DataType typ): cpdef bool is_integral(DataType typ): """Checks if the given data type is an integral type. 
- See Also - -------- - :cpp:func:`is_integral` + For details, see :cpp:func:`is_integral`. """ return traits.is_integral(typ.c_obj) @@ -70,9 +58,7 @@ cpdef bool is_integral(DataType typ): cpdef bool is_integral_not_bool(DataType typ): """Checks if the given data type is an integral type excluding booleans. - See Also - -------- - :cpp:func:`is_integral_not_bool` + For details, see :cpp:func:`is_integral_not_bool`. """ return traits.is_integral_not_bool(typ.c_obj) @@ -80,9 +66,7 @@ cpdef bool is_integral_not_bool(DataType typ): cpdef bool is_floating_point(DataType typ): """Checks if the given data type is a floating point type. - See Also - -------- - :cpp:func:`is_floating_point` + For details, see :cpp:func:`is_floating_point`. """ return traits.is_floating_point(typ.c_obj) @@ -90,9 +74,7 @@ cpdef bool is_floating_point(DataType typ): cpdef bool is_boolean(DataType typ): """Checks if the given data type is a boolean type. - See Also - -------- - :cpp:func:`is_boolean` + For details, see :cpp:func:`is_boolean`. """ return traits.is_boolean(typ.c_obj) @@ -100,9 +82,7 @@ cpdef bool is_boolean(DataType typ): cpdef bool is_timestamp(DataType typ): """Checks if the given data type is a timestamp type. - See Also - -------- - :cpp:func:`is_timestamp` + For details, see :cpp:func:`is_timestamp`. """ return traits.is_timestamp(typ.c_obj) @@ -110,9 +90,7 @@ cpdef bool is_timestamp(DataType typ): cpdef bool is_fixed_point(DataType typ): """Checks if the given data type is a fixed point type. - See Also - -------- - :cpp:func:`is_fixed_point` + For details, see :cpp:func:`is_fixed_point`. """ return traits.is_fixed_point(typ.c_obj) @@ -120,9 +98,7 @@ cpdef bool is_fixed_point(DataType typ): cpdef bool is_duration(DataType typ): """Checks if the given data type is a duration type. - See Also - -------- - :cpp:func:`is_duration` + For details, see :cpp:func:`is_duration`. 
""" return traits.is_duration(typ.c_obj) @@ -130,9 +106,7 @@ cpdef bool is_duration(DataType typ): cpdef bool is_chrono(DataType typ): """Checks if the given data type is a chrono type. - See Also - -------- - :cpp:func:`is_chrono` + For details, see :cpp:func:`is_chrono`. """ return traits.is_chrono(typ.c_obj) @@ -140,9 +114,7 @@ cpdef bool is_chrono(DataType typ): cpdef bool is_dictionary(DataType typ): """Checks if the given data type is a dictionary type. - See Also - -------- - :cpp:func:`is_dictionary` + For details, see :cpp:func:`is_dictionary`. """ return traits.is_dictionary(typ.c_obj) @@ -150,9 +122,7 @@ cpdef bool is_dictionary(DataType typ): cpdef bool is_fixed_width(DataType typ): """Checks if the given data type is a fixed width type. - See Also - -------- - :cpp:func:`is_fixed_width` + For details, see :cpp:func:`is_fixed_width`. """ return traits.is_fixed_width(typ.c_obj) @@ -160,9 +130,7 @@ cpdef bool is_fixed_width(DataType typ): cpdef bool is_compound(DataType typ): """Checks if the given data type is a compound type. - See Also - -------- - :cpp:func:`is_compound` + For details, see :cpp:func:`is_compound`. """ return traits.is_compound(typ.c_obj) @@ -170,9 +138,7 @@ cpdef bool is_compound(DataType typ): cpdef bool is_nested(DataType typ): """Checks if the given data type is a nested type. - See Also - -------- - :cpp:func:`is_nested` + For details, see :cpp:func:`is_nested`. """ return traits.is_nested(typ.c_obj) @@ -180,8 +146,6 @@ cpdef bool is_nested(DataType typ): cpdef bool is_bit_castable(DataType source, DataType target): """Checks if the source type is bit-castable to the target type. - See Also - -------- - :cpp:func:`is_bit_castable` + For details, see :cpp:func:`is_bit_castable`. 
""" return traits.is_bit_castable(source.c_obj, target.c_obj) From 964ff7272a6e2e7899743906f706fb65d8131795 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Jul 2024 17:26:51 +0000 Subject: [PATCH 20/21] Avoid out of bounds access in read_csv If one passes an empty list for the names, things should work. --- python/cudf/cudf/_lib/csv.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index c706351a683..9fecff5f5f6 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -450,7 +450,7 @@ def read_csv( col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) - if names is not None and isinstance(names[0], (int)): + if names is not None and len(names) and isinstance(names[0], (int)): df.columns = [int(x) for x in df._data] # Set index if the index_col parameter is passed From 79a260502691d6733145957cadf4e5784ce75532 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 5 Jul 2024 17:33:05 +0000 Subject: [PATCH 21/21] WIP: Handling csv reader options Mostly there, just need to flesh out tests for coverage. --- python/cudf_polars/cudf_polars/dsl/ir.py | 99 ++++++++++++++++--- .../cudf_polars/cudf_polars/dsl/translate.py | 9 +- python/cudf_polars/tests/test_scan.py | 17 ++++ 3 files changed, 112 insertions(+), 13 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 60fe3252700..056f3c76f78 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,9 +15,9 @@ import dataclasses import itertools -import json import types from functools import cache +from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, ClassVar import pyarrow as pa @@ -185,8 +185,10 @@ class Scan(IR): typ: str """What type of file are we reading? Parquet, CSV, etc...""" - options: tuple[Any, ...] 
- """Type specific options, as json-encoded strings.""" + reader_options: dict[str, Any] + """Reader-specific options, as dictionary.""" + cloud_options: dict[str, Any] | None + """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" file_options: Any @@ -206,9 +208,27 @@ def __post_init__(self) -> None: if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): - raise NotImplementedError( - f"Unhandled scan type: {self.typ}" - ) # pragma: no cover; polars raises on the rust side for now + raise NotImplementedError(f"Unhandled scan type: {self.typ}") + if self.cloud_options is not None and any( + self.cloud_options[k] is not None for k in ("aws", "azure", "gcp") + ): + raise NotImplementedError("Read from cloud storage") + if self.typ == "csv": + if self.reader_options["skip_rows_after_header"] != 0: + raise NotImplementedError("Skipping rows after header in CSV reader") + parse_options = self.reader_options["parse_options"] + if ( + null_values := parse_options["null_values"] + ) is not None and "Named" in null_values: + raise NotImplementedError( + "Per column null value specification not supported for CSV reader" + ) + if ( + comment := parse_options["comment_prefix"] + ) is not None and "Multi" in comment: + raise NotImplementedError( + "Multi-character comment prefix not supported for CSV reader" + ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -216,14 +236,69 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": - opts, cloud_opts = map(json.loads, self.options) - df = DataFrame.from_cudf( - cudf.concat( - [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + dtype_map = { + name: cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()] + 
for name, typ in self.schema.items() + } + parse_options = self.reader_options["parse_options"] + sep = chr(parse_options["separator"]) + quote = chr(parse_options["quote_char"]) + eol = chr(parse_options["eol_char"]) + if self.reader_options["schema"] is not None: + # Reader schema provides names + column_names = list(self.reader_options["schema"]["inner"].keys()) + else: + # file provides column names + column_names = None + usecols = with_columns + header = 0 if self.reader_options["has_header"] else None + + # polars defaults to no null recognition + null_values = [""] + if parse_options["null_values"] is not None: + ((typ, nulls),) = parse_options["null_values"].items() + if typ == "AllColumnsSingle": + # Single value + null_values.append(nulls) + else: + # List of values + null_values.extend(nulls) + if parse_options["comment_prefix"] is not None: + comment = parse_options["comment_prefix"]["Single"] + else: + comment = None + decimal = "," if parse_options["decimal_comma"] else "." + + # polars skips blank lines at the beginning of the file + pieces = [] + for p in self.paths: + skiprows = self.reader_options["skip_rows"] + # TODO: read_csv expands globs which we should not do, + # because polars will already have handled them. 
+ path = Path(p) + with path.open() as f: + while f.readline() == "\n": + skiprows += 1 + pieces.append( + cudf.read_csv( + path, + sep=sep, + quotechar=quote, + lineterminator=eol, + names=column_names, + header=header, + usecols=usecols, + na_filter=True, + na_values=null_values, + keep_default_na=False, + skiprows=skiprows, + comment=comment, + decimal=decimal, + dtype=dtype_map, + ) ) - ) + df = DataFrame.from_cudf(cudf.concat(pieces)) elif self.typ == "parquet": - opts, cloud_opts = map(json.loads, self.options) cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) df = DataFrame.from_cudf(cdf) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b1abaaa0f8c..11092abae59 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,6 +5,7 @@ from __future__ import annotations +import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch from typing import Any @@ -88,10 +89,16 @@ def _( node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type + if typ == "ndjson": + (reader_options,) = map(json.loads, options) + cloud_options = None + else: + reader_options, cloud_options = map(json.loads, options) return ir.Scan( schema, typ, - tuple(options), + reader_options, + cloud_options, node.paths, node.file_options, translate_named_expr(visitor, n=node.predicate) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index af0baacd5a8..2bfd2162300 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -108,3 +108,20 @@ def test_scan_row_index_projected_out(tmp_path): q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a")) assert_gpu_result_equal(q) + + +def 
test_scan_csv_column_renames_projection_schema(tmp_path): + with (tmp_path / "test.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2,\n3,4,5""") + + q = pl.scan_csv( + tmp_path / "test.csv", + with_column_names=lambda names: [f"{n}_suffix" for n in names], + schema_overrides={ + "foo_suffix": pl.String(), + "bar_suffix": pl.Int8(), + "baz_suffix": pl.UInt16(), + }, + ) + + assert_gpu_result_equal(q)