From 6a8bdfbcc9ede73815a8a19a1ce713515d1c1b4e Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Mon, 9 Dec 2024 13:29:55 -0800
Subject: [PATCH 1/2] relax logic in set_object_dtypes_from_pa_schema

---
 .../dask_cudf/dask_cudf/_legacy/io/parquet.py | 15 +++++------
 .../dask_cudf/io/tests/test_parquet.py        | 27 +++++++++++++++++++
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
index c0638e4a1c3..129e83caa02 100644
--- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
@@ -42,7 +42,7 @@ def _create_dd_meta(cls, dataset_info, **kwargs):
         kwargs = dataset_info.get("kwargs", {})
         set_object_dtypes_from_pa_schema(
             meta_cudf,
-            kwargs.get("schema", None),
+            kwargs.get("dataset", {}).get("schema", None),
         )
 
         return meta_cudf
@@ -439,13 +439,12 @@ def set_object_dtypes_from_pa_schema(df, schema):
                 # However, this should be a simple range index that
                 # we can ignore anyway
                 continue
-            typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
-            if (
-                col_name in schema.names
-                and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype))
-                and isinstance(col, cudf.core.column.StringColumn)
-            ):
-                df._data[col_name] = col.astype(typ)
+            elif col_name in schema.names:
+                typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
+                if not isinstance(
+                    typ, (cudf.ListDtype, cudf.StructDtype)
+                ) and isinstance(col, cudf.core.column.StringColumn):
+                    df._data[col_name] = col.astype(typ)
 
 
 def read_parquet(path, columns=None, **kwargs):
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 6efe6c4f388..8e0e9e27cc4 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -492,6 +492,33 @@ def test_create_metadata_file_inconsistent_schema(tmpdir):
     dd.assert_eq(ddf1.compute(), ddf2.compute())
 
 
+def test_read_inconsistent_schema(tmpdir):
+    records = [
+        {"id": 123, "text": "foo"},
+        {
+            "text": "bar",
+            "meta1": [{"field1": "cat"}],
+            "id": 456,
+        },
+    ]
+    columns = ["text", "id"]
+    pd.DataFrame(records[:1]).to_parquet(tmpdir / "part.0.parquet")
+    pd.DataFrame(records[1:]).to_parquet(tmpdir / "part.1.parquet")
+    # Check that cuDF and Dask cuDF match
+    dd.assert_eq(
+        cudf.read_parquet(
+            tmpdir, columns=columns, allow_mismatched_pq_schemas=True
+        ),
+        dask_cudf.read_parquet(tmpdir, columns=columns),
+        check_index=False,
+    )
+    # Check that "pandas" and "cudf" backends match
+    dd.assert_eq(
+        dd.read_parquet(tmpdir, columns=columns),
+        dask_cudf.read_parquet(tmpdir, columns=columns),
+    )
+
+
 @pytest.mark.parametrize(
     "data",
     [

From e2db4d0a2d0b7a1bb51e2ce7d3429ec7790bb64f Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Thu, 12 Dec 2024 08:25:12 -0800
Subject: [PATCH 2/2] remove None block

---
 python/dask_cudf/dask_cudf/_legacy/io/parquet.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
index 129e83caa02..72fbd0cdc9e 100644
--- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
@@ -434,12 +434,7 @@ def set_object_dtypes_from_pa_schema(df, schema):
     # pyarrow schema.
     if schema:
         for col_name, col in df._data.items():
-            if col_name is None:
-                # Pyarrow cannot handle `None` as a field name.
-                # However, this should be a simple range index that
-                # we can ignore anyway
-                continue
-            elif col_name in schema.names:
+            if col_name in schema.names:
                 typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
                 if not isinstance(
                     typ, (cudf.ListDtype, cudf.StructDtype)