rapidsai · rjzamora · Dec 9, 2024 · Dec 9, 2024 · Dec 10, 2024 · Dec 11, 2024
@@ -42,7 +42,7 @@ def _create_dd_meta(cls, dataset_info, **kwargs):
         kwargs = dataset_info.get("kwargs", {})
         set_object_dtypes_from_pa_schema(
             meta_cudf,
-            kwargs.get("schema", None),
+            kwargs.get("dataset", {}).get("schema", None),
         )
 
         return meta_cudf
@@ -434,18 +434,12 @@ def set_object_dtypes_from_pa_schema(df, schema):
     # pyarrow schema.
     if schema:
         for col_name, col in df._data.items():
-            if col_name is None:
-                # Pyarrow cannot handle `None` as a field name.
-                # However, this should be a simple range index that
-                # we can ignore anyway
-                continue
-            typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
-            if (
-                col_name in schema.names
-                and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype))
-                and isinstance(col, cudf.core.column.StringColumn)
-            ):
-                df._data[col_name] = col.astype(typ)
+            if col_name in schema.names:
+                typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
+                if not isinstance(
+                    typ, (cudf.ListDtype, cudf.StructDtype)
+                ) and isinstance(col, cudf.core.column.StringColumn):
+                    df._data[col_name] = col.astype(typ)
 
 
 def read_parquet(path, columns=None, **kwargs):

@@ -492,6 +492,33 @@ def test_create_metadata_file_inconsistent_schema(tmpdir):
     dd.assert_eq(ddf1.compute(), ddf2.compute())
 
 
+def test_read_inconsistent_schema(tmpdir):
+    records = [
+        {"id": 123, "text": "foo"},
+        {
+            "text": "bar",
+            "meta1": [{"field1": "cat"}],
+            "id": 456,
+        },
+    ]
+    columns = ["text", "id"]
+    pd.DataFrame(records[:1]).to_parquet(tmpdir / "part.0.parquet")
+    pd.DataFrame(records[1:]).to_parquet(tmpdir / "part.1.parquet")
+    # Check that cuDF and Dask cuDF match
+    dd.assert_eq(
+        cudf.read_parquet(
+            tmpdir, columns=columns, allow_mismatched_pq_schemas=True
+        ),
+        dask_cudf.read_parquet(tmpdir, columns=columns),
+        check_index=False,
+    )
+    # Check that "pandas" and "cudf" backends match
+    dd.assert_eq(
+        dd.read_parquet(tmpdir, columns=columns),
+        dask_cudf.read_parquet(tmpdir, columns=columns),
+    )
+
+
 @pytest.mark.parametrize(
     "data",
     [