From 6a8bdfbcc9ede73815a8a19a1ce713515d1c1b4e Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 9 Dec 2024 13:29:55 -0800 Subject: [PATCH 1/2] relax logic in set_object_dtypes_from_pa_schema --- .../dask_cudf/dask_cudf/_legacy/io/parquet.py | 15 +++++------ .../dask_cudf/io/tests/test_parquet.py | 27 +++++++++++++++++++ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0638e4a1c3..129e83caa02 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -42,7 +42,7 @@ def _create_dd_meta(cls, dataset_info, **kwargs): kwargs = dataset_info.get("kwargs", {}) set_object_dtypes_from_pa_schema( meta_cudf, - kwargs.get("schema", None), + kwargs.get("dataset", {}).get("schema", None), ) return meta_cudf @@ -439,13 +439,12 @@ def set_object_dtypes_from_pa_schema(df, schema): # However, this should be a simple range index that # we can ignore anyway continue - typ = cudf_dtype_from_pa_type(schema.field(col_name).type) - if ( - col_name in schema.names - and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) - and isinstance(col, cudf.core.column.StringColumn) - ): - df._data[col_name] = col.astype(typ) + elif col_name in schema.names: + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) + if not isinstance( + typ, (cudf.ListDtype, cudf.StructDtype) + ) and isinstance(col, cudf.core.column.StringColumn): + df._data[col_name] = col.astype(typ) def read_parquet(path, columns=None, **kwargs): diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6efe6c4f388..8e0e9e27cc4 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -492,6 +492,33 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): dd.assert_eq(ddf1.compute(), ddf2.compute()) +def test_read_inconsistent_schema(tmpdir): + records = [ + {"id": 123, "text": "foo"}, + { + "text": "bar", + "meta1": [{"field1": "cat"}], + "id": 456, + }, + ] + columns = ["text", "id"] + pd.DataFrame(records[:1]).to_parquet(tmpdir / "part.0.parquet") + pd.DataFrame(records[1:]).to_parquet(tmpdir / "part.1.parquet") + # Check that cuDF and Dask cuDF match + dd.assert_eq( + cudf.read_parquet( + tmpdir, columns=columns, allow_mismatched_pq_schemas=True + ), + dask_cudf.read_parquet(tmpdir, columns=columns), + check_index=False, + ) + # Check that "pandas" and "cudf" backends match + dd.assert_eq( + dd.read_parquet(tmpdir, columns=columns), + dask_cudf.read_parquet(tmpdir, columns=columns), + ) + + @pytest.mark.parametrize( "data", [ From e2db4d0a2d0b7a1bb51e2ce7d3429ec7790bb64f Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 12 Dec 2024 08:25:12 -0800 Subject: [PATCH 2/2] remove None block --- python/dask_cudf/dask_cudf/_legacy/io/parquet.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index 129e83caa02..72fbd0cdc9e 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -434,12 +434,7 @@ def set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. if schema: for col_name, col in df._data.items(): - if col_name is None: - # Pyarrow cannot handle `None` as a field name. - # However, this should be a simple range index that - # we can ignore anyway - continue - elif col_name in schema.names: + if col_name in schema.names: typ = cudf_dtype_from_pa_type(schema.field(col_name).type) if not isinstance( typ, (cudf.ListDtype, cudf.StructDtype)