diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md
index ba04a231f41..441bc72205a 100644
--- a/docs/cudf/source/user_guide/pandas-comparison.md
+++ b/docs/cudf/source/user_guide/pandas-comparison.md
@@ -139,6 +139,27 @@ module, which allow you to compare values up to a desired precision.
 Unlike Pandas, cuDF does not support duplicate column names.
 It is best to use unique strings for column names.
 
+## Writing a DataFrame to Parquet with non-string column names
+
+When a DataFrame has non-string column names, pandas casts each column
+name to `str` before writing to a Parquet file. By default, `cudf` raises
+an error instead. To match the pandas behavior, enable the
+`mode.pandas_compatible` option, which makes `cudf` cast the column names
+to `str` just like pandas.
+
+```python
+>>> import cudf
+>>> df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]})
+>>> df.to_parquet("df.parquet")
+
+Traceback (most recent call last):
+ValueError: Writing a Parquet file requires string column names
+>>> cudf.set_option("mode.pandas_compatible", True)
+>>> df.to_parquet("df.parquet")
+
+UserWarning: The DataFrame has column names of non-string type. They will be converted to strings on write.
+```
+
 ## No true `"object"` data type
 
 In Pandas and NumPy, the `"object"` data type is used for
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 923f5c4089f..5519bbd4cd5 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -361,9 +361,12 @@ def write_parquet(
 
     for i, name in enumerate(table._column_names, num_index_cols_meta):
         if not isinstance(name, str):
-            raise ValueError("parquet must have string column names")
-
-        tbl_meta.get().column_metadata[i].set_name(name.encode())
+            if cudf.get_option("mode.pandas_compatible"):
+                tbl_meta.get().column_metadata[i].set_name(str(name).encode())
+            else:
+                raise ValueError("Writing a Parquet file requires string column names")
+        else:
+            tbl_meta.get().column_metadata[i].set_name(name.encode())
         _set_col_metadata(
             table[name]._column,
             tbl_meta.get().column_metadata[i],
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 56918799cca..f5a5571a72f 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -174,7 +174,7 @@ cpdef generate_pandas_metadata(table, index):
             for col in table._columns
         ],
         df=table,
-        column_names=col_names,
+        column_names=map(str, col_names),
         index_levels=index_levels,
         index_descriptors=index_descriptors,
         preserve_index=index,
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index cd7075e1851..74ed6baead6 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -30,7 +30,6 @@
 from cudf.testing._utils import (
     TIMEDELTA_TYPES,
     assert_eq,
-    assert_exceptions_equal,
     expect_warning_if,
     set_random_null_mask_inplace,
 )
@@ -2528,15 +2527,29 @@ def test_parquet_writer_decimal(decimal_type, data):
 
 
 def test_parquet_writer_column_validation():
-    df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]})
+    df = cudf.DataFrame({1: [1, 2, 3], "a": ["a", "b", "c"]})
     pdf = df.to_pandas()
 
-    assert_exceptions_equal(
-        lfunc=df.to_parquet,
-        rfunc=pdf.to_parquet,
-        lfunc_args_and_kwargs=(["cudf.parquet"],),
-        rfunc_args_and_kwargs=(["pandas.parquet"],),
-    )
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.warns(UserWarning):
+            df.to_parquet("cudf.parquet")
+
+    if PANDAS_GE_200:
+        with pytest.warns(UserWarning):
+            pdf.to_parquet("pandas.parquet")
+
+        assert_eq(
+            pd.read_parquet("cudf.parquet"),
+            cudf.read_parquet("pandas.parquet"),
+        )
+        assert_eq(
+            cudf.read_parquet("cudf.parquet"),
+            pd.read_parquet("pandas.parquet"),
+        )
+
+    with cudf.option_context("mode.pandas_compatible", False):
+        with pytest.raises(ValueError):
+            df.to_parquet("cudf.parquet")
 
 
 def test_parquet_writer_nulls_pandas_read(tmpdir, pdf):
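
For reference, a minimal sketch of the round-trip behavior this patch enables. It is illustrative only and not part of the diff: it assumes a cuDF build that includes these changes, pandas and pyarrow available for the read-back, and an arbitrary file name `df.parquet`.

```python
# Illustrative sketch only -- not part of this diff. Assumes a cuDF build
# containing the changes above; the file name "df.parquet" is arbitrary.
import cudf
import pandas as pd

df = cudf.DataFrame({1: [1, 2, 3], 2: [4.0, 5.0, 6.0]})

# Default mode: non-string column names are rejected at write time.
try:
    df.to_parquet("df.parquet")
except ValueError as err:
    print(err)  # Writing a Parquet file requires string column names

# Pandas-compatible mode: column names are cast to str on write, and a
# UserWarning is emitted, mirroring what pandas does.
with cudf.option_context("mode.pandas_compatible", True):
    df.to_parquet("df.parquet")

# The file round-trips with string column names.
print(pd.read_parquet("df.parquet").columns.tolist())  # ['1', '2']
```

As the `parquet.pyx` and `utils.pyx` hunks show, the cast applies only to the column metadata written to the file; the in-memory DataFrame keeps its original column names.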