
Commit d5ee5ec

Updated documentation
Fix"Should raise error on using partition_cols and partition_on together"
anjsudh committed Nov 5, 2018
1 parent a5164b8 commit d5ee5ec
Showing 5 changed files with 49 additions and 8 deletions.
29 changes: 27 additions & 2 deletions doc/source/io.rst
@@ -4574,8 +4574,6 @@ Several caveats.
 * Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
 * Unsupported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.
-* ``partition_cols`` will be used for partitioning the dataset, where the dataset will be written to multiple
-  files in the path specified. Therefore, the path specified must be a directory path.
 
 You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
 If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``,
@@ -4670,6 +4668,33 @@ Passing ``index=True`` will *always* write the index, even if that's not the
 underlying engine's default behavior.
 
+
+Partitioning Parquet files
+''''''''''''''''''''''''''
+
+Parquet supports partitioning of data based on the values of one or more columns.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
+    df.to_parquet(fname='test', engine='pyarrow',
+                  partition_cols=['a'], compression=None)
+
+``fname`` specifies the parent directory to which the data will be saved, and
+``partition_cols`` gives the column names by which the dataset will be
+partitioned. Columns are partitioned in the order they are given, and the
+partition splits are determined by the unique values in the partition columns.
+The above example creates a partitioned dataset that may look like:
+
+::
+
+    test/
+        a=0/
+            0bac803e32dc42ae83fddfd029cbdebc.parquet
+            ...
+        a=1/
+            e6ab24a4f45147b49b54a662f0c412a3.parquet
+            ...

 .. _io.sql:
 
 SQL Queries
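
Round-tripping the ``partition_cols`` example from the io.rst addition above,
as a sketch (assuming pyarrow is installed; the ``'test'`` path matches the
example)::

    import pandas as pd

    df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
    df.to_parquet(fname='test', engine='pyarrow',
                  partition_cols=['a'], compression=None)

    # Reading the directory reassembles the partitions; with pyarrow the
    # partition column 'a' typically comes back as a categorical dtype.
    roundtrip = pd.read_parquet('test', engine='pyarrow')
    print(roundtrip.sort_values('b'))
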
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
@@ -235,7 +235,7 @@ Other Enhancements
 - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
 - Compatibility with Matplotlib 3.0 (:issue:`22790`).
 - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
-- :func:`~DataFrame.to_parquet` now supports writing a DataFrame as a directory of parquet files partitioned by a subset of the columns. (:issue:`23283`).
+- With the pyarrow engine, :func:`~DataFrame.to_parquet` now supports writing a DataFrame as a directory of parquet files partitioned by a subset of the columns. (:issue:`23283`).
 - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)
 
 .. _whatsnew_0240.api_breaking:
4 changes: 2 additions & 2 deletions pandas/core/frame.py
@@ -2002,8 +2002,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
         partition_cols : list, optional, default None
             Column names by which to partition the dataset
             Columns are partitioned in the order they are given
-            The behaviour applies only to pyarrow >= 0.7.0 and fastparquet
-            For other versions, this argument will be ignored.
+            The behaviour applies only to pyarrow >= 0.7.0 and fastparquet.
+            Raises a ValueError for other versions.
 
             .. versionadded:: 0.24.0
 
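
A minimal sketch of the documented parameter (the frame, path, and column
names are illustrative; fastparquet or pyarrow >= 0.7.0 must be installed)::

    import pandas as pd

    df = pd.DataFrame({'region': ['east', 'west', 'east'],
                       'value': [1, 2, 3]})

    # partition_cols writes one subdirectory per unique value in the column,
    # e.g. out/region=east/... and out/region=west/...
    df.to_parquet('out', engine='fastparquet', compression=None,
                  partition_cols=['region'])
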
11 changes: 8 additions & 3 deletions pandas/io/parquet.py
@@ -227,7 +227,12 @@ def write(self, df, path, compression='snappy', index=None,
         # Use tobytes() instead.
 
         if 'partition_on' in kwargs:
-            partition_cols = kwargs.pop('partition_on')
+            if partition_cols is None:
+                partition_cols = kwargs.pop('partition_on')
+            else:
+                raise ValueError("Cannot use both partition_on and "
+                                 "partition_cols. Use partition_cols for "
+                                 "partitioning data")
 
         if partition_cols is not None:
             kwargs['file_scheme'] = 'hive'
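
Seen from the caller's side, the new branch turns a silent preference into an
explicit error; a sketch (the frame and path are illustrative)::

    import pandas as pd

    df = pd.DataFrame({'a': [0, 1], 'b': [0, 1]})

    # Passing both keywords now raises instead of silently preferring one.
    try:
        df.to_parquet('out', engine='fastparquet', compression=None,
                      partition_on=['a'], partition_cols=['a'])
    except ValueError as err:
        print(err)  # Cannot use both partition_on and partition_cols. ...
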
@@ -290,8 +295,8 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None,
     partition_cols : list, optional
         Column names by which to partition the dataset
         Columns are partitioned in the order they are given
-        The behaviour applies only to pyarrow >= 0.7.0 and fastparquet
-        For other versions, this argument will be ignored.
+        The behaviour applies only to pyarrow >= 0.7.0 and fastparquet.
+        Raises a ValueError for other versions.
         .. versionadded:: 0.24.0
     kwargs
         Additional keyword arguments passed to the engine
11 changes: 11 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -589,3 +589,14 @@ def test_partition_on_supported(self, fp, df_full):
         import fastparquet
         actual_partition_cols = fastparquet.ParquetFile(path, False).cats
         assert len(actual_partition_cols) == 2
+
+    def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full):
+        # GH #23283
+        partition_cols = ['bool', 'int']
+        df = df_full
+        with pytest.raises(ValueError):
+            with tm.ensure_clean_dir() as path:
+                df.to_parquet(path, engine="fastparquet", compression=None,
+                              partition_on=partition_cols,
+                              partition_cols=partition_cols)

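A slightly tighter variant (hypothetical, not part of this commit) would also
pin the error text with ``pytest.raises(match=...)``::

    def test_error_on_partition_clash_message(self, fp, df_full):
        # Hypothetical tightening: also assert on the message text.
        with pytest.raises(ValueError, match="Cannot use both partition_on"):
            with tm.ensure_clean_dir() as path:
                df_full.to_parquet(path, engine="fastparquet",
                                   compression=None,
                                   partition_on=['bool'],
                                   partition_cols=['bool'])
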