Skip to content

Commit

Permalink
closes #23283
Browse files Browse the repository at this point in the history
  • Loading branch information
anjsudh committed Oct 27, 2018
1 parent caea25a commit 41c2828
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ Other Enhancements
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
- :func:`~DataFrame.to_parquet` now supports writing a DataFrame as a directory of parquet files partitioned by a subset of the columns. (:issue:`23283`).
- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)

.. _whatsnew_0240.api_breaking:
Expand Down
14 changes: 10 additions & 4 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,14 @@ def write(self, df, path, compression='snappy',

else:
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)
if 'partition_cols' in kwargs:
self.api.parquet.write_to_dataset(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)
else:
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)

def read(self, path, columns=None, **kwargs):
path, _, _, should_close = get_filepath_or_buffer(path)
Expand Down Expand Up @@ -252,7 +257,8 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None,
----------
df : DataFrame
path : string
File path
File path ( Will be used as `root_path` if
`partition_cols` is provided as parameter for 'pyarrow' engine).
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
""" test parquet compat """

import pytest
import tempfile
import shutil
import datetime
from distutils.version import LooseVersion
from warnings import catch_warnings
Expand Down Expand Up @@ -478,6 +480,24 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
check_round_trip(df_compat, pa,
path='s3://pandas-test/pyarrow.parquet')

def test_partition_cols_supported(self, pa_ge_070, df_full):
partition_cols = ['bool', 'int']
df = df_full
path = tempfile.mkdtemp()
df.to_parquet(path, partition_cols=partition_cols,
compression=None)
import pyarrow.parquet as pq
dataset = pq.ParquetDataset(path, validate_schema=False)
assert len(dataset.pieces) == 2
assert len(dataset.partitions.partition_names) == 2
assert dataset.partitions.partition_names == set(partition_cols)
shutil.rmtree(path)

def test_ignore_partition_cols_lt_070(self, pa_lt_070, df_full):
partition_cols = ['bool', 'int']
pa = pa_lt_070
df = df_full
check_round_trip(df, pa, write_kwargs={'partition_cols': partition_cols})

class TestParquetFastParquet(Base):

Expand Down

0 comments on commit 41c2828

Please sign in to comment.