From d3ec8b55efa65c6295c1bea6e6c70b2a2a1a5a21 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Thu, 27 Jul 2017 06:30:34 -0400
Subject: [PATCH] doc updates

---
 ci/requirements-3.6_DOC.sh      |  2 +-
 doc/source/io.rst               | 36 ++++++++++++++++------------------
 pandas/core/frame.py            |  9 +++++----
 pandas/io/feather_format.py     |  4 ++--
 pandas/io/parquet.py            | 14 +++++++------
 pandas/tests/io/test_parquet.py | 23 +++++++++++++++++----
 6 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/ci/requirements-3.6_DOC.sh b/ci/requirements-3.6_DOC.sh
index 8c10a794a13b9f..aec0f62148622d 100644
--- a/ci/requirements-3.6_DOC.sh
+++ b/ci/requirements-3.6_DOC.sh
@@ -6,6 +6,6 @@ echo "[install DOC_BUILD deps]"
 
 pip install pandas-gbq
 
-conda install -n pandas -c conda-forge feather-format pyarrow nbsphinx pandoc
+conda install -n pandas -c conda-forge feather-format pyarrow nbsphinx pandoc fastparquet
 
 conda install -n pandas -c r r rpy2 --yes
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 72d89b1923d006..2ab5a4d15e6b81 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -213,7 +213,7 @@ buffer_lines : int, default None
 .. deprecated:: 0.19.0
 
    Argument removed because its value is not respected by the parser
-
+
 compact_ints : boolean, default False
 
 .. deprecated:: 0.19.0
@@ -4093,7 +4093,7 @@ control compression: ``complevel`` and ``complib``.
 
 ``complevel`` specifies if and how hard data is to be compressed.
 ``complevel=0`` and ``complevel=None`` disables compression and
-``0<complevel<10`` enables compression. 
+``0<complevel<10`` enables compression.
 
 ``complib`` specifies which compression library to use.
@@ -4110,9 +4110,9 @@
 - `blosc <http://www.blosc.org/>`_: Fast compression and decompression.
 
   .. versionadded:: 0.20.2
-
+
      Support for alternative blosc compressors:
-
+
      - `blosc:blosclz <http://www.blosc.org/>`_ This is the
        default compressor for ``blosc``
      - `blosc:lz4
@@ -4559,28 +4559,30 @@ Parquet
 
 .. versionadded:: 0.21.0
 
-Parquet provides a sharded binary columnar serialization for data frames. It is designed to make reading and writing data
-frames efficient, and to make sharing data across data analysis languages easy. Parquet can use a
-variety of compression techniques to shrink the file size as much as possible while still maintaining good read performance.
+`Parquet <https://parquet.apache.org/>`__ provides a partitioned binary columnar serialization for data frames. It is designed to
+make reading and writing data frames efficient, and to make sharing data across data analysis
+languages easy. Parquet can use a variety of compression techniques to shrink the file size as
+much as possible while still maintaining good read performance.
 
 .. note::
 
    These engines are very similar and should read/write nearly identical parquet format files.
    These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library).
-   TODO: differing options to write non-standard columns & null treatment
 
 .. ipython:: python
@@ -4589,10 +4591,9 @@ See the documentation for `pyarrow
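For reference, the read/write API that the io.rst section above documents can
be exercised like this (a minimal sketch, not part of the patch, assuming
pandas >= 0.21 with both pyarrow and fastparquet installed; the file names are
illustrative):

    import pandas as pd

    df = pd.DataFrame({'a': list('abc'), 'b': list(range(1, 4))})

    # write with an explicit engine; compression defaults to 'snappy'
    df.to_parquet('example_pa.parquet', engine='pyarrow')
    df.to_parquet('example_fp.parquet', engine='fastparquet', compression='gzip')

    # read back, again choosing the engine per call
    result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
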
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -14,7 +14,7 @@ def _try_import():
                           "you can install via conda\n"
                           "conda install feather-format -c conda-forge"
                           "or via pip\n"
-                          "pip install feather-format\n")
+                          "pip install -U feather-format\n")
 
     try:
         feather.__version__ >= LooseVersion('0.3.1')
@@ -29,7 +29,7 @@ def _try_import():
                           "you can install via conda\n"
                           "conda install feather-format -c conda-forge"
                           "or via pip\n"
-                          "pip install feather-format\n")
+                          "pip install -U feather-format\n")
 
     return feather
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index ac22fd7622ce74..ce903812ca2a2d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -36,7 +36,7 @@ def __init__(self):
                               "you can install via conda\n"
                               "conda install pyarrow -c conda-forge\n"
                               "\nor via pip\n"
-                              "pip install pyarrow\n")
+                              "pip install -U pyarrow\n")
 
         if LooseVersion(pyarrow.__version__) < '0.4.1':
             raise ImportError("pyarrow >= 0.4.1 is required for parquet"
@@ -44,7 +44,7 @@ def __init__(self):
                               "you can install via conda\n"
                               "conda install pyarrow -c conda-forge\n"
                               "\nor via pip\n"
-                              "pip install pyarrow\n")
+                              "pip install -U pyarrow\n")
 
         self.api = pyarrow
@@ -72,7 +72,7 @@ def __init__(self):
                               "you can install via conda\n"
                               "conda install fastparquet -c conda-forge\n"
                               "\nor via pip\n"
-                              "pip install fastparquet")
+                              "pip install -U fastparquet")
 
         if LooseVersion(fastparquet.__version__) < '0.1.0':
             raise ImportError("fastparquet >= 0.1.0 is required for parquet "
@@ -80,7 +80,7 @@ def __init__(self):
                               "you can install via conda\n"
                               "conda install fastparquet -c conda-forge\n"
                               "\nor via pip\n"
-                              "pip install fastparquet")
+                              "pip install -U fastparquet")
 
         self.api = fastparquet
@@ -109,10 +109,12 @@ def to_parquet(df, path, engine=None, compression='snappy', **kwargs):
         File path
     engine : str, optional
         The parquet engine, one of {'pyarrow', 'fastparquet'}
-        if None, will use the option: `io.parquet.engine`
+        If None, will use the option: `io.parquet.engine`, which
+        defaults to 'pyarrow'
     compression : str, optional, default 'snappy'
         compression method, includes {'gzip', 'snappy', 'brotli'}
-    kwargs are passed to the engine
+    kwargs
+        Additional keyword arguments passed to the engine
     """
 
     impl = get_engine(engine)
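The docstring change above reflects how engine resolution works: with
engine=None, pandas falls back to the ``io.parquet.engine`` option. A minimal
sketch (assuming pandas >= 0.21; the DataFrame and file name are
illustrative):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]})

    # the option defaults to 'pyarrow'
    pd.get_option('io.parquet.engine')

    # change the default engine globally instead of per call
    pd.set_option('io.parquet.engine', 'fastparquet')
    df.to_parquet('example.parquet')  # now written via fastparquet

If the selected engine is not installed, pandas raises the ImportError whose
message this patch updates to suggest ``pip install -U ...``.
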
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 032921e8d6704d..fe27d1068e7c71 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -54,6 +54,20 @@ def df_compat():
     return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'})
 
 
+@pytest.fixture
+def df_cross_compat():
+    df = pd.DataFrame({'a': list('abc'),
+                       'b': list(range(1, 4)),
+                       'c': np.arange(3, 6).astype('u1'),
+                       'd': np.arange(4.0, 7.0, dtype='float64'),
+                       'e': [True, False, True],
+                       'f': pd.date_range('20130101', periods=3),
+                       'g': pd.date_range('20130101', periods=3,
+                                          tz='US/Eastern'),
+                       'h': pd.date_range('20130101', periods=3, freq='ns')})
+    return df
+
+
 def test_invalid_engine(df_compat):
 
     with pytest.raises(ValueError):
@@ -87,10 +101,10 @@ def test_options_fp(df_compat, fp):
 
 
 @pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__")
-def test_cross_engine_pa_fp(df_compat, pa, fp):
+def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
     # cross-compat with differing reading/writing engines
 
-    df = df_compat
+    df = df_cross_compat
     with tm.ensure_clean() as path:
         df.to_parquet(path, engine=pa, compression=None)
 
         result = read_parquet(path, engine=fp)
         tm.assert_frame_equal(result, df)
@@ -98,10 +112,11 @@ def test_cross_engine_fp_pa(df_compat, pa, fp):
 
 
-def test_cross_engine_fp_pa(df_compat, pa, fp):
+@pytest.mark.xfail(reason="pyarrow reading fp in some cases")
+def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
     # cross-compat with differing reading/writing engines
 
-    df = df_compat
+    df = df_cross_compat
 
     with tm.ensure_clean() as path:
         df.to_parquet(path, engine=fp, compression=None)
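The cross-engine tests above write with one engine and read with the other;
note that the patch marks both directions xfail. A minimal sketch of that
round trip (assuming pandas >= 0.21 with both engines installed; the file
name is illustrative, and the read back may not be lossless for every dtype
yet, as the xfail reasons note):

    import pandas as pd

    df = pd.DataFrame({'a': list('abc'), 'b': list(range(1, 4))})

    # write with pyarrow; compression=None avoids requiring a matching
    # codec on the fastparquet side
    df.to_parquet('cross.parquet', engine='pyarrow', compression=None)

    # read back with fastparquet; per the xfail above, fastparquet does
    # not yet ignore pyarrow's __index_level_0__ index column
    result = pd.read_parquet('cross.parquet', engine='fastparquet')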