Skip to content

Commit

Permalink
add compression support for pickle
Browse files Browse the repository at this point in the history
  • Loading branch information
goldenbull committed Oct 8, 2016
1 parent daba8e5 commit f187514
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 27 deletions.
5 changes: 3 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1200,17 +1200,18 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
if_exists=if_exists, index=index, index_label=index_label,
chunksize=chunksize, dtype=dtype)

def to_pickle(self, path, compression='infer'):
    """
    Pickle (serialize) object to input file path.

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        Compression to apply to the output file; 'infer' selects one
        from the file extension of ``path``.
    """
    from pandas.io.pickle import to_pickle
    # pass by keyword so the delegate signature can grow safely
    return to_pickle(self, path, compression=compression)

def to_clipboard(self, excel=None, sep=None, **kwargs):
"""
Expand Down
41 changes: 39 additions & 2 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,45 @@ def ZipFile(*args, **kwargs):
ZipFile = zipfile.ZipFile


def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
def _get_inferred_compression(filepath_or_buffer, compression):
if compression == 'infer':
if isinstance(filepath_or_buffer, compat.string_types):
if filepath_or_buffer.endswith('.gz'):
inferred_compression = 'gzip'
elif filepath_or_buffer.endswith('.bz2'):
inferred_compression = 'bz2'
elif filepath_or_buffer.endswith('.zip'):
inferred_compression = 'zip'
elif filepath_or_buffer.endswith('.xz'):
inferred_compression = 'xz'
else:
inferred_compression = None
else:
inferred_compression = None
else:
inferred_compression = compression
return inferred_compression


def _get_handle(path, mode, encoding=None, compression=None, memory_map=False, is_txt=True):
"""Gets file handle for given path and mode.
Parameters
----------
path : string
file path
mode : string
mode to open file, like 'wb', 'rb', etc
encoding : string, default None
encoding for text file
compression : string, default None
{ None, 'gzip', 'bz2', 'zip', 'xz' }
is_txt : bool, default True
True for text files (csv, json), False for binary files (pickle)
Returns
-------
opened file handle for I/O
"""
if compression is not None:
if encoding is not None and not compat.PY3:
Expand Down Expand Up @@ -320,7 +357,7 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
else:
raise ValueError('Unrecognized compression type: %s' %
compression)
if compat.PY3:
if compat.PY3 and is_txt:
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
return f
Expand Down
18 changes: 2 additions & 16 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
_get_handle, UnicodeReader, UTF8Recoder,
BaseIterator, CParserError, EmptyDataError,
ParserWarning, _NA_VALUES)
ParserWarning, _NA_VALUES, _get_inferred_compression)
from pandas.tseries import tools

from pandas.util.decorators import Appender
Expand Down Expand Up @@ -353,21 +353,7 @@ def _read(filepath_or_buffer, kwds):
# extension. If we're reading from a URL, the `get_filepath_or_buffer`
# will use header info to determine compression, so use what it finds in
# that case.
inferred_compression = kwds.get('compression')
if inferred_compression == 'infer':
if isinstance(filepath_or_buffer, compat.string_types):
if filepath_or_buffer.endswith('.gz'):
inferred_compression = 'gzip'
elif filepath_or_buffer.endswith('.bz2'):
inferred_compression = 'bz2'
elif filepath_or_buffer.endswith('.zip'):
inferred_compression = 'zip'
elif filepath_or_buffer.endswith('.xz'):
inferred_compression = 'xz'
else:
inferred_compression = None
else:
inferred_compression = None
inferred_compression = _get_inferred_compression(filepath_or_buffer, kwds.get('compression'))

filepath_or_buffer, _, compression = get_filepath_or_buffer(
filepath_or_buffer, encoding,
Expand Down
30 changes: 23 additions & 7 deletions pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
from numpy.lib.format import read_array, write_array
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
from pandas.io.common import _get_handle, _get_inferred_compression


def to_pickle(obj, path):
def to_pickle(obj, path, compression='infer'):
    """
    Pickle (serialize) object to input file path.

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        Compression to apply to the output file; 'infer' selects one
        from the file extension of ``path``.
    """
    inferred_compression = _get_inferred_compression(path, compression)
    if inferred_compression:
        # is_txt=False: pickle writes bytes, so no TextIOWrapper may be
        # layered on top of the compressed handle under Python 3
        f = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_txt=False)
    else:
        f = open(path, 'wb')
    with f:
        pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)


def read_pickle(path):
def read_pickle(path, compression='infer'):
"""
Load pickled pandas object (or any other pickled object) from the specified
file path
Expand All @@ -32,12 +39,21 @@ def read_pickle(path):
----------
path : string
File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
Returns
-------
unpickled : type of object stored in file
"""

inferred_compression = _get_inferred_compression(path, compression)

def openfile():
if inferred_compression:
return _get_handle(path, 'rb', compression=inferred_compression, is_txt=False)
else:
return open(path, 'rb')

def try_read(path, encoding=None):
# try with cPickle
# try with current pickle, if we have a Type Error then
Expand All @@ -48,17 +64,17 @@ def try_read(path, encoding=None):
# cpickle
# GH 6899
try:
with open(path, 'rb') as fh:
return pkl.load(fh)
with openfile() as f:
return pkl.load(f)
except Exception:
# reg/patched pickle
try:
with open(path, 'rb') as fh:
with openfile() as fh:
return pc.load(fh, encoding=encoding, compat=False)

# compat pickle
except:
with open(path, 'rb') as fh:
with openfile() as fh:
return pc.load(fh, encoding=encoding, compat=True)

try:
Expand Down
22 changes: 22 additions & 0 deletions pandas/io/tests/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,28 @@ def test_pickle_v0_15_2(self):
#
tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))

def compression_explicit(self, compression):
    # round-trip a frame through to_pickle/read_pickle with an explicit
    # compression argument; uses `pd` like the rest of this test module
    with tm.ensure_clean(self.path) as path:
        df = tm.makeDataFrame()
        df.to_pickle(path, compression=compression)
        tm.assert_frame_equal(df, pd.read_pickle(path,
                                                 compression=compression))

def test_compression_explicit(self):
    # nose generator test: one sub-test per supported compression type
    for compression in [None, 'gzip', 'bz2', 'xz']:
        yield self.compression_explicit, compression

def compression_infer(self, ext):
    # round-trip relying on compression inferred from the file extension;
    # uses `pd` like the rest of this test module
    with tm.ensure_clean(self.path + ext) as path:
        df = tm.makeDataFrame()
        df.to_pickle(path)
        tm.assert_frame_equal(df, pd.read_pickle(path))

def test_compression_infer(self):
    # nose generator test: one sub-test per recognised file extension
    for ext in ('', '.gz', '.bz2', '.xz'):
        yield self.compression_infer, ext


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down

0 comments on commit f187514

Please sign in to comment.