Skip to content

Commit

Permalink
add compression support for pickle
Browse files Browse the repository at this point in the history
  • Loading branch information
goldenbull committed Oct 8, 2016
1 parent daba8e5 commit f187514
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 27 deletions.
5 changes: 3 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1200,17 +1200,18 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
if_exists=if_exists, index=index, index_label=index_label,
chunksize=chunksize, dtype=dtype)

def to_pickle(self, path, compression='infer'):
    """
    Pickle (serialize) object to input file path.

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        Compression to apply to the output file; 'infer' selects one
        from the file extension of ``path``.
    """
    from pandas.io.pickle import to_pickle
    # pass by keyword so the delegate signature can grow safely
    return to_pickle(self, path, compression=compression)

def to_clipboard(self, excel=None, sep=None, **kwargs):
"""
Expand Down
41 changes: 39 additions & 2 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,45 @@ def ZipFile(*args, **kwargs):
ZipFile = zipfile.ZipFile


def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
def _get_inferred_compression(filepath_or_buffer, compression):
if compression == 'infer':
if isinstance(filepath_or_buffer, compat.string_types):
if filepath_or_buffer.endswith('.gz'):
inferred_compression = 'gzip'
elif filepath_or_buffer.endswith('.bz2'):
inferred_compression = 'bz2'
elif filepath_or_buffer.endswith('.zip'):
inferred_compression = 'zip'
elif filepath_or_buffer.endswith('.xz'):
inferred_compression = 'xz'
else:
inferred_compression = None
else:
inferred_compression = None
else:
inferred_compression = compression
return inferred_compression


def _get_handle(path, mode, encoding=None, compression=None, memory_map=False, is_txt=True):
"""Gets file handle for given path and mode.
Parameters
----------
path : string
file path
mode : string
mode to open file, like 'wb', 'rb', etc
encoding : string, default None
encoding for text file
compression : string, default None
{ None, 'gzip', 'bz2', 'zip', 'xz' }
is_txt : bool, default True
True for text files (csv, json), False for binary files (pickle)
Returns
-------
opened file handle for I/O
"""
if compression is not None:
if encoding is not None and not compat.PY3:
Expand Down Expand Up @@ -320,7 +357,7 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
else:
raise ValueError('Unrecognized compression type: %s' %
compression)
if compat.PY3:
if compat.PY3 and is_txt:
from io import TextIOWrapper
f = TextIOWrapper(f, encoding=encoding)
return f
Expand Down
18 changes: 2 additions & 16 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
_get_handle, UnicodeReader, UTF8Recoder,
BaseIterator, CParserError, EmptyDataError,
ParserWarning, _NA_VALUES)
ParserWarning, _NA_VALUES, _get_inferred_compression)
from pandas.tseries import tools

from pandas.util.decorators import Appender
Expand Down Expand Up @@ -353,21 +353,7 @@ def _read(filepath_or_buffer, kwds):
# extension. If we're reading from a URL, the `get_filepath_or_buffer`
# will use header info to determine compression, so use what it finds in
# that case.
inferred_compression = kwds.get('compression')
if inferred_compression == 'infer':
if isinstance(filepath_or_buffer, compat.string_types):
if filepath_or_buffer.endswith('.gz'):
inferred_compression = 'gzip'
elif filepath_or_buffer.endswith('.bz2'):
inferred_compression = 'bz2'
elif filepath_or_buffer.endswith('.zip'):
inferred_compression = 'zip'
elif filepath_or_buffer.endswith('.xz'):
inferred_compression = 'xz'
else:
inferred_compression = None
else:
inferred_compression = None
inferred_compression = _get_inferred_compression(filepath_or_buffer, kwds.get('compression'))

filepath_or_buffer, _, compression = get_filepath_or_buffer(
filepath_or_buffer, encoding,
Expand Down
30 changes: 23 additions & 7 deletions pandas/io/pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
from numpy.lib.format import read_array, write_array
from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
from pandas.io.common import _get_handle, _get_inferred_compression


def to_pickle(obj, path):
def to_pickle(obj, path, compression='infer'):
    """
    Pickle (serialize) object to input file path.

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        Compression to apply to the output file; 'infer' selects one
        from the file extension of ``path``.
    """
    inferred_compression = _get_inferred_compression(path, compression)
    if inferred_compression:
        # is_txt=False: pickle writes bytes, so no TextIOWrapper may be
        # layered on top of the compressed handle under Python 3
        f = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_txt=False)
    else:
        f = open(path, 'wb')
    with f:
        pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)


def read_pickle(path):
def read_pickle(path, compression='infer'):
"""
Load pickled pandas object (or any other pickled object) from the specified
file path
Expand All @@ -32,12 +39,21 @@ def read_pickle(path):
----------
path : string
File path
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
Returns
-------
unpickled : type of object stored in file
"""

inferred_compression = _get_inferred_compression(path, compression)

def openfile():
if inferred_compression:
return _get_handle(path, 'rb', compression=inferred_compression, is_txt=False)
else:
return open(path, 'rb')

def try_read(path, encoding=None):
# try with cPickle
# try with current pickle, if we have a Type Error then
Expand All @@ -48,17 +64,17 @@ def try_read(path, encoding=None):
# cpickle
# GH 6899
try:
with open(path, 'rb') as fh:
return pkl.load(fh)
with openfile() as f:
return pkl.load(f)
except Exception:
# reg/patched pickle
try:
with open(path, 'rb') as fh:
with openfile() as fh:
return pc.load(fh, encoding=encoding, compat=False)

# compat pickle
except:
with open(path, 'rb') as fh:
with openfile() as fh:
return pc.load(fh, encoding=encoding, compat=True)

try:
Expand Down
22 changes: 22 additions & 0 deletions pandas/io/tests/test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,28 @@ def test_pickle_v0_15_2(self):
#
tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))

def compression_explicit(self, compression):
    # round-trip a frame through to_pickle/read_pickle with an explicit
    # compression argument; uses `pd` like the rest of this test module
    with tm.ensure_clean(self.path) as path:
        df = tm.makeDataFrame()
        df.to_pickle(path, compression=compression)
        tm.assert_frame_equal(df, pd.read_pickle(path,
                                                 compression=compression))

def test_compression_explicit(self):
    # nose generator test: one sub-test per supported compression type
    for compression in [None, 'gzip', 'bz2', 'xz']:
        yield self.compression_explicit, compression

def compression_infer(self, ext):
    # round-trip relying on compression inferred from the file extension;
    # uses `pd` like the rest of this test module
    with tm.ensure_clean(self.path + ext) as path:
        df = tm.makeDataFrame()
        df.to_pickle(path)
        tm.assert_frame_equal(df, pd.read_pickle(path))

def test_compression_infer(self):
    # nose generator test: one sub-test per recognised file extension
    for ext in ('', '.gz', '.bz2', '.xz'):
        yield self.compression_infer, ext


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down

0 comments on commit f187514

Please sign in to comment.