Add optional compression support to to_pickle / read_pickle.

What the patch does:
* NDFrame.to_pickle, pandas.io.pickle.to_pickle and read_pickle accept a
  ``compression`` argument ('infer', 'gzip', 'bz2', 'xz' or None).  With
  'infer' the method is chosen from the file extension.
* The extension sniffing that was inline in parsers._read moves to
  pandas.io.common._get_inferred_compression and is shared with pickle.
* _get_handle gains ``is_txt`` (default True) so that binary callers skip
  the Python 3 TextIOWrapper.

Changes made while cleaning the patch up:
* Added lines are wrapped to 79 columns.
* _get_inferred_compression has a docstring and uses an extension table
  with early returns; results are unchanged.
* read_pickle uses the handle name ``fh`` in every branch.
* The new tests skip 'xz' when lzma cannot be imported.
* ``index`` lines are omitted for the four files whose post-image changed
  (pandas/io/parsers.py stays mode 100755).

NOTE(review): context-line indentation and blank lines were reconstructed
from a whitespace-collapsed copy of this patch; confirm them against the
tree before applying.
NOTE(review): 'infer' also maps '.zip' to 'zip', which the pickle docstrings
do not list; confirm _get_handle can open a zip target in 'wb' mode before
relying on to_pickle('x.zip').
NOTE(review): the test change assumes pandas.util.testing provides
_skip_if_no_lzma; confirm.

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 697438df87d4f3..415d36787a5971 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1200,7 +1200,7 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail',
                    if_exists=if_exists, index=index, index_label=index_label,
                    chunksize=chunksize, dtype=dtype)
 
-    def to_pickle(self, path):
+    def to_pickle(self, path, compression='infer'):
         """
         Pickle (serialize) object to input file path.
 
@@ -1208,9 +1208,10 @@ def to_pickle(self, path):
         ----------
         path : string
             File path
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
         """
         from pandas.io.pickle import to_pickle
-        return to_pickle(self, path)
+        return to_pickle(self, path, compression)
 
     def to_clipboard(self, excel=None, sep=None, **kwargs):
         """
diff --git a/pandas/io/common.py b/pandas/io/common.py
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -285,8 +285,51 @@ def ZipFile(*args, **kwargs):
     ZipFile = zipfile.ZipFile
 
 
-def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
+def _get_inferred_compression(filepath_or_buffer, compression):
+    """Resolve ``compression='infer'`` to a concrete compression name.
+
+    Parameters
+    ----------
+    filepath_or_buffer : string or file-like
+        only a string path is matched against known file extensions
+    compression : string or None
+        { 'infer', None, 'gzip', 'bz2', 'zip', 'xz' }
+
+    Returns
+    -------
+    string or None; any value other than 'infer' is returned unchanged
+    """
+    if compression != 'infer':
+        return compression
+    if isinstance(filepath_or_buffer, compat.string_types):
+        for ext, method in (('.gz', 'gzip'), ('.bz2', 'bz2'),
+                            ('.zip', 'zip'), ('.xz', 'xz')):
+            if filepath_or_buffer.endswith(ext):
+                return method
+    # not a path, or no recognised extension: treat as uncompressed
+    return None
+
+
+def _get_handle(path, mode, encoding=None, compression=None, memory_map=False,
+                is_txt=True):
     """Gets file handle for given path and mode.
+
+    Parameters
+    ----------
+    path : string
+        file path
+    mode : string
+        mode to open file, like 'wb', 'rb', etc
+    encoding : string, default None
+        encoding for text file
+    compression : string, default None
+        { None, 'gzip', 'bz2', 'zip', 'xz' }
+    is_txt : bool, default True
+        True for text files (csv, json), False for binary files (pickle)
+
+    Returns
+    -------
+    opened file handle for I/O
     """
     if compression is not None:
         if encoding is not None and not compat.PY3:
@@ -320,7 +363,7 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
-        if compat.PY3:
+        if compat.PY3 and is_txt:
             from io import TextIOWrapper
             f = TextIOWrapper(f, encoding=encoding)
         return f
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -27,7 +27,8 @@
 from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
                               _get_handle, UnicodeReader, UTF8Recoder,
                               BaseIterator, CParserError, EmptyDataError,
-                              ParserWarning, _NA_VALUES)
+                              ParserWarning, _NA_VALUES,
+                              _get_inferred_compression)
 from pandas.tseries import tools
 
 from pandas.util.decorators import Appender
@@ -353,21 +354,8 @@ def _read(filepath_or_buffer, kwds):
     # extension. If we're reading from a URL, the `get_filepath_or_buffer`
     # will use header info to determine compression, so use what it finds in
     # that case.
-    inferred_compression = kwds.get('compression')
-    if inferred_compression == 'infer':
-        if isinstance(filepath_or_buffer, compat.string_types):
-            if filepath_or_buffer.endswith('.gz'):
-                inferred_compression = 'gzip'
-            elif filepath_or_buffer.endswith('.bz2'):
-                inferred_compression = 'bz2'
-            elif filepath_or_buffer.endswith('.zip'):
-                inferred_compression = 'zip'
-            elif filepath_or_buffer.endswith('.xz'):
-                inferred_compression = 'xz'
-            else:
-                inferred_compression = None
-        else:
-            inferred_compression = None
+    inferred_compression = _get_inferred_compression(
+        filepath_or_buffer, kwds.get('compression'))
 
     filepath_or_buffer, _, compression = get_filepath_or_buffer(
         filepath_or_buffer, encoding,
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -4,9 +4,10 @@
 from numpy.lib.format import read_array, write_array
 from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
 from pandas.types.common import is_datetime64_dtype, _NS_DTYPE
+from pandas.io.common import _get_handle, _get_inferred_compression
 
 
-def to_pickle(obj, path):
+def to_pickle(obj, path, compression='infer'):
     """
     Pickle (serialize) object to input file path
 
@@ -15,12 +16,19 @@ def to_pickle(obj, path):
     obj : any object
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
     """
-    with open(path, 'wb') as f:
+    inferred_compression = _get_inferred_compression(path, compression)
+    if inferred_compression:
+        f = _get_handle(path, 'wb', compression=inferred_compression,
+                        is_txt=False)
+    else:
+        f = open(path, 'wb')
+    with f:
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
 
 
-def read_pickle(path):
+def read_pickle(path, compression='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the specified
     file path
@@ -32,12 +40,22 @@ def read_pickle(path):
     ----------
     path : string
         File path
+    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
 
     Returns
     -------
     unpickled : type of object stored in file
     """
 
+    inferred_compression = _get_inferred_compression(path, compression)
+
+    def openfile():
+        # every read attempt in try_read opens its own fresh handle
+        if inferred_compression:
+            return _get_handle(path, 'rb', compression=inferred_compression,
+                               is_txt=False)
+        return open(path, 'rb')
+
     def try_read(path, encoding=None):
         # try with cPickle
         # try with current pickle, if we have a Type Error then
@@ -48,17 +66,17 @@ def try_read(path, encoding=None):
         # cpickle
         # GH 6899
         try:
-            with open(path, 'rb') as fh:
+            with openfile() as fh:
                 return pkl.load(fh)
         except Exception:
             # reg/patched pickle
             try:
-                with open(path, 'rb') as fh:
+                with openfile() as fh:
                     return pc.load(fh, encoding=encoding, compat=False)
 
             # compat pickle
             except:
-                with open(path, 'rb') as fh:
+                with openfile() as fh:
                     return pc.load(fh, encoding=encoding, compat=True)
 
     try:
diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py
--- a/pandas/io/tests/test_pickle.py
+++ b/pandas/io/tests/test_pickle.py
@@ -284,6 +284,34 @@ def test_pickle_v0_15_2(self):
         #
         tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
 
+    def compression_explicit(self, compression):
+        # xz support is optional; skip when lzma is missing
+        if compression == 'xz':
+            tm._skip_if_no_lzma()
+        with tm.ensure_clean(self.path) as path:
+            df = tm.makeDataFrame()
+            df.to_pickle(path, compression)
+            tm.assert_frame_equal(df, pandas.read_pickle(path, compression))
+
+    def test_compression_explicit(self):
+        compressions = [None, 'gzip', 'bz2', 'xz']
+        for c in compressions:
+            yield self.compression_explicit, c
+
+    def compression_infer(self, ext):
+        # xz support is optional; skip when lzma is missing
+        if ext == '.xz':
+            tm._skip_if_no_lzma()
+        with tm.ensure_clean(self.path + ext) as p:
+            df = tm.makeDataFrame()
+            df.to_pickle(p)
+            tm.assert_frame_equal(df, pandas.read_pickle(p))
+
+    def test_compression_infer(self):
+        extensions = ['', '.gz', '.bz2', '.xz']
+        for ext in extensions:
+            yield self.compression_infer, ext
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],