diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 7fbf2533428dc..8d6d7947b6892 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -195,7 +195,7 @@ Other Enhancements
 - :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`)
 - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
 - Improved the import time of pandas by about 2.25x (:issue:`16764`)
-
+- :func:`read_json` and :func:`to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)
 
 .. _whatsnew_0210.api_breaking:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 942a9ff279092..c7ae9bbee9013 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1258,7 +1258,7 @@ def _repr_latex_(self):
 
     def to_json(self, path_or_buf=None, orient=None, date_format=None,
                 double_precision=10, force_ascii=True, date_unit='ms',
-                default_handler=None, lines=False):
+                default_handler=None, lines=False, compression=None):
         """
         Convert the object to a JSON string.
 
@@ -1320,6 +1320,12 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
 
             .. versionadded:: 0.19.0
 
+        compression : {None, 'gzip', 'bz2', 'xz'}
+            A string representing the compression to use in the output file,
+            only used when the first argument is a filename.
+
+            .. versionadded:: 0.21.0
+
         Returns
         -------
         same type as input object with filtered info axis
@@ -1372,7 +1378,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
                             double_precision=double_precision,
                             force_ascii=force_ascii, date_unit=date_unit,
                             default_handler=default_handler,
-                            lines=lines)
+                            lines=lines, compression=compression)
 
     def to_hdf(self, path_or_buf, key, **kwargs):
         """Write the contained data to an HDF5 file using HDFStore.
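For reviewers, here is a minimal sketch of the new writer keyword in action, assuming this patch is applied; the file name `frame.json.gz` is illustrative, not part of the change:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# The keyword only takes effect when path_or_buf is a filename;
# writing to an open file object or returning a string ignores it.
df.to_json("frame.json.gz", compression="gzip")
```

Routing the write through `_get_handle` (below) reuses the compression machinery that `pandas.io.common` already provides for the other I/O paths, rather than duplicating it in the JSON code.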
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
index ab74b265b6a06..be39f4baba0fb 100644
--- a/pandas/io/json/json.py
+++ b/pandas/io/json/json.py
@@ -9,7 +9,8 @@
 from pandas import compat, isna
 from pandas import Series, DataFrame, to_datetime, MultiIndex
 from pandas.io.common import (get_filepath_or_buffer, _get_handle,
-                              _stringify_path, BaseIterator)
+                              _infer_compression, _stringify_path,
+                              BaseIterator)
 from pandas.io.parsers import _validate_integer
 from pandas.core.common import AbstractMethodError
 from pandas.core.reshape.concat import concat
@@ -27,7 +28,7 @@
 # interface to/from
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms',
-            default_handler=None, lines=False):
+            default_handler=None, lines=False, compression=None):
 
     path_or_buf = _stringify_path(path_or_buf)
     if lines and orient != 'records':
@@ -54,8 +55,11 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
         s = _convert_to_line_delimits(s)
 
     if isinstance(path_or_buf, compat.string_types):
-        with open(path_or_buf, 'w') as fh:
+        fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
+        try:
             fh.write(s)
+        finally:
+            fh.close()
     elif path_or_buf is None:
         return s
     else:
@@ -178,7 +182,7 @@ def write(self):
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False, chunksize=None):
+              lines=False, chunksize=None, compression='infer'):
     """
     Convert a JSON string to pandas object
 
@@ -277,6 +281,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
 
         .. versionadded:: 0.21.0
 
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer', then use
+        gzip, bz2, zip or xz if path_or_buf is a string ending in
+        '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
+        otherwise. If using 'zip', the ZIP file must contain only one data
+        file to be read in. Set to None for no decompression.
+
+        .. versionadded:: 0.21.0
+
     Returns
     -------
     result : Series or DataFrame, depending on the value of `typ`.
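To make the inference rule above concrete, a sketch under the assumption that the patch is applied (the `.bz2` path is hypothetical):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df.to_json("frame.json.bz2", compression="bz2")

# The default compression='infer' picks bz2 from the '.bz2' suffix;
# passing compression=None instead would hand the raw compressed
# bytes to the JSON parser, which would fail.
roundtripped = pd.read_json("frame.json.bz2")
```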
@@ -334,15 +347,17 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
     """
 
-    filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
-                                                      encoding=encoding)
+    compression = _infer_compression(path_or_buf, compression)
+    filepath_or_buffer, _, compression = get_filepath_or_buffer(
+        path_or_buf, encoding=encoding, compression=compression,
+    )
 
     json_reader = JsonReader(
         filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
         convert_axes=convert_axes, convert_dates=convert_dates,
         keep_default_dates=keep_default_dates, numpy=numpy,
         precise_float=precise_float, date_unit=date_unit, encoding=encoding,
-        lines=lines, chunksize=chunksize
+        lines=lines, chunksize=chunksize, compression=compression,
     )
 
     if chunksize:
@@ -361,7 +376,7 @@ class JsonReader(BaseIterator):
     """
     def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
                  convert_dates, keep_default_dates, numpy, precise_float,
-                 date_unit, encoding, lines, chunksize):
+                 date_unit, encoding, lines, chunksize, compression):
 
         self.path_or_buf = filepath_or_buffer
         self.orient = orient
@@ -374,6 +389,7 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
         self.precise_float = precise_float
         self.date_unit = date_unit
         self.encoding = encoding
+        self.compression = compression
         self.lines = lines
         self.chunksize = chunksize
         self.nrows_seen = 0
@@ -415,20 +431,20 @@ def _get_data_from_filepath(self, filepath_or_buffer):
 
         data = filepath_or_buffer
 
+        exists = False
         if isinstance(data, compat.string_types):
             try:
                 exists = os.path.exists(filepath_or_buffer)
-
             # gh-5874: if the filepath is too long will raise here
             except (TypeError, ValueError):
                 pass
-            else:
-                if exists:
-                    data, _ = _get_handle(filepath_or_buffer, 'r',
-                                          encoding=self.encoding)
-                    self.should_close = True
-                    self.open_stream = data
+
+        if exists or self.compression is not None:
+            data, _ = _get_handle(filepath_or_buffer, 'r',
+                                  encoding=self.encoding,
+                                  compression=self.compression)
+            self.should_close = True
+            self.open_stream = data
 
         return data
diff --git a/pandas/tests/io/json/data/tsframe_v012.json.zip b/pandas/tests/io/json/data/tsframe_v012.json.zip
new file mode 100644
index 0000000000000..100ba0c87b2ba
Binary files /dev/null and b/pandas/tests/io/json/data/tsframe_v012.json.zip differ
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
new file mode 100644
index 0000000000000..e9976da6f6774
--- /dev/null
+++ b/pandas/tests/io/json/test_compression.py
@@ -0,0 +1,133 @@
+import pytest
+import moto
+
+import pandas as pd
+from pandas import compat
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, assert_raises_regex
+
+
+COMPRESSION_TYPES = [None, 'bz2', 'gzip', 'xz']
+
+
+def decompress_file(path, compression):
+    if compression is None:
+        f = open(path, 'rb')
+    elif compression == 'gzip':
+        import gzip
+        f = gzip.GzipFile(path, 'rb')
+    elif compression == 'bz2':
+        import bz2
+        f = bz2.BZ2File(path, 'rb')
+    elif compression == 'xz':
+        lzma = compat.import_lzma()
+        f = lzma.open(path, 'rb')
+    else:
+        msg = 'Unrecognized compression type: {}'.format(compression)
+        raise ValueError(msg)
+
+    result = f.read().decode('utf8')
+    f.close()
+    return result
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_compression_roundtrip(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
+                       [12.32112, 123123.2, 321321.2]],
+                      index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+    with tm.ensure_clean() as path:
+        df.to_json(path, compression=compression)
+        assert_frame_equal(df, pd.read_json(path, compression=compression))
+
+        # explicitly ensure file was compressed.
+        uncompressed_content = decompress_file(path, compression)
+        assert_frame_equal(df, pd.read_json(uncompressed_content))
+
+
+def test_compress_zip_value_error():
+    df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
+                       [12.32112, 123123.2, 321321.2]],
+                      index=['A', 'B'], columns=['X', 'Y', 'Z'])
+
+    with tm.ensure_clean() as path:
+        import zipfile
+        pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip")
+
+
+def test_read_zipped_json():
+    uncompressed_path = tm.get_data_path("tsframe_v012.json")
+    uncompressed_df = pd.read_json(uncompressed_path)
+
+    compressed_path = tm.get_data_path("tsframe_v012.json.zip")
+    compressed_df = pd.read_json(compressed_path, compression='zip')
+
+    assert_frame_equal(uncompressed_df, compressed_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_with_s3_url(compression):
+    boto3 = pytest.importorskip('boto3')
+    pytest.importorskip('s3fs')
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    with moto.mock_s3():
+        conn = boto3.resource("s3", region_name="us-east-1")
+        bucket = conn.create_bucket(Bucket="pandas-test")
+
+        with tm.ensure_clean() as path:
+            df.to_json(path, compression=compression)
+            with open(path, 'rb') as f:
+                bucket.put_object(Key='test-1', Body=f)
+
+        roundtripped_df = pd.read_json('s3://pandas-test/test-1',
+                                       compression=compression)
+        assert_frame_equal(df, roundtripped_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_lines_with_compression(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    with tm.ensure_clean() as path:
+        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+        df.to_json(path, orient='records', lines=True, compression=compression)
+        roundtripped_df = pd.read_json(path, lines=True,
+                                       compression=compression)
+        assert_frame_equal(df, roundtripped_df)
+
+
+@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
+def test_chunksize_with_compression(compression):
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+
+    with tm.ensure_clean() as path:
+        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
+        df.to_json(path, orient='records', lines=True, compression=compression)
+
+        roundtripped_df = pd.concat(pd.read_json(path, lines=True, chunksize=1,
+                                                 compression=compression))
+        assert_frame_equal(df, roundtripped_df)
+
+
+def test_write_unsupported_compression_type():
+    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
+    with tm.ensure_clean() as path:
+        msg = "Unrecognized compression type: unsupported"
+        assert_raises_regex(ValueError, msg, df.to_json,
+                            path, compression="unsupported")
+
+
+def test_read_unsupported_compression_type():
+    with tm.ensure_clean() as path:
+        msg = "Unrecognized compression type: unsupported"
+        assert_raises_regex(ValueError, msg, pd.read_json,
+                            path, compression="unsupported")
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index d14355b07cf20..95f23e82fced0 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -128,7 +128,7 @@ def test_readjson_chunks_closes(chunksize):
         path, orient=None, typ="frame", dtype=True, convert_axes=True,
         convert_dates=True, keep_default_dates=True, numpy=False,
         precise_float=False, date_unit=None, encoding=None,
-        lines=True, chunksize=chunksize)
+        lines=True, chunksize=chunksize, compression=None)
     reader.read()
     assert reader.open_stream.closed, "didn't close stream with \
         chunksize = %s" % chunksize
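As a closing usage note, the line-delimited tests above correspond to the following workflow. This is a sketch assuming the patch is applied; `records.json.gz` is an illustrative path:

```python
import pandas as pd

df = pd.DataFrame({"a": ["foo", "bar", "baz"], "b": [4, 5, 6]})
df.to_json("records.json.gz", orient="records", lines=True,
           compression="gzip")

# With chunksize, read_json returns an iterator of DataFrames;
# concatenating the chunks reproduces the original frame, which is
# what test_chunksize_with_compression asserts.
chunks = pd.read_json("records.json.gz", lines=True, chunksize=1,
                      compression="gzip")
result = pd.concat(chunks)
pd.testing.assert_frame_equal(df, result)
```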