ENH: Add transparent compression to json reading/writing #17798

Merged 6 commits on Oct 6, 2017

doc/source/whatsnew/v0.21.0.txt (2 changes: 1 addition & 1 deletion)
@@ -195,7 +195,7 @@ Other Enhancements
- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`)
- :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
- Improved the import time of pandas by about 2.25x (:issue:`16764`)

- :func:`read_json` and :func:`to_json` now accept a ``compression`` argument which allows them to transparently handle compressed files. (:issue:`17798`)

.. _whatsnew_0210.api_breaking:

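To make the whatsnew entry concrete, here is a minimal usage sketch of the new keyword. It is an editor's illustration rather than part of the diff; the file name and frame contents are invented, and read_json's default compression='infer' comes from the read_json changes further down.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# Write gzip-compressed JSON via the new ``compression`` keyword.
df.to_json("frame.json.gz", compression="gzip")

# Read it back; compression='infer' (the default) picks gzip from the ".gz" suffix.
roundtripped = pd.read_json("frame.json.gz")
pd.testing.assert_frame_equal(df, roundtripped)
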
pandas/core/generic.py (10 changes: 8 additions & 2 deletions)
@@ -1258,7 +1258,7 @@ def _repr_latex_(self):

def to_json(self, path_or_buf=None, orient=None, date_format=None,
double_precision=10, force_ascii=True, date_unit='ms',
default_handler=None, lines=False):
default_handler=None, lines=False, compression=None):
"""
Convert the object to a JSON string.

@@ -1320,6 +1320,12 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,

.. versionadded:: 0.19.0

compression : {None, 'gzip', 'bz2', 'xz'}
A string representing the compression to use in the output file,
only used when the first argument is a filename

.. versionadded:: 0.21.0

Returns
-------
same type as input object with filtered info axis
@@ -1372,7 +1378,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
double_precision=double_precision,
force_ascii=force_ascii, date_unit=date_unit,
default_handler=default_handler,
lines=lines)
lines=lines, compression=compression)

def to_hdf(self, path_or_buf, key, **kwargs):
"""Write the contained data to an HDF5 file using HDFStore.
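
A short illustration (editor's sketch, not from the PR) of the behaviour documented above: the new ``compression`` keyword on DataFrame.to_json only takes effect when the first argument is a filename; when path_or_buf is omitted, a plain JSON string is returned and the keyword is ignored.

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0]})

# Filename given: the output file is bz2-compressed.
df.to_json("x.json.bz2", compression="bz2")

# No filename: an ordinary (uncompressed) JSON string is returned; the
# compression keyword is simply not used, per the docstring above.
s = df.to_json(compression="bz2")
print(s)
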
pandas/io/json/json.py (46 changes: 31 additions & 15 deletions)
@@ -9,7 +9,8 @@
from pandas import compat, isna
from pandas import Series, DataFrame, to_datetime, MultiIndex
from pandas.io.common import (get_filepath_or_buffer, _get_handle,
_stringify_path, BaseIterator)
_infer_compression, _stringify_path,
BaseIterator)
from pandas.io.parsers import _validate_integer
from pandas.core.common import AbstractMethodError
from pandas.core.reshape.concat import concat
@@ -27,7 +28,7 @@
# interface to/from
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
double_precision=10, force_ascii=True, date_unit='ms',
default_handler=None, lines=False):
default_handler=None, lines=False, compression=None):

path_or_buf = _stringify_path(path_or_buf)
if lines and orient != 'records':
@@ -54,8 +55,11 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
s = _convert_to_line_delimits(s)

if isinstance(path_or_buf, compat.string_types):
with open(path_or_buf, 'w') as fh:
fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
try:
fh.write(s)
finally:
fh.close()
elif path_or_buf is None:
return s
else:
@@ -178,7 +182,7 @@ def write(self):
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
convert_axes=True, convert_dates=True, keep_default_dates=True,
numpy=False, precise_float=False, date_unit=None, encoding=None,
lines=False, chunksize=None):
lines=False, chunksize=None, compression='infer'):
"""
Convert a JSON string to pandas object

@@ -277,6 +281,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,

.. versionadded:: 0.21.0

compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, zip or xz if path_or_buf is a string ending in
'.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
otherwise. If using 'zip', the ZIP file must contain only one data
file to be read in. Set to None for no decompression.

.. versionadded:: 0.21.0

Returns
-------
result : Series or DataFrame, depending on the value of `typ`.
@@ -334,15 +347,17 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
{"index": "row 2", "col 1": "c", "col 2": "d"}]}'
"""

filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
encoding=encoding)
compression = _infer_compression(path_or_buf, compression)
filepath_or_buffer, _, compression = get_filepath_or_buffer(
path_or_buf, encoding=encoding, compression=compression,
)

json_reader = JsonReader(
filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
convert_axes=convert_axes, convert_dates=convert_dates,
keep_default_dates=keep_default_dates, numpy=numpy,
precise_float=precise_float, date_unit=date_unit, encoding=encoding,
lines=lines, chunksize=chunksize
lines=lines, chunksize=chunksize, compression=compression,
)

if chunksize:
@@ -361,7 +376,7 @@ class JsonReader(BaseIterator):
"""
def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
convert_dates, keep_default_dates, numpy, precise_float,
date_unit, encoding, lines, chunksize):
date_unit, encoding, lines, chunksize, compression):

self.path_or_buf = filepath_or_buffer
self.orient = orient
@@ -374,6 +389,7 @@ def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
self.precise_float = precise_float
self.date_unit = date_unit
self.encoding = encoding
self.compression = compression
self.lines = lines
self.chunksize = chunksize
self.nrows_seen = 0
@@ -415,20 +431,20 @@ def _get_data_from_filepath(self, filepath_or_buffer):

data = filepath_or_buffer

exists = False
if isinstance(data, compat.string_types):
try:
exists = os.path.exists(filepath_or_buffer)

# gh-5874: if the filepath is too long will raise here
except (TypeError, ValueError):
pass

else:
if exists:
data, _ = _get_handle(filepath_or_buffer, 'r',
encoding=self.encoding)
self.should_close = True
self.open_stream = data
if exists or self.compression is not None:
data, _ = _get_handle(filepath_or_buffer, 'r',
encoding=self.encoding,
compression=self.compression)
self.should_close = True
self.open_stream = data

return data

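As a reading aid for the compression='infer' path through _infer_compression and _get_handle above, the sketch below reproduces only the extension mapping described in the read_json docstring. It is an editor's illustration, not the pandas implementation.

def infer_compression_from_path(path, compression="infer"):
    # Editor's sketch of the documented 'infer' behaviour of read_json.
    if compression != "infer":
        return compression              # an explicit value (including None) wins
    if not isinstance(path, str):
        return None                     # nothing to infer from a buffer
    for suffix, codec in ((".gz", "gzip"), (".bz2", "bz2"),
                          (".zip", "zip"), (".xz", "xz")):
        if path.endswith(suffix):
            return codec
    return None                         # unknown extension: no decompression


print(infer_compression_from_path("data.json.gz"))   # gzip
print(infer_compression_from_path("data.json"))      # None
print(infer_compression_from_path("data.json.xz", compression=None))  # None
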
pandas/tests/io/json/data/tsframe_v012.json.zip (binary file added)

pandas/tests/io/json/test_compression.py (133 changes: 133 additions & 0 deletions)
@@ -0,0 +1,133 @@
import pytest
import moto

import pandas as pd
from pandas import compat
import pandas.util.testing as tm
from pandas.util.testing import assert_frame_equal, assert_raises_regex


COMPRESSION_TYPES = [None, 'bz2', 'gzip', 'xz']


def decompress_file(path, compression):
if compression is None:
f = open(path, 'rb')
elif compression == 'gzip':
import gzip
f = gzip.GzipFile(path, 'rb')
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(path, 'rb')
elif compression == 'xz':
lzma = compat.import_lzma()
f = lzma.open(path, 'rb')
else:
msg = 'Unrecognized compression type: {}'.format(compression)
raise ValueError(msg)

result = f.read().decode('utf8')
f.close()
return result


@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
def test_compression_roundtrip(compression):
if compression == 'xz':
tm._skip_if_no_lzma()

df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with tm.ensure_clean() as path:
df.to_json(path, compression=compression)
assert_frame_equal(df, pd.read_json(path, compression=compression))

# explicitly ensure file was compressed.
uncompressed_content = decompress_file(path, compression)
assert_frame_equal(df, pd.read_json(uncompressed_content))


def test_compress_zip_value_error():
df = pd.DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with tm.ensure_clean() as path:
import zipfile
pytest.raises(zipfile.BadZipfile, df.to_json, path, compression="zip")


def test_read_zipped_json():
uncompressed_path = tm.get_data_path("tsframe_v012.json")
uncompressed_df = pd.read_json(uncompressed_path)

compressed_path = tm.get_data_path("tsframe_v012.json.zip")
compressed_df = pd.read_json(compressed_path, compression='zip')

assert_frame_equal(uncompressed_df, compressed_df)


@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
def test_with_s3_url(compression):
Contributor review comment:
This shares some code with the (to be merged) #17201

I think it's fine for now, but we'll want to clean it up whenever the latter is merged. Since this is clean at the moment, I think we'll merge it, and then refactor this test in #17201.

boto3 = pytest.importorskip('boto3')
pytest.importorskip('s3fs')
if compression == 'xz':
tm._skip_if_no_lzma()

df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
with moto.mock_s3():
conn = boto3.resource("s3", region_name="us-east-1")
bucket = conn.create_bucket(Bucket="pandas-test")

with tm.ensure_clean() as path:
df.to_json(path, compression=compression)
with open(path, 'rb') as f:
bucket.put_object(Key='test-1', Body=f)

roundtripped_df = pd.read_json('s3://pandas-test/test-1',
compression=compression)
assert_frame_equal(df, roundtripped_df)


@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
def test_lines_with_compression(compression):
if compression == 'xz':
tm._skip_if_no_lzma()

with tm.ensure_clean() as path:
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
df.to_json(path, orient='records', lines=True, compression=compression)
roundtripped_df = pd.read_json(path, lines=True,
compression=compression)
assert_frame_equal(df, roundtripped_df)


@pytest.mark.parametrize('compression', COMPRESSION_TYPES)
def test_chunksize_with_compression(compression):
if compression == 'xz':
tm._skip_if_no_lzma()

with tm.ensure_clean() as path:
df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
df.to_json(path, orient='records', lines=True, compression=compression)

roundtripped_df = pd.concat(pd.read_json(path, lines=True, chunksize=1,
compression=compression))
assert_frame_equal(df, roundtripped_df)


def test_write_unsupported_compression_type():
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
with tm.ensure_clean() as path:
msg = "Unrecognized compression type: unsupported"
assert_raises_regex(ValueError, msg, df.to_json,
path, compression="unsupported")


def test_read_unsupported_compression_type():
with tm.ensure_clean() as path:
msg = "Unrecognized compression type: unsupported"
assert_raises_regex(ValueError, msg, pd.read_json,
path, compression="unsupported")
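
The tests above encode an asymmetry worth noting: ZIP archives can be read (test_read_zipped_json) but not written (test_compress_zip_value_error). A hedged usage sketch with invented file names, not part of the PR:

import pandas as pd

# Reading a zipped JSON file works when the archive contains a single data file.
df = pd.read_json("tsframe.json.zip", compression="zip")

# Writing with compression="zip" is not supported; the tests above expect
# zipfile.BadZipfile to be raised.
try:
    df.to_json("out.json.zip", compression="zip")
except Exception as exc:
    print("zip writing is not supported:", exc)
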
pandas/tests/io/json/test_readlines.py (2 changes: 1 addition & 1 deletion)
@@ -128,7 +128,7 @@ def test_readjson_chunks_closes(chunksize):
path, orient=None, typ="frame", dtype=True, convert_axes=True,
convert_dates=True, keep_default_dates=True, numpy=False,
precise_float=False, date_unit=None, encoding=None,
lines=True, chunksize=chunksize)
lines=True, chunksize=chunksize, compression=None)
reader.read()
assert reader.open_stream.closed, "didn't close stream with \
chunksize = %s" % chunksize