diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8aae870d50716..89b4240afe694 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -567,6 +567,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`) - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`) - Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) +- :meth:`read_json` now avoids reading the entire file into memory when ``chunksize`` is specified (:issue:`34548`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index da085d0d0eb2f..e1ac7b1b02f21 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -630,7 +630,7 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. 
""" - if hasattr(data, "read") and (not self.chunksize or not self.nrows): + if hasattr(data, "read") and not (self.chunksize or self.nrows): data = data.read() self.close() if not hasattr(data, "read") and (self.chunksize or self.nrows): diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 4bbd81ada995b..099d99507e136 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -252,3 +252,31 @@ def test_readjson_lines_chunks_fileurl(datapath): with pd.read_json(file_url, lines=True, chunksize=1) as url_reader: for index, chuck in enumerate(url_reader): tm.assert_frame_equal(chuck, df_list_expected[index]) + + +def test_chunksize_is_incremental(): + # See https://github.com/pandas-dev/pandas/issues/34548 + jsonl = ( + """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}\n""" + * 1000 + ) + + class MyReader: + def __init__(self, contents): + self.read_count = 0 + self.stringio = StringIO(contents) + + def read(self, *args): + self.read_count += 1 + return self.stringio.read(*args) + + def __iter__(self): + self.read_count += 1 + return iter(self.stringio) + + reader = MyReader(jsonl) + assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1 + assert reader.read_count > 10