Skip to content

Commit

Permalink
Backport PR #38293: BUG: read_json does not respect chunksize (#38658)
Browse files Browse the repository at this point in the history
Co-authored-by: Robert Bradshaw <robertwb@gmail.com>
  • Loading branch information
meeseeksmachine and robertwb authored Dec 23, 2020
1 parent fd9670e commit 2a4c3c6
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`)
- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`)
- Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
- :func:`read_json` now avoids reading entire file into memory when chunksize is specified (:issue:`34548`)

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ def _preprocess_data(self, data):
If self.chunksize, we prepare the data for the `__next__` method.
Otherwise, we read it into memory for the `read` method.
"""
if hasattr(data, "read") and (not self.chunksize or not self.nrows):
if hasattr(data, "read") and not (self.chunksize or self.nrows):
data = data.read()
self.close()
if not hasattr(data, "read") and (self.chunksize or self.nrows):
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/io/json/test_readlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,3 +252,31 @@ def test_readjson_lines_chunks_fileurl(datapath):
with pd.read_json(file_url, lines=True, chunksize=1) as url_reader:
for index, chuck in enumerate(url_reader):
tm.assert_frame_equal(chuck, df_list_expected[index])


def test_chunksize_is_incremental():
    # Regression test: when chunksize is given, read_json must consume the
    # underlying stream incrementally rather than slurping it in one read().
    # See https://github.com/pandas-dev/pandas/issues/34548
    record_block = (
        '{"a": 1, "b": 2}\n'
        '{"a": 3, "b": 4}\n'
        '{"a": 5, "b": 6}\n'
        '{"a": 7, "b": 8}\n'
    )
    jsonl = record_block * 1000

    class CountingReader:
        # File-like wrapper that tallies how often it is read from, so the
        # test can distinguish incremental access from a single bulk read.
        def __init__(self, contents):
            self.read_count = 0
            self._buffer = StringIO(contents)

        def read(self, *args):
            self.read_count += 1
            return self._buffer.read(*args)

        def __iter__(self):
            self.read_count += 1
            return iter(self._buffer)

    source = CountingReader(jsonl)
    chunks = list(pd.read_json(source, lines=True, chunksize=100))
    # Multiple chunks must come back, and the reader must have been consulted
    # many times — proof the data was pulled piecemeal.
    assert len(chunks) > 1
    assert source.read_count > 10

0 comments on commit 2a4c3c6

Please sign in to comment.