diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 4389dbcff280d..d1c3a4ba32603 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -88,7 +88,7 @@ I/O
 - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`)
 - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`).
 - Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)
-
+- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
index 11bf3a9363953..21736673350d8 100644
--- a/pandas/io/json/json.py
+++ b/pandas/io/json/json.py
@@ -5,7 +5,7 @@
 
 import pandas._libs.json as json
 from pandas._libs.tslib import iNaT
-from pandas.compat import StringIO, long, u
+from pandas.compat import StringIO, long, u, to_str
 from pandas import compat, isna
 from pandas import Series, DataFrame, to_datetime, MultiIndex
 from pandas.io.common import (get_filepath_or_buffer, _get_handle,
@@ -458,8 +458,10 @@ def read(self):
         if self.lines and self.chunksize:
             obj = concat(self)
         elif self.lines:
+
+            data = to_str(self.data)
             obj = self._get_object_parser(
-                self._combine_lines(self.data.split('\n'))
+                self._combine_lines(data.split('\n'))
             )
         else:
             obj = self._get_object_parser(self.data)
@@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
                     try:
                         dtype = np.dtype(dtype)
                         return data.astype(dtype), True
-                    except:
+                    except (TypeError, ValueError):
                         return data, False
 
         if convert_dates:
@@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
             try:
                 data = data.astype('float64')
                 result = True
-            except:
+            except (TypeError, ValueError):
                 pass
 
         if data.dtype.kind == 'f':
@@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
                 try:
                     data = data.astype('float64')
                     result = True
-                except:
+                except (TypeError, ValueError):
                     pass
 
         # do't coerce 0-len data
@@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
                 if (new_data == data).all():
                     data = new_data
                     result = True
-            except:
+            except (TypeError, ValueError):
                 pass
 
         # coerce ints to 64
@@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
             try:
                 data = data.astype('int64')
                 result = True
-            except:
+            except (TypeError, ValueError):
                 pass
 
         return data, result
@@ -680,7 +682,7 @@ def _try_convert_to_date(self, data):
         if new_data.dtype == 'object':
             try:
                 new_data = data.astype('int64')
-            except:
+            except (TypeError, ValueError):
                 pass
 
         # ignore numbers that are out of range
@@ -697,7 +699,7 @@ def _try_convert_to_date(self, data):
                                    unit=date_unit)
             except ValueError:
                 continue
-            except:
+            except Exception:
                 break
             return new_data, True
         return data, False
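
Note (not part of the patch): a minimal sketch of the failure the new to_str
call guards against. s3fs hands read_json a bytes payload, and on Python 3
bytes.split('\n') raises because the separator is a str. The payload below is
a hypothetical stand-in for the buffer that comes back from S3:

    # Roughly what pandas.compat.to_str amounts to here: decode bytes,
    # pass text through unchanged.
    payload = b'{"a": 1, "b": 2}\n{"b": 2, "a": 1}\n'

    # payload.split('\n')  # TypeError on Python 3: a bytes-like object
    #                      # is required, not 'str'

    text = payload.decode('utf-8') if isinstance(payload, bytes) else payload
    print([line for line in text.split('\n') if line])
    # ['{"a": 1, "b": 2}', '{"b": 2, "a": 1}']
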
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
new file mode 100644
index 0000000000000..828d5d0ccd3c6
--- /dev/null
+++ b/pandas/tests/io/conftest.py
@@ -0,0 +1,74 @@
+import os
+
+import moto
+import pytest
+from pandas.io.parsers import read_table
+
+HERE = os.path.dirname(__file__)
+
+
+@pytest.fixture(scope='module')
+def tips_file():
+    """Path to the tips dataset"""
+    return os.path.join(HERE, 'parser', 'data', 'tips.csv')
+
+
+@pytest.fixture(scope='module')
+def jsonl_file():
+    """Path to a JSONL dataset"""
+    return os.path.join(HERE, 'parser', 'data', 'items.jsonl')
+
+
+@pytest.fixture(scope='module')
+def salaries_table():
+    """DataFrame with the salaries dataset"""
+    path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
+    return read_table(path)
+
+
+@pytest.fixture(scope='module')
+def s3_resource(tips_file, jsonl_file):
+    """Fixture for mocking S3 interaction.
+
+    The primary bucket name is "pandas-test". The following datasets
+    are loaded.
+
+    - tips.csv
+    - tips.csv.gz
+    - tips.csv.bz2
+    - items.jsonl
+
+    A private bucket "cant_get_it" is also created. The boto3 s3 resource
+    is yielded by the fixture.
+    """
+    pytest.importorskip('s3fs')
+    moto.mock_s3().start()
+
+    test_s3_files = [
+        ('tips.csv', tips_file),
+        ('tips.csv.gz', tips_file + '.gz'),
+        ('tips.csv.bz2', tips_file + '.bz2'),
+        ('items.jsonl', jsonl_file),
+    ]
+
+    def add_tips_files(bucket_name):
+        for s3_key, file_name in test_s3_files:
+            with open(file_name, 'rb') as f:
+                conn.Bucket(bucket_name).put_object(
+                    Key=s3_key,
+                    Body=f)
+
+    boto3 = pytest.importorskip('boto3')
+    # see gh-16135
+    bucket = 'pandas-test'
+
+    conn = boto3.resource("s3", region_name="us-east-1")
+    conn.create_bucket(Bucket=bucket)
+    add_tips_files(bucket)
+
+    conn.create_bucket(Bucket='cant_get_it', ACL='private')
+    add_tips_files('cant_get_it')
+
+    yield conn
+
+    moto.mock_s3().stop()
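
Note (not part of the patch): with the fixture moved into
pandas/tests/io/conftest.py, any test module under pandas/tests/io/ can
request s3_resource by name and pytest stands up the mocked buckets
automatically. A hypothetical consumer (test name and assertions are
illustrative; total_bill is a column of the tips dataset):

    import pandas as pd

    def test_tips_from_mocked_s3(s3_resource):
        # moto intercepts the boto3/s3fs calls, so no network traffic
        # occurs; "pandas-test" is the bucket the fixture creates.
        df = pd.read_csv('s3://pandas-test/tips.csv')
        assert not df.empty
        assert 'total_bill' in df.columns
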
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 1c895f7e9e89a..fe447534efdc7 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -4,7 +4,6 @@
 from pandas.compat import (range, lrange, StringIO, OrderedDict,
                            is_platform_32bit)
 import os
-
 import numpy as np
 from pandas import (Series, DataFrame, DatetimeIndex, Timestamp,
                     read_json, compat)
@@ -1032,6 +1031,70 @@ def test_tz_range_is_utc(self):
         df = DataFrame({'DT': dti})
         assert dumps(df, iso_dates=True) == dfexp
 
+    def test_read_inline_jsonl(self):
+        # GH9180
+        result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
+        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+        assert_frame_equal(result, expected)
+
+    def test_read_s3_jsonl(self, s3_resource):
+        pytest.importorskip('s3fs')
+        # GH17200
+
+        result = read_json('s3n://pandas-test/items.jsonl', lines=True)
+        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+        assert_frame_equal(result, expected)
+
+    def test_read_local_jsonl(self):
+        # GH17200
+        with ensure_clean('tmp_items.json') as path:
+            with open(path, 'w') as infile:
+                infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
+            result = read_json(path, lines=True)
+            expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+            assert_frame_equal(result, expected)
+
+    def test_read_jsonl_unicode_chars(self):
+        # GH15132: non-ascii unicode characters
+        # \u201d == RIGHT DOUBLE QUOTATION MARK
+
+        # simulate file handle
+        json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+        json = StringIO(json)
+        result = read_json(json, lines=True)
+        expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+                             columns=['a', 'b'])
+        assert_frame_equal(result, expected)
+
+        # simulate string
+        json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+        result = read_json(json, lines=True)
+        expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+                             columns=['a', 'b'])
+        assert_frame_equal(result, expected)
+
+    def test_to_jsonl(self):
+        # GH9180
+        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
+        result = df.to_json(orient="records", lines=True)
+        expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
+        assert result == expected
+
+        df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
+        result = df.to_json(orient="records", lines=True)
+        expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
+        assert result == expected
+        assert_frame_equal(pd.read_json(result, lines=True), df)
+
+        # GH15096: escaped characters in columns and data
+        df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
+                       columns=["a\\", 'b'])
+        result = df.to_json(orient="records", lines=True)
+        expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
+                    '{"a\\\\":"foo\\"","b":"bar"}')
+        assert result == expected
+        assert_frame_equal(pd.read_json(result, lines=True), df)
+
     def test_latin_encoding(self):
         if compat.PY2:
             tm.assert_raises_regex(
diff --git a/pandas/tests/io/parser/data/items.jsonl b/pandas/tests/io/parser/data/items.jsonl
new file mode 100644
index 0000000000000..f784d37befa82
--- /dev/null
+++ b/pandas/tests/io/parser/data/items.jsonl
@@ -0,0 +1,2 @@
+{"a": 1, "b": 2}
+{"b":2, "a" :1}
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index 27cc708889fa2..d00d3f31ce189 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -4,10 +4,7 @@
 Tests parsers ability to read and parse non-local files
 and hence require a network connection to be read.
 """
-import os
-
 import pytest
-import moto
 
 import pandas.util.testing as tm
 from pandas import DataFrame
@@ -15,51 +12,6 @@
 from pandas.compat import BytesIO
 
 
-@pytest.fixture(scope='module')
-def tips_file():
-    return os.path.join(tm.get_data_path(), 'tips.csv')
-
-
-@pytest.fixture(scope='module')
-def salaries_table():
-    path = os.path.join(tm.get_data_path(), 'salaries.csv')
-    return read_table(path)
-
-
-@pytest.fixture(scope='module')
-def s3_resource(tips_file):
-    pytest.importorskip('s3fs')
-    moto.mock_s3().start()
-
-    test_s3_files = [
-        ('tips.csv', tips_file),
-        ('tips.csv.gz', tips_file + '.gz'),
-        ('tips.csv.bz2', tips_file + '.bz2'),
-    ]
-
-    def add_tips_files(bucket_name):
-        for s3_key, file_name in test_s3_files:
-            with open(file_name, 'rb') as f:
-                conn.Bucket(bucket_name).put_object(
-                    Key=s3_key,
-                    Body=f)
-
-    boto3 = pytest.importorskip('boto3')
-    # see gh-16135
-    bucket = 'pandas-test'
-
-    conn = boto3.resource("s3", region_name="us-east-1")
-    conn.create_bucket(Bucket=bucket)
-    add_tips_files(bucket)
-
-    conn.create_bucket(Bucket='cant_get_it', ACL='private')
-    add_tips_files('cant_get_it')
-
-    yield conn
-
-    moto.mock_s3().stop()
-
-
 @pytest.mark.network
 @pytest.mark.parametrize(
     "compression,extension",
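
Note (not part of the patch): the user-facing call this change fixes,
mirroring test_read_s3_jsonl above. The bucket and key exist only under the
mocked fixture, or with real S3 credentials:

    import pandas as pd

    # Previously the S3 handle returned bytes and the '\n' split raised
    # on Python 3; with the to_str() decode the lines parse as expected.
    df = pd.read_json('s3n://pandas-test/items.jsonl', lines=True)
    print(df)
    #    a  b
    # 0  1  2
    # 1  1  2
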