COMPAT: reading json with lines=True from s3, xref #17200 (#17201)
Kevin Kuhl authored and jreback committed Nov 27, 2017
1 parent f7c79be commit 4fd104a
Showing 6 changed files with 152 additions and 59 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.1.txt
@@ -88,7 +88,7 @@ I/O
- :func:`read_parquet` now allows specifying kwargs that are passed to the respective engine (:issue:`18216`)
- Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`).
- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)

- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`)

Plotting
^^^^^^^^
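As context for the whatsnew entry above, a minimal usage sketch (the bucket and key below are hypothetical, and s3fs must be installed):

    import pandas as pd

    # Before this fix, the bytes coming back from S3 were never decoded,
    # so line-delimited reads failed on Python 3.
    df = pd.read_json('s3://my-bucket/records.jsonl', lines=True)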
20 changes: 11 additions & 9 deletions pandas/io/json/json.py
@@ -5,7 +5,7 @@

import pandas._libs.json as json
from pandas._libs.tslib import iNaT
from pandas.compat import StringIO, long, u
from pandas.compat import StringIO, long, u, to_str
from pandas import compat, isna
from pandas import Series, DataFrame, to_datetime, MultiIndex
from pandas.io.common import (get_filepath_or_buffer, _get_handle,
@@ -458,8 +458,10 @@ def read(self):
if self.lines and self.chunksize:
obj = concat(self)
elif self.lines:

data = to_str(self.data)
obj = self._get_object_parser(
self._combine_lines(self.data.split('\n'))
self._combine_lines(data.split('\n'))
)
else:
obj = self._get_object_parser(self.data)
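The decoding problem in a nutshell, as a standalone sketch (not pandas internals): s3fs hands back bytes, and on Python 3 splitting bytes on a str newline raises TypeError, which is why the buffer now goes through compat.to_str before the split:

    payload = b'{"a": 1, "b": 2}\n{"b": 2, "a": 1}\n'  # what an S3 handle yields
    # payload.split('\n') would raise TypeError on Python 3:
    # "a bytes-like object is required, not 'str'"
    lines = payload.decode('utf-8').split('\n')  # roughly what to_str enables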
@@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
dtype = np.dtype(dtype)
return data.astype(dtype), True
except:
except (TypeError, ValueError):
return data, False

if convert_dates:
@@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('float64')
result = True
except:
except (TypeError, ValueError):
pass

if data.dtype.kind == 'f':
@@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('float64')
result = True
except:
except (TypeError, ValueError):
pass

# don't coerce 0-len data
@@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
if (new_data == data).all():
data = new_data
result = True
except:
except (TypeError, ValueError):
pass

# coerce ints to 64
@@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('int64')
result = True
except:
except (TypeError, ValueError):
pass

return data, result
@@ -680,7 +682,7 @@ def _try_convert_to_date(self, data):
if new_data.dtype == 'object':
try:
new_data = data.astype('int64')
except:
except (TypeError, ValueError):
pass

# ignore numbers that are out of range
@@ -697,7 +699,7 @@ def _try_convert_to_date(self, data):
unit=date_unit)
except ValueError:
continue
except:
except Exception:
break
return new_data, True
return data, False
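The remaining hunks in this file replace bare except: clauses with except (TypeError, ValueError):. A short standalone illustration of why (mirroring the pattern in _try_convert_data, not copied from it):

    import pandas as pd

    data = pd.Series(['1', '2', 'not-a-number'])
    try:
        data = data.astype('int64')  # raises ValueError on the third value
    except (TypeError, ValueError):
        # Only conversion failures are swallowed; a bare `except:` would also
        # trap KeyboardInterrupt and SystemExit.
        pass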
74 changes: 74 additions & 0 deletions pandas/tests/io/conftest.py
@@ -0,0 +1,74 @@
import os

import moto
import pytest
from pandas.io.parsers import read_table

HERE = os.path.dirname(__file__)


@pytest.fixture(scope='module')
def tips_file():
"""Path to the tips dataset"""
return os.path.join(HERE, 'parser', 'data', 'tips.csv')


@pytest.fixture(scope='module')
def jsonl_file():
"""Path a JSONL dataset"""
return os.path.join(HERE, 'parser', 'data', 'items.jsonl')


@pytest.fixture(scope='module')
def salaries_table():
"""DataFrame with the salaries dataset"""
path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
return read_table(path)


@pytest.fixture(scope='module')
def s3_resource(tips_file, jsonl_file):
"""Fixture for mocking S3 interaction.
The primary bucket name is "pandas-test". The following datasets
are loaded.
- tips.csv
- tips.csv.gz
- tips.csv.bz2
- items.jsonl
A private bucket "cant_get_it" is also created. The boto3 s3 resource
is yielded by the fixture.
"""
pytest.importorskip('s3fs')
moto.mock_s3().start()

test_s3_files = [
('tips.csv', tips_file),
('tips.csv.gz', tips_file + '.gz'),
('tips.csv.bz2', tips_file + '.bz2'),
('items.jsonl', jsonl_file),
]

def add_tips_files(bucket_name):
for s3_key, file_name in test_s3_files:
with open(file_name, 'rb') as f:
conn.Bucket(bucket_name).put_object(
Key=s3_key,
Body=f)

boto3 = pytest.importorskip('boto3')
# see gh-16135
bucket = 'pandas-test'

conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket=bucket)
add_tips_files(bucket)

conn.create_bucket(Bucket='cant_get_it', ACL='private')
add_tips_files('cant_get_it')

yield conn

moto.mock_s3().stop()
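Hoisting these fixtures into pandas/tests/io/conftest.py lets the parser and JSON suites share one mocked S3 setup. A hypothetical consumer of the fixture (the test name and body are illustrative, not part of this commit):

    import pandas as pd

    def test_tips_from_mock_s3(s3_resource):
        # `s3_resource` provides the moto-backed "pandas-test" bucket,
        # so no real network traffic occurs.
        df = pd.read_csv('s3://pandas-test/tips.csv')
        assert not df.empty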
65 changes: 64 additions & 1 deletion pandas/tests/io/json/test_pandas.py
@@ -4,7 +4,6 @@
from pandas.compat import (range, lrange, StringIO,
OrderedDict, is_platform_32bit)
import os

import numpy as np
from pandas import (Series, DataFrame, DatetimeIndex, Timestamp,
read_json, compat)
@@ -1032,6 +1031,70 @@ def test_tz_range_is_utc(self):
df = DataFrame({'DT': dti})
assert dumps(df, iso_dates=True) == dfexp

def test_read_inline_jsonl(self):
# GH9180
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_s3_jsonl(self, s3_resource):
pytest.importorskip('s3fs')
# GH17200

result = read_json('s3n://pandas-test/items.jsonl', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_local_jsonl(self):
# GH17200
with ensure_clean('tmp_items.json') as path:
with open(path, 'w') as infile:
infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
result = read_json(path, lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_jsonl_unicode_chars(self):
# GH15132: non-ascii unicode characters
# \u201d == RIGHT DOUBLE QUOTATION MARK

# simulate file handle
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
json = StringIO(json)
result = read_json(json, lines=True)
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
columns=['a', 'b'])
assert_frame_equal(result, expected)

# simulate string
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
result = read_json(json, lines=True)
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_to_jsonl(self):
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
assert result == expected

df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
assert result == expected
assert_frame_equal(pd.read_json(result, lines=True), df)

# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
columns=["a\\", 'b'])
result = df.to_json(orient="records", lines=True)
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
'{"a\\\\":"foo\\"","b":"bar"}')
assert result == expected
assert_frame_equal(pd.read_json(result, lines=True), df)

def test_latin_encoding(self):
if compat.PY2:
tm.assert_raises_regex(
2 changes: 2 additions & 0 deletions pandas/tests/io/parser/data/items.jsonl
@@ -0,0 +1,2 @@
{"a": 1, "b": 2}
{"b":2, "a" :1}
48 changes: 0 additions & 48 deletions pandas/tests/io/parser/test_network.py
@@ -4,62 +4,14 @@
Tests parsers ability to read and parse non-local files
and hence require a network connection to be read.
"""
import os

import pytest
import moto

import pandas.util.testing as tm
from pandas import DataFrame
from pandas.io.parsers import read_csv, read_table
from pandas.compat import BytesIO


@pytest.fixture(scope='module')
def tips_file():
return os.path.join(tm.get_data_path(), 'tips.csv')


@pytest.fixture(scope='module')
def salaries_table():
path = os.path.join(tm.get_data_path(), 'salaries.csv')
return read_table(path)


@pytest.fixture(scope='module')
def s3_resource(tips_file):
pytest.importorskip('s3fs')
moto.mock_s3().start()

test_s3_files = [
('tips.csv', tips_file),
('tips.csv.gz', tips_file + '.gz'),
('tips.csv.bz2', tips_file + '.bz2'),
]

def add_tips_files(bucket_name):
for s3_key, file_name in test_s3_files:
with open(file_name, 'rb') as f:
conn.Bucket(bucket_name).put_object(
Key=s3_key,
Body=f)

boto3 = pytest.importorskip('boto3')
# see gh-16135
bucket = 'pandas-test'

conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket=bucket)
add_tips_files(bucket)

conn.create_bucket(Bucket='cant_get_it', ACL='private')
add_tips_files('cant_get_it')

yield conn

moto.mock_s3().stop()


@pytest.mark.network
@pytest.mark.parametrize(
"compression,extension",
