COMPAT: reading json with lines=True from s3, xref #17200 (#17201)

Merged (22 commits) on Nov 27, 2017
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.1.txt
@@ -88,7 +88,7 @@ I/O
- :func:`read_parquet` now allows specifying kwargs which are passed to the respective engine (:issue:`18216`)
- Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`).
- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)

- Bug in :func:`read_json` not decoding when reading line-delimited JSON from S3 (:issue:`17200`)

Plotting
^^^^^^^^
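The whatsnew entry above is the user-visible change. A minimal sketch of the call this PR fixes, assuming ``s3fs`` is installed and an accessible bucket named ``pandas-test`` holds the file (the same layout the PR's test fixture builds with moto):

```python
import pandas as pd

# Before this fix, the S3 file handle yielded bytes that read_json
# could not split into lines; it now decodes them first.
df = pd.read_json('s3://pandas-test/items.jsonl', lines=True)
print(df)
```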
20 changes: 11 additions & 9 deletions pandas/io/json/json.py
@@ -5,7 +5,7 @@

import pandas._libs.json as json
from pandas._libs.tslib import iNaT
from pandas.compat import StringIO, long, u
from pandas.compat import StringIO, long, u, to_str
from pandas import compat, isna
from pandas import Series, DataFrame, to_datetime, MultiIndex
from pandas.io.common import (get_filepath_or_buffer, _get_handle,
@@ -458,8 +458,10 @@ def read(self):
if self.lines and self.chunksize:
obj = concat(self)
elif self.lines:

data = to_str(self.data)
obj = self._get_object_parser(
self._combine_lines(self.data.split('\n'))
self._combine_lines(data.split('\n'))
)
else:
obj = self._get_object_parser(self.data)
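The hunk above is the core of the fix: an S3 file handle returns ``bytes`` on Python 3, and splitting ``bytes`` with a ``str`` separator raises ``TypeError``. A standalone sketch of what ``to_str`` buys here, using a plain decode instead of ``pandas.compat`` so it runs anywhere:

```python
raw = b'{"a": 1, "b": 2}\n{"b": 2, "a": 1}\n'  # bytes, as an S3 handle returns

# raw.split('\n') raises TypeError on Python 3 (bytes vs. str separator).
# Decoding first, which read() now does via to_str, makes the split safe.
data = raw.decode('utf-8') if isinstance(raw, bytes) else raw
print(data.split('\n'))  # ['{"a": 1, "b": 2}', '{"b": 2, "a": 1}', '']
```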
@@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
dtype = np.dtype(dtype)
return data.astype(dtype), True
except:
except (TypeError, ValueError):
return data, False

if convert_dates:
@@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('float64')
result = True
except:
except (TypeError, ValueError):
pass

if data.dtype.kind == 'f':
@@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('float64')
result = True
except:
except (TypeError, ValueError):
pass

# don't coerce 0-len data
@@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
if (new_data == data).all():
data = new_data
result = True
except:
except (TypeError, ValueError):
pass

# coerce ints to 64
Expand All @@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('int64')
result = True
except:
except (TypeError, ValueError):
pass

return data, result
@@ -680,7 +682,7 @@ def _try_convert_to_date(self, data):
if new_data.dtype == 'object':
try:
new_data = data.astype('int64')
except:
except (TypeError, ValueError):
pass

# ignore numbers that are out of range
@@ -697,7 +699,7 @@ def _try_convert_to_date(self, data):
unit=date_unit)
except ValueError:
continue
except:
except Exception:
break
return new_data, True
return data, False
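The recurring change in this file replaces bare ``except:`` clauses with explicit types: ``astype`` failures surface as ``TypeError`` or ``ValueError``, so only those are caught, while the date-parsing loop keeps a broader ``except Exception`` so that any other failure breaks the loop without also trapping ``KeyboardInterrupt`` or ``SystemExit``. A self-contained sketch of the narrowed pattern:

```python
import pandas as pd

data = pd.Series(['1', '2', 'not-a-number'])

# astype signals failure with ValueError (or TypeError for incompatible
# dtypes); catching exactly those leaves control-flow exceptions alone.
try:
    data = data.astype('float64')
except (TypeError, ValueError):
    pass  # conversion failed; keep the original object dtype
print(data.dtype)  # object
```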
74 changes: 74 additions & 0 deletions pandas/tests/io/conftest.py
@@ -0,0 +1,74 @@
import os

import moto
import pytest
from pandas.io.parsers import read_table

HERE = os.path.dirname(__file__)


@pytest.fixture(scope='module')
def tips_file():
"""Path to the tips dataset"""
return os.path.join(HERE, 'parser', 'data', 'tips.csv')


@pytest.fixture(scope='module')
def jsonl_file():
"""Path a JSONL dataset"""
return os.path.join(HERE, 'parser', 'data', 'items.jsonl')


@pytest.fixture(scope='module')
def salaries_table():
"""DataFrame with the salaries dataset"""
path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
return read_table(path)


@pytest.fixture(scope='module')
def s3_resource(tips_file, jsonl_file):
"""Fixture for mocking S3 interaction.

The primary bucket name is "pandas-test". The following datasets
are loaded.

- tips.csv
- tips.csv.gz
- tips.csv.bz2
- items.jsonl

A private bucket "cant_get_it" is also created. The boto3 s3 resource
is yielded by the fixture.
"""
pytest.importorskip('s3fs')
moto.mock_s3().start()

test_s3_files = [
('tips.csv', tips_file),
('tips.csv.gz', tips_file + '.gz'),
('tips.csv.bz2', tips_file + '.bz2'),
('items.jsonl', jsonl_file),
]

def add_tips_files(bucket_name):
for s3_key, file_name in test_s3_files:
with open(file_name, 'rb') as f:
conn.Bucket(bucket_name).put_object(
Key=s3_key,
Body=f)

boto3 = pytest.importorskip('boto3')
# see gh-16135
bucket = 'pandas-test'

conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket=bucket)
add_tips_files(bucket)

conn.create_bucket(Bucket='cant_get_it', ACL='private')
add_tips_files('cant_get_it')

yield conn

moto.mock_s3().stop()
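For readers unfamiliar with pytest fixtures: any test that takes ``s3_resource`` as an argument runs against the mocked buckets above. A hypothetical test (not part of this PR) sketching the usage:

```python
from pandas.io.parsers import read_csv

def test_tips_roundtrip_from_mocked_s3(s3_resource):
    # moto intercepts the S3 calls, so no real AWS access happens;
    # 'pandas-test' and 'tips.csv' are seeded by the fixture above.
    df = read_csv('s3://pandas-test/tips.csv')
    assert not df.empty
```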
65 changes: 64 additions & 1 deletion pandas/tests/io/json/test_pandas.py
@@ -4,7 +4,6 @@
from pandas.compat import (range, lrange, StringIO,
OrderedDict, is_platform_32bit)
import os

import numpy as np
from pandas import (Series, DataFrame, DatetimeIndex, Timestamp,
read_json, compat)
@@ -1032,6 +1031,70 @@ def test_tz_range_is_utc(self):
df = DataFrame({'DT': dti})
assert dumps(df, iso_dates=True) == dfexp

def test_read_inline_jsonl(self):
# GH9180
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_s3_jsonl(self, s3_resource):
pytest.importorskip('s3fs')
# GH17200

result = read_json('s3n://pandas-test/items.jsonl', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_local_jsonl(self):
# GH17200
with ensure_clean('tmp_items.json') as path:
with open(path, 'w') as f:
f.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
result = read_json(path, lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_jsonl_unicode_chars(self):
# GH15132: non-ascii unicode characters
# \u201d == RIGHT DOUBLE QUOTATION MARK

# simulate file handle
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
json = StringIO(json)
result = read_json(json, lines=True)
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
columns=['a', 'b'])
assert_frame_equal(result, expected)

# simulate string
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
result = read_json(json, lines=True)
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_to_jsonl(self):
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
assert result == expected

df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
assert result == expected
assert_frame_equal(pd.read_json(result, lines=True), df)

# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
columns=["a\\", 'b'])
result = df.to_json(orient="records", lines=True)
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
'{"a\\\\":"foo\\"","b":"bar"}')
assert result == expected
assert_frame_equal(pd.read_json(result, lines=True), df)

def test_latin_encoding(self):
if compat.PY2:
tm.assert_raises_regex(
2 changes: 2 additions & 0 deletions pandas/tests/io/parser/data/items.jsonl
@@ -0,0 +1,2 @@
{"a": 1, "b": 2}
Contributor: what is the purpose of this file?

Contributor: I see, ok you have to have this named .json otherwise it won't be picked up by setup.py (IOW the install test will fail).
{"b":2, "a" :1}
48 changes: 0 additions & 48 deletions pandas/tests/io/parser/test_network.py
@@ -4,62 +4,14 @@
Tests parsers ability to read and parse non-local files
and hence require a network connection to be read.
"""
import os

import pytest
import moto

import pandas.util.testing as tm
from pandas import DataFrame
from pandas.io.parsers import read_csv, read_table
from pandas.compat import BytesIO


@pytest.fixture(scope='module')
def tips_file():
return os.path.join(tm.get_data_path(), 'tips.csv')

Contributor: cool

@pytest.fixture(scope='module')
def salaries_table():
path = os.path.join(tm.get_data_path(), 'salaries.csv')
return read_table(path)


@pytest.fixture(scope='module')
def s3_resource(tips_file):
pytest.importorskip('s3fs')
moto.mock_s3().start()

test_s3_files = [
('tips.csv', tips_file),
('tips.csv.gz', tips_file + '.gz'),
('tips.csv.bz2', tips_file + '.bz2'),
]

def add_tips_files(bucket_name):
for s3_key, file_name in test_s3_files:
with open(file_name, 'rb') as f:
conn.Bucket(bucket_name).put_object(
Key=s3_key,
Body=f)

boto3 = pytest.importorskip('boto3')
# see gh-16135
bucket = 'pandas-test'

conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket=bucket)
add_tips_files(bucket)

conn.create_bucket(Bucket='cant_get_it', ACL='private')
add_tips_files('cant_get_it')

yield conn

moto.mock_s3().stop()


@pytest.mark.network
@pytest.mark.parametrize(
"compression,extension",