COMPAT: reading json with lines=True from s3, xref #17200 (#17201)

Merged (22 commits) on Nov 27, 2017
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.1.txt
@@ -88,7 +88,7 @@ I/O
- :func:`read_parquet` now allows specifying kwargs which are passed to the respective engine (:issue:`18216`)
- Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`).
- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)

- Bug in :func:`read_json` not decoding when reading line-delimited JSON from S3 (:issue:`17200`)

Plotting
^^^^^^^^
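The whatsnew entry above is the user-visible change. A minimal sketch of the call this PR fixes, assuming ``s3fs`` is installed and an accessible bucket named ``pandas-test`` holds the file (the same layout the PR's test fixture builds with moto):

```python
import pandas as pd

# Before this fix, the S3 file handle yielded bytes that read_json
# could not split into lines; it now decodes them first.
df = pd.read_json('s3://pandas-test/items.jsonl', lines=True)
print(df)
```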
20 changes: 11 additions & 9 deletions pandas/io/json/json.py
@@ -5,7 +5,7 @@

import pandas._libs.json as json
from pandas._libs.tslib import iNaT
from pandas.compat import StringIO, long, u
from pandas.compat import StringIO, long, u, to_str
from pandas import compat, isna
from pandas import Series, DataFrame, to_datetime, MultiIndex
from pandas.io.common import (get_filepath_or_buffer, _get_handle,
@@ -458,8 +458,10 @@ def read(self):
if self.lines and self.chunksize:
obj = concat(self)
elif self.lines:

data = to_str(self.data)
obj = self._get_object_parser(
self._combine_lines(self.data.split('\n'))
self._combine_lines(data.split('\n'))
)
else:
obj = self._get_object_parser(self.data)
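The hunk above is the core of the fix: an S3 file handle returns ``bytes`` on Python 3, and splitting ``bytes`` with a ``str`` separator raises ``TypeError``. A standalone sketch of what ``to_str`` buys here, using a plain decode instead of ``pandas.compat`` so it runs anywhere:

```python
raw = b'{"a": 1, "b": 2}\n{"b": 2, "a": 1}\n'  # bytes, as an S3 handle returns

# raw.split('\n') raises TypeError on Python 3 (bytes vs. str separator).
# Decoding first, which read() now does via to_str, makes the split safe.
data = raw.decode('utf-8') if isinstance(raw, bytes) else raw
print(data.split('\n'))  # ['{"a": 1, "b": 2}', '{"b": 2, "a": 1}', '']
```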
@@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
dtype = np.dtype(dtype)
return data.astype(dtype), True
except:
except (TypeError, ValueError):
return data, False

if convert_dates:
@@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('float64')
result = True
except:
except (TypeError, ValueError):
pass

if data.dtype.kind == 'f':
@@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('float64')
result = True
except:
except (TypeError, ValueError):
pass

# don't coerce 0-len data
@@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
if (new_data == data).all():
data = new_data
result = True
except:
except (TypeError, ValueError):
pass

# coerce ints to 64
Expand All @@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
try:
data = data.astype('int64')
result = True
except:
except (TypeError, ValueError):
pass

return data, result
@@ -680,7 +682,7 @@ def _try_convert_to_date(self, data):
if new_data.dtype == 'object':
try:
new_data = data.astype('int64')
except:
except (TypeError, ValueError):
pass

# ignore numbers that are out of range
@@ -697,7 +699,7 @@ def _try_convert_to_date(self, data):
unit=date_unit)
except ValueError:
continue
except:
except Exception:
break
return new_data, True
return data, False
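The recurring change in this file replaces bare ``except:`` clauses with explicit types: ``astype`` failures surface as ``TypeError`` or ``ValueError``, so only those are caught, while the date-parsing loop keeps a broader ``except Exception`` so that any other failure breaks the loop without also trapping ``KeyboardInterrupt`` or ``SystemExit``. A self-contained sketch of the narrowed pattern:

```python
import pandas as pd

data = pd.Series(['1', '2', 'not-a-number'])

# astype signals failure with ValueError (or TypeError for incompatible
# dtypes); catching exactly those leaves control-flow exceptions alone.
try:
    data = data.astype('float64')
except (TypeError, ValueError):
    pass  # conversion failed; keep the original object dtype
print(data.dtype)  # object
```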
74 changes: 74 additions & 0 deletions pandas/tests/io/conftest.py
@@ -0,0 +1,74 @@
import os

import moto
import pytest
from pandas.io.parsers import read_table

HERE = os.path.dirname(__file__)


@pytest.fixture(scope='module')
def tips_file():
"""Path to the tips dataset"""
return os.path.join(HERE, 'parser', 'data', 'tips.csv')


@pytest.fixture(scope='module')
def jsonl_file():
"""Path a JSONL dataset"""
return os.path.join(HERE, 'parser', 'data', 'items.jsonl')


@pytest.fixture(scope='module')
def salaries_table():
"""DataFrame with the salaries dataset"""
path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
return read_table(path)


@pytest.fixture(scope='module')
def s3_resource(tips_file, jsonl_file):
"""Fixture for mocking S3 interaction.

The primary bucket name is "pandas-test". The following datasets
are loaded.

- tips.csv
- tips.csv.gz
- tips.csv.bz2
- items.jsonl

A private bucket "cant_get_it" is also created. The boto3 s3 resource
is yielded by the fixture.
"""
pytest.importorskip('s3fs')
moto.mock_s3().start()

test_s3_files = [
('tips.csv', tips_file),
('tips.csv.gz', tips_file + '.gz'),
('tips.csv.bz2', tips_file + '.bz2'),
('items.jsonl', jsonl_file),
]

def add_tips_files(bucket_name):
for s3_key, file_name in test_s3_files:
with open(file_name, 'rb') as f:
conn.Bucket(bucket_name).put_object(
Key=s3_key,
Body=f)

boto3 = pytest.importorskip('boto3')
# see gh-16135
bucket = 'pandas-test'

conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket=bucket)
add_tips_files(bucket)

conn.create_bucket(Bucket='cant_get_it', ACL='private')
add_tips_files('cant_get_it')

yield conn

moto.mock_s3().stop()
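For readers unfamiliar with pytest fixtures: any test that takes ``s3_resource`` as an argument runs against the mocked buckets above. A hypothetical test (not part of this PR) sketching the usage:

```python
from pandas.io.parsers import read_csv

def test_tips_roundtrip_from_mocked_s3(s3_resource):
    # moto intercepts the S3 calls, so no real AWS access happens;
    # 'pandas-test' and 'tips.csv' are seeded by the fixture above.
    df = read_csv('s3://pandas-test/tips.csv')
    assert not df.empty
```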
65 changes: 64 additions & 1 deletion pandas/tests/io/json/test_pandas.py
@@ -4,7 +4,6 @@
from pandas.compat import (range, lrange, StringIO,
OrderedDict, is_platform_32bit)
import os

import numpy as np
from pandas import (Series, DataFrame, DatetimeIndex, Timestamp,
read_json, compat)
@@ -1032,6 +1031,70 @@ def test_tz_range_is_utc(self):
df = DataFrame({'DT': dti})
assert dumps(df, iso_dates=True) == dfexp

def test_read_inline_jsonl(self):
# GH9180
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_s3_jsonl(self, s3_resource):
pytest.importorskip('s3fs')
# GH17200

result = read_json('s3n://pandas-test/items.jsonl', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_local_jsonl(self):
# GH17200
with ensure_clean('tmp_items.json') as path:
with open(path, 'w') as f:
f.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
result = read_json(path, lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_read_jsonl_unicode_chars(self):
# GH15132: non-ascii unicode characters
# \u201d == RIGHT DOUBLE QUOTATION MARK

# simulate file handle
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
json = StringIO(json)
result = read_json(json, lines=True)
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
columns=['a', 'b'])
assert_frame_equal(result, expected)

# simulate string
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
result = read_json(json, lines=True)
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
columns=['a', 'b'])
assert_frame_equal(result, expected)

def test_to_jsonl(self):
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
assert result == expected

df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
assert result == expected
assert_frame_equal(pd.read_json(result, lines=True), df)

# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
columns=["a\\", 'b'])
result = df.to_json(orient="records", lines=True)
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
'{"a\\\\":"foo\\"","b":"bar"}')
assert result == expected
assert_frame_equal(pd.read_json(result, lines=True), df)

def test_latin_encoding(self):
if compat.PY2:
tm.assert_raises_regex(
2 changes: 2 additions & 0 deletions pandas/tests/io/parser/data/items.jsonl
@@ -0,0 +1,2 @@
{"a": 1, "b": 2}
Contributor: what is the purpose of this file?

Contributor: I see, ok you have to have this named .json otherwise it won't be picked up by setup.py (IOW the install test will fail).
{"b":2, "a" :1}
48 changes: 0 additions & 48 deletions pandas/tests/io/parser/test_network.py
@@ -4,62 +4,14 @@
Tests parsers ability to read and parse non-local files
and hence require a network connection to be read.
"""
import os

import pytest
import moto

import pandas.util.testing as tm
from pandas import DataFrame
from pandas.io.parsers import read_csv, read_table
from pandas.compat import BytesIO


@pytest.fixture(scope='module')
def tips_file():
return os.path.join(tm.get_data_path(), 'tips.csv')

Contributor: cool

@pytest.fixture(scope='module')
def salaries_table():
path = os.path.join(tm.get_data_path(), 'salaries.csv')
return read_table(path)


@pytest.fixture(scope='module')
def s3_resource(tips_file):
pytest.importorskip('s3fs')
moto.mock_s3().start()

test_s3_files = [
('tips.csv', tips_file),
('tips.csv.gz', tips_file + '.gz'),
('tips.csv.bz2', tips_file + '.bz2'),
]

def add_tips_files(bucket_name):
for s3_key, file_name in test_s3_files:
with open(file_name, 'rb') as f:
conn.Bucket(bucket_name).put_object(
Key=s3_key,
Body=f)

boto3 = pytest.importorskip('boto3')
# see gh-16135
bucket = 'pandas-test'

conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket=bucket)
add_tips_files(bucket)

conn.create_bucket(Bucket='cant_get_it', ACL='private')
add_tips_files('cant_get_it')

yield conn

moto.mock_s3().stop()


@pytest.mark.network
@pytest.mark.parametrize(
"compression,extension",