[REVIEW] Switch engine=cudf to the new JSON reader #12509

Merged on Jan 23, 2023 (22 commits).

Changes shown below are from 5 of the 22 commits.

Commits (22)
129e692  switch to new json reader (galipremsagar, Jan 9, 2023)
fff3c25  Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s… (galipremsagar, Jan 17, 2023)
84705bf  switch legacy (galipremsagar, Jan 17, 2023)
d9266d8  Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s… (galipremsagar, Jan 17, 2023)
16b29f7  drop experimental in more places (galipremsagar, Jan 17, 2023)
cfbda0b  Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s… (galipremsagar, Jan 20, 2023)
3016f03  update tests (galipremsagar, Jan 20, 2023)
d4d6dd9  improve test (galipremsagar, Jan 20, 2023)
8cf3b9a  zip (galipremsagar, Jan 20, 2023)
d880e35  Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s… (galipremsagar, Jan 20, 2023)
fb32906  drop engine (galipremsagar, Jan 20, 2023)
0d5b50c  reviews (galipremsagar, Jan 20, 2023)
42518a6  Update python/dask_cudf/dask_cudf/io/tests/test_json.py (galipremsagar, Jan 20, 2023)
e80112f  Merge branch 'read_json_switch' of https://github.com/galipremsagar/c… (galipremsagar, Jan 20, 2023)
b6e8569  drop engine when not needed (galipremsagar, Jan 20, 2023)
78b55c0  Apply suggestions from code review (galipremsagar, Jan 20, 2023)
76b7e10  docs (galipremsagar, Jan 20, 2023)
2641e49  docs (galipremsagar, Jan 20, 2023)
1df85fa  Apply suggestions from code review (galipremsagar, Jan 23, 2023)
5e0bde4  improve tests (galipremsagar, Jan 23, 2023)
6c90095  Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s… (galipremsagar, Jan 23, 2023)
c01e0ff  address reviews (galipremsagar, Jan 23, 2023)
20 changes: 10 additions & 10 deletions docs/cudf/source/user_guide/io/read-json.md
@@ -19,14 +19,14 @@ each object corresponds to a row.
{"a": "v2", "b": 7},
{"a": "v3", "b": 5}
]'''
>>> df_records = cudf.read_json(j, engine='cudf_experimental')
>>> df_records = cudf.read_json(j, engine='cudf')

>>> j = '\n'.join([
... '{"a": "v1", "b": 12}',
... '{"a": "v2", "b": 7}',
... '{"a": "v3", "b": 5}'
... ])
>>> df_lines = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df_lines = cudf.read_json(j, lines=True, engine='cudf')

>>> df_lines
a b
@@ -49,7 +49,7 @@ reading nested JSON data.
{"list": [0,1,2], "struct": {"k":"v1"}},
{"list": [3,4,5], "struct": {"k":"v2"}}
]'''
>>> df = cudf.read_json(j, engine='cudf_experimental')
>>> df = cudf.read_json(j, engine='cudf')
>>> df
list struct
0 [0, 1, 2] {'k': 'v1'}
@@ -61,7 +61,7 @@ reading nested JSON data.
... '{"a": [{"k": 0}], "b": {"k": [0, 1], "m": 5}}',
... '{"a": [{"k": 1}, {"k": 2}], "b": {"k": [2, 3], "m": 6}}',
... ])
>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True, engine='cudf')
>>> df
a b
0 [{'k': 0}] {'k': [0, 1], 'm': 5}
@@ -97,7 +97,7 @@ should be adjacent, as shown in the following example.
... j,
... lines=True,
... byte_range=(chunk_size * x, chunk_size),
... engine='cudf_experimental'
... engine='cudf'
... )
... data.append(d)
>>> df = cudf.concat(data)
@@ -115,7 +115,7 @@ raw strings, or file-like objects, as well as iterables of these sources.
>>> j1 = '{"id":0}\n{"id":1}\n'
>>> j2 = '{"id":2}\n{"id":3}\n'

>>> df = cudf.read_json([j1, j2], lines=True, engine='cudf_experimental')
>>> df = cudf.read_json([j1, j2], lines=True, engine='cudf')
```

## Unpacking list and struct data
@@ -133,7 +133,7 @@ following example demonstrates how to extract data from a struct column.
... '{"x": "Jakarta", "y": {"country": "Indonesia", "iso2": "ID"}}',
... '{"x": "Shanghai", "y": {"country": "China", "iso2": "CN"}}'
... ])
>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True, engine='cudf')
>>> df = df.drop(columns='y').join(df['y'].struct.explode())
>>> df
x country iso2
@@ -156,7 +156,7 @@ list column.
... '{"name": "New Bedford, MA", "coord": [41.63, -70.93]}'
... ])

>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True, engine='cudf')
>>> df['latitude'] = df['coord'].list.get(0)
>>> df['longitude'] = df['coord'].list.get(1)
>>> df = df.drop(columns='coord')
@@ -181,7 +181,7 @@ the parent dataframe.
... '{"product": "shirts", "ratings": [3, 4]}'
... ])

>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True, engine='cudf')
>>> df = df.drop(columns='ratings').join(df['ratings'].explode())
>>> df
product ratings
@@ -217,7 +217,7 @@ reads a JSON object as a single line and then extracts the
}'''

# first read the JSON object with lines=True
>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True, engine='cudf')
>>> df
metadata records
0 {'vehicle': 'car'} [{'id': 0, 'distance': 1.2}, {'id': 1, 'distan...
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/json.pyx
@@ -31,7 +31,7 @@ cpdef read_json(object filepaths_or_buffers,
bool lines,
object compression,
object byte_range,
bool experimental,
bool legacy,
bool keep_quotes):
"""
Cython function to call into libcudf API, see `read_json`.
@@ -99,7 +99,7 @@ cpdef read_json(object filepaths_or_buffers,
.lines(c_lines)
.byte_range_offset(c_range_offset)
.byte_range_size(c_range_size)
.legacy(not experimental)
.legacy(legacy)
.build()
)
if is_list_like_dtypes:
29 changes: 22 additions & 7 deletions python/cudf/cudf/io/json.py
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.

import warnings
from collections import abc
@@ -37,16 +37,31 @@ def read_json(
f"or a bool, or None. Got {type(dtype)}"
)

if engine == "cudf" and not lines:
if engine == "cudf_experimental":
raise ValueError(
"engine='cudf_experimental' support has been removed, "
"use `engine='cudf'`"
)
Contributor: Should this be a hard error or a deprecation warning that replaces the value with "cudf"?

Contributor Author (galipremsagar): Since it was experimental, I feel we have the flexibility to make this an error, what do you think?

Contributor (@bdice, Jan 20, 2023): I think that's justifiable -- but we'll want to delete this error at some point later. Therefore, it's the same amount of work for us as developers to deprecate it as to force an error. "Add warning, delete warning later" vs. "add error, delete error later." No strong feelings here - resolve as you see fit.

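For context on the trade-off discussed above, here is a minimal sketch of the two options; it is not part of the diff, and the helper names are hypothetical:

import warnings


def resolve_engine_hard_error(engine):
    # Option A (what this PR does): reject the removed alias outright.
    if engine == "cudf_experimental":
        raise ValueError(
            "engine='cudf_experimental' support has been removed, "
            "use `engine='cudf'`"
        )
    return engine


def resolve_engine_deprecate(engine):
    # Option B (the alternative raised in review): warn and silently remap to "cudf".
    if engine == "cudf_experimental":
        warnings.warn(
            "engine='cudf_experimental' is deprecated; using engine='cudf'.",
            FutureWarning,
        )
        return "cudf"
    return engine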
if engine == "cudf_legacy":
# TODO: Deprecated in 23.02, please
# give some time until `cudf_legacy`
# support can be removed completely.
Contributor (@bdice, Jan 20, 2023): Regarding "give some time until cudf_legacy support can be removed completely" -- will our one-release deprecation policy be enough for this, or do you think a special exception is warranted? If more time is needed, let's try to indicate the release in which support should be removed in the error message or perhaps a code comment.

Contributor Author (galipremsagar): Yeah, we definitely want to give this a special exception. I don't think we have a specific release decided yet to remove it completely. I updated the comment here. cc: @GregoryKimball @vuule too.

Contributor: I think we'll pick the removal release based on 23.02 feedback. In general, we'll remove the old reader as soon as there are no user issues specific to the new one.

Contributor: That sounds reasonable! I just wanted to make sure we discussed this topic before merging. 👍

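Not part of the diff below: if a removal release is later decided, the suggestion above could be encoded directly in the warning text, as in this sketch (the version is a placeholder, not a plan from this thread):

import warnings

# "XX.YY" is a placeholder release, not an actual decision from this PR.
REMOVAL_RELEASE = "XX.YY"

warnings.warn(
    "engine='cudf_legacy' is deprecated and will be removed in release "
    f"{REMOVAL_RELEASE}. Please switch to engine='cudf'.",
    FutureWarning,
)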
warnings.warn(
"engine='cudf_legacy' is a deprecated engine."
"This will be removed in a future release."
"Please switch to using engine='cudf'.",
FutureWarning,
)
if engine == "cudf_legacy" and not lines:
raise ValueError(f"{engine} engine only supports JSON Lines format")
if engine != "cudf_experimental" and keep_quotes:
if engine != "cudf" and keep_quotes:
raise ValueError(
"keep_quotes='True' is supported only with"
" engine='cudf_experimental'"
"keep_quotes='True' is supported only with engine='cudf'"
)
if engine == "auto":
engine = "cudf" if lines else "pandas"
if engine == "cudf" or engine == "cudf_experimental":
if engine == "cudf_legacy" or engine == "cudf":
if dtype is None:
dtype = True

@@ -97,7 +112,7 @@ def read_json(
lines,
compression,
byte_range,
engine == "cudf_experimental",
engine == "cudf_legacy",
keep_quotes,
)
else:
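Taken together, the read_json changes in this file behave roughly as follows; this is an illustrative usage sketch based on the diff above, not code from the PR:

import warnings

import cudf

j = '{"a": 1}\n{"a": 2}\n'

# The default engine ("auto") now resolves to the new "cudf" reader when lines=True.
df = cudf.read_json(j, lines=True)

# engine="cudf_legacy" still parses, but emits a FutureWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    cudf.read_json(j, lines=True, engine="cudf_legacy")
assert any(issubclass(w.category, FutureWarning) for w in caught)

# engine="cudf_experimental" has been removed and raises a ValueError.
try:
    cudf.read_json(j, lines=True, engine="cudf_experimental")
except ValueError:
    pass

# keep_quotes is only accepted by the new "cudf" engine.
df_quoted = cudf.read_json(j, lines=True, engine="cudf", keep_quotes=True)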
52 changes: 23 additions & 29 deletions python/cudf/cudf/tests/test_json.py
@@ -319,7 +319,7 @@ def test_json_engine_selection():
assert isinstance(col_name, str)

# should use the pandas engine
df = cudf.read_json(json, lines=False)
df = cudf.read_json(json, lines=False, engine="pandas")
# column names are ints when parsing with pandas
for col_name in df.columns:
assert isinstance(col_name, int)
@@ -332,7 +332,7 @@ def test_json_engine_selection():

# should raise an exception
with pytest.raises(ValueError):
cudf.read_json(json, lines=False, engine="cudf")
cudf.read_json(json, lines=False, engine="cudf_legacy")


def test_json_bool_values():
@@ -352,6 +352,9 @@ def test_json_bool_values():
np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)


@pytest.mark.filterwarnings(
"ignore:engine='cudf_legacy' is a deprecated engine."
)
@pytest.mark.parametrize(
"buffer",
[
@@ -362,7 +365,7 @@ def test_json_bool_values():
],
)
def test_json_null_literal(buffer):
df = cudf.read_json(buffer, lines=True)
df = cudf.read_json(buffer, lines=True, engine="cudf_legacy")

# first column contains a null field, type should be set to float
# second column contains only empty fields, type should be set to int8
@@ -532,12 +535,13 @@ def test_default_integer_bitwidth(default_integer_bitwidth, engine):
"engine",
[
pytest.param(
"cudf",
"cudf_legacy",
marks=pytest.mark.skip(
reason="cannot partially set dtypes for cudf json engine"
),
),
"pandas",
"cudf",
],
)
def test_default_integer_bitwidth_partial(default_integer_bitwidth, engine):
@@ -589,7 +593,7 @@ def test_json_nested_basic(tmpdir):
pdf = pd.DataFrame(data)
pdf.to_json(fname, orient="records")

df = cudf.read_json(fname, engine="cudf_experimental", orient="records")
df = cudf.read_json(fname, engine="cudf", orient="records")
pdf = pd.read_json(fname, orient="records")

assert_eq(pdf, df)
@@ -617,9 +621,7 @@ def test_json_nested_lines(data, lines):
pdf = pd.DataFrame(data)
pdf.to_json(bytes, orient="records", lines=lines)
bytes.seek(0)
df = cudf.read_json(
bytes, engine="cudf_experimental", orient="records", lines=lines
)
df = cudf.read_json(bytes, engine="cudf", orient="records", lines=lines)
bytes.seek(0)
pdf = pd.read_json(bytes, orient="records", lines=lines)
# In the second test-case we need to take a detour via pyarrow
@@ -638,9 +640,7 @@ def test_json_nested_data():
'[{"0":{},"2":{}},{"1":[[""],[]],"2":{"2":""}},'
'{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]'
)
df = cudf.read_json(
StringIO(json_str), engine="cudf_experimental", orient="records"
)
df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records")
pdf = pd.read_json(StringIO(json_str), orient="records")
pdf.columns = pdf.columns.astype("str")
pa_table_pdf = pa.Table.from_pandas(
@@ -659,7 +659,7 @@ def test_json_empty_types():
"""
df = cudf.read_json(
StringIO(json_str),
engine="cudf_experimental",
engine="cudf",
orient="records",
lines=True,
)
@@ -676,9 +676,7 @@ def test_json_types_data():
'{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},'
'{"0":{},"1":[],"2":{"0":null}}]'
)
df = cudf.read_json(
StringIO(json_str), engine="cudf_experimental", orient="records"
)
df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records")
pdf = pd.read_json(StringIO(json_str), orient="records")
pdf.columns = pdf.columns.astype("str")
pa_table_pdf = pa.Table.from_pandas(
@@ -725,7 +723,7 @@ def test_json_types_data():
def test_json_quoted_values_with_schema(col_type, json_str):
experimental_df = cudf.read_json(
StringIO(json_str),
engine="cudf_experimental",
engine="cudf",
orient="records",
dtype={"k": col_type},
)
@@ -767,7 +765,7 @@ def test_json_quoted_values_with_schema(col_type, json_str):
def test_json_quoted_values(col_type, json_str, expected):
experimental_df = cudf.read_json(
StringIO(json_str),
engine="cudf_experimental",
engine="cudf",
orient="records",
dtype={"k": col_type},
)
@@ -818,7 +816,7 @@ def test_json_keep_quotes(keep_quotes, result):

actual = cudf.read_json(
bytes_file,
engine="cudf_experimental",
engine="cudf",
orient="records",
lines=True,
keep_quotes=keep_quotes,
@@ -875,7 +873,7 @@ def test_json_dtypes_nested_data():

df = cudf.read_json(
StringIO(actual_json_str),
engine="cudf_experimental",
engine="cudf",
orient="records",
lines=True,
dtype={
Expand Down Expand Up @@ -947,17 +945,15 @@ def test_json_dtypes_nested_data():
class TestNestedJsonReaderCommon:
@pytest.mark.parametrize("chunk_size", [10, 100, 1024, 1024 * 1024])
def test_chunked_nested_json_reader(self, tag, data, chunk_size):
expected = cudf.read_json(
StringIO(data), engine="cudf_experimental", lines=True
)
expected = cudf.read_json(StringIO(data), engine="cudf", lines=True)

source_size = len(data)
chunks = []
for chunk_start in range(0, source_size, chunk_size):
chunks.append(
cudf.read_json(
StringIO(data),
engine="cudf_experimental",
engine="cudf",
byte_range=[chunk_start, chunk_size],
lines=True,
)
@@ -967,9 +963,7 @@ def test_chunked_nested_json_reader(self, tag, data, chunk_size):

def test_order_nested_json_reader(self, tag, data):
expected = pd.read_json(StringIO(data), lines=True)
target = cudf.read_json(
StringIO(data), engine="cudf_experimental", lines=True
)
target = cudf.read_json(StringIO(data), engine="cudf", lines=True)
if tag == "dtype_mismatch":
with pytest.raises(AssertionError):
# pandas parses integer values in float representation
@@ -1087,13 +1081,13 @@ def _replace_with_nulls(df, replace_items):
pdf = _replace_with_nulls(pdf, [123, "123", 12.3, "abc"])
gdf = cudf.read_json(
StringIO(jsonl_string),
engine="cudf_experimental",
engine="cudf",
orient="records",
lines=True,
)
gdf2 = cudf.read_json(
StringIO(json_string),
engine="cudf_experimental",
engine="cudf",
orient="records",
lines=False,
)
@@ -1134,7 +1128,7 @@ def test_json_nested_mixed_types_error(jsonl_string):
with pytest.raises(RuntimeError):
cudf.read_json(
StringIO(jsonl_string),
engine="cudf_experimental",
engine="cudf",
orient="records",
lines=True,
)
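Beyond the filterwarnings ignore added earlier in this file, a follow-up test could assert the deprecation explicitly; a possible sketch, not part of this PR:

import pytest

import cudf


def test_json_cudf_legacy_engine_warns():
    # Hypothetical test: the legacy engine should raise a FutureWarning.
    with pytest.warns(FutureWarning, match="cudf_legacy"):
        cudf.read_json('{"a": 1}\n{"a": 2}\n', lines=True, engine="cudf_legacy")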