Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Switch engine=cudf to the new JSON reader #12509

Merged
merged 22 commits into from
Jan 23, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
129e692
switch to new json reader
galipremsagar Jan 9, 2023
fff3c25
Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s…
galipremsagar Jan 17, 2023
84705bf
switch legacy
galipremsagar Jan 17, 2023
d9266d8
Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s…
galipremsagar Jan 17, 2023
16b29f7
drop experimental in more places
galipremsagar Jan 17, 2023
cfbda0b
Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s…
galipremsagar Jan 20, 2023
3016f03
update tests
galipremsagar Jan 20, 2023
d4d6dd9
improve test
galipremsagar Jan 20, 2023
8cf3b9a
zip
galipremsagar Jan 20, 2023
d880e35
Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s…
galipremsagar Jan 20, 2023
fb32906
drop engine
galipremsagar Jan 20, 2023
0d5b50c
reviews
galipremsagar Jan 20, 2023
42518a6
Update python/dask_cudf/dask_cudf/io/tests/test_json.py
galipremsagar Jan 20, 2023
e80112f
Merge branch 'read_json_switch' of https://github.com/galipremsagar/c…
galipremsagar Jan 20, 2023
b6e8569
drop engine when not needed
galipremsagar Jan 20, 2023
78b55c0
Apply suggestions from code review
galipremsagar Jan 20, 2023
76b7e10
docs
galipremsagar Jan 20, 2023
2641e49
docs
galipremsagar Jan 20, 2023
1df85fa
Apply suggestions from code review
galipremsagar Jan 23, 2023
5e0bde4
improve tests
galipremsagar Jan 23, 2023
6c90095
Merge remote-tracking branch 'upstream/branch-23.02' into read_json_s…
galipremsagar Jan 23, 2023
c01e0ff
address reviews
galipremsagar Jan 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 9 additions & 10 deletions docs/cudf/source/user_guide/io/read-json.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ each object corresponds to a row.
{"a": "v2", "b": 7},
{"a": "v3", "b": 5}
]'''
>>> df_records = cudf.read_json(j, engine='cudf_experimental')
>>> df_records = cudf.read_json(j, engine='cudf')
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

>>> j = '\n'.join([
... '{"a": "v1", "b": 12}',
... '{"a": "v2", "b": 7}',
... '{"a": "v3", "b": 5}'
... ])
>>> df_lines = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df_lines = cudf.read_json(j, lines=True)

>>> df_lines
a b
Expand All @@ -49,7 +49,7 @@ reading nested JSON data.
{"list": [0,1,2], "struct": {"k":"v1"}},
{"list": [3,4,5], "struct": {"k":"v2"}}
]'''
>>> df = cudf.read_json(j, engine='cudf_experimental')
>>> df = cudf.read_json(j, engine='cudf')
>>> df
list struct
0 [0, 1, 2] {'k': 'v1'}
Expand All @@ -61,7 +61,7 @@ reading nested JSON data.
... '{"a": [{"k": 0}], "b": {"k": [0, 1], "m": 5}}',
... '{"a": [{"k": 1}, {"k": 2}], "b": {"k": [2, 3], "m": 6}}',
... ])
>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True)
>>> df
a b
0 [{'k': 0}] {'k': [0, 1], 'm': 5}
Expand Down Expand Up @@ -97,7 +97,6 @@ should be adjacent, as shown in the following example.
... j,
... lines=True,
... byte_range=(chunk_size * x, chunk_size),
... engine='cudf_experimental'
... )
... data.append(d)
>>> df = cudf.concat(data)
Expand All @@ -115,7 +114,7 @@ raw strings, or file-like objects, as well as iterables of these sources.
>>> j1 = '{"id":0}\n{"id":1}\n'
>>> j2 = '{"id":2}\n{"id":3}\n'

>>> df = cudf.read_json([j1, j2], lines=True, engine='cudf_experimental')
>>> df = cudf.read_json([j1, j2], lines=True)
```

## Unpacking list and struct data
Expand All @@ -133,7 +132,7 @@ following example demonstrates how to extract data from a struct column.
... '{"x": "Jakarta", "y": {"country": "Indonesia", "iso2": "ID"}}',
... '{"x": "Shanghai", "y": {"country": "China", "iso2": "CN"}}'
... ])
>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True)
>>> df = df.drop(columns='y').join(df['y'].struct.explode())
>>> df
x country iso2
Expand All @@ -156,7 +155,7 @@ list column.
... '{"name": "New Bedford, MA", "coord": [41.63, -70.93]}'
... ])

>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True)
>>> df['latitude'] = df['coord'].list.get(0)
>>> df['longitude'] = df['coord'].list.get(1)
>>> df = df.drop(columns='coord')
Expand All @@ -181,7 +180,7 @@ the parent dataframe.
... '{"product": "shirts", "ratings": [3, 4]}'
... ])

>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True)
>>> df = df.drop(columns='ratings').join(df['ratings'].explode())
>>> df
product ratings
Expand Down Expand Up @@ -217,7 +216,7 @@ reads a JSON object as a single line and then extracts the
}'''

# first read the JSON object with line=True
>>> df = cudf.read_json(j, lines=True, engine='cudf_experimental')
>>> df = cudf.read_json(j, lines=True)
>>> df
metadata records
0 {'vehicle': 'car'} [{'id': 0, 'distance': 1.2}, {'id': 1, 'distan...
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ cpdef read_json(object filepaths_or_buffers,
bool lines,
object compression,
object byte_range,
bool experimental,
bool legacy,
bool keep_quotes):
"""
Cython function to call into libcudf API, see `read_json`.
Expand Down Expand Up @@ -71,6 +71,8 @@ cpdef read_json(object filepaths_or_buffers,
c_compression = cudf_io_types.compression_type.GZIP
elif compression == 'bz2':
c_compression = cudf_io_types.compression_type.BZIP2
elif compression == 'zip':
c_compression = cudf_io_types.compression_type.ZIP
else:
c_compression = cudf_io_types.compression_type.AUTO
else:
Expand Down Expand Up @@ -99,7 +101,7 @@ cpdef read_json(object filepaths_or_buffers,
.lines(c_lines)
.byte_range_offset(c_range_offset)
.byte_range_size(c_range_size)
.legacy(not experimental)
.legacy(legacy)
.build()
)
if is_list_like_dtypes:
Expand Down
33 changes: 25 additions & 8 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.

import warnings
from collections import abc
Expand Down Expand Up @@ -37,16 +37,33 @@ def read_json(
f"or a bool, or None. Got {type(dtype)}"
)

if engine == "cudf" and not lines:
raise ValueError(f"{engine} engine only supports JSON Lines format")
if engine != "cudf_experimental" and keep_quotes:
if engine == "cudf_experimental":
raise ValueError(
"keep_quotes='True' is supported only with"
" engine='cudf_experimental'"
"engine='cudf_experimental' support has been removed, "
"use `engine='cudf'`"
)

if engine == "cudf_legacy":
# TODO: Deprecated in 23.02, please
# give some time until(more than couple of
# releases from now) `cudf_legacy`
# support can be removed completely.
warnings.warn(
"engine='cudf_legacy' is a deprecated engine."
"This will be removed in a future release."
"Please switch to using engine='cudf'.",
FutureWarning,
)
if engine == "cudf_legacy" and not lines:
raise ValueError(f"{engine} engine only supports JSON Lines format")
if engine == "auto":
engine = "cudf" if lines else "pandas"
if engine == "cudf" or engine == "cudf_experimental":
if engine != "cudf" and keep_quotes:
raise ValueError(
"keep_quotes='True' is supported only with engine='cudf'"
)

if engine == "cudf_legacy" or engine == "cudf":
if dtype is None:
dtype = True

Expand Down Expand Up @@ -97,7 +114,7 @@ def read_json(
lines,
compression,
byte_range,
engine == "cudf_experimental",
engine == "cudf_legacy",
bdice marked this conversation as resolved.
Show resolved Hide resolved
keep_quotes,
)
else:
Expand Down
Loading