From ed86600e24a8f4f3749fd3ce32e08c566752f9a5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 5 Jul 2024 22:07:05 +0000 Subject: [PATCH 1/8] Implement chunked json reader --- cpp/src/io/utilities/datasource.cpp | 2 +- python/cudf/cudf/_lib/json.pyx | 151 ++++++++++++++++++++++------ python/cudf/cudf/io/json.py | 33 ++++-- python/cudf/cudf/tests/test_json.py | 76 +++++++------- 4 files changed, 181 insertions(+), 81 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index ca8932322bf..98593a65f5d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -216,7 +216,7 @@ class memory_mapped_source : public file_source { void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file"); + CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index a8fef907bad..b4201c3b5a1 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -40,7 +40,15 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport ( from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_pylibcudf_table, + data_from_unique_ptr, + table_view_from_table, +) + +from cudf._lib import pylibcudf +from cudf._lib.concat import concat_columns cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): @@ -51,25 +59,17 @@ cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): else: raise TypeError(f"Invalid parameter for {on_bad_lines=}") - -cpdef 
read_json(object filepaths_or_buffers, - object dtype, - bool lines, - object compression, - object byte_range, - bool keep_quotes, - bool mixed_types_as_string, - bool prune_columns, - object on_bad_lines): - """ - Cython function to call into libcudf API, see `read_json`. - - See Also - -------- - cudf.io.json.read_json - cudf.io.json.to_json - """ - +cdef json_reader_options _setup_json_reader_options( + object filepaths_or_buffers, + object dtype, + object compression, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + object on_bad_lines, + bool lines, + size_type byte_range_offset, + size_type byte_range_size): # If input data is a JSON string (or StringIO), hold a reference to # the encoded memoryview externally to ensure the encoded buffer # isn't destroyed before calling libcudf `read_json()` @@ -85,14 +85,6 @@ cpdef read_json(object filepaths_or_buffers, cdef vector[data_type] c_dtypes_list cdef map[string, schema_element] c_dtypes_schema_map cdef cudf_io_types.compression_type c_compression - # Determine byte read offsets if applicable - cdef size_type c_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_type c_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef bool c_lines = lines if compression is not None: if compression == 'gzip': @@ -126,9 +118,9 @@ cpdef read_json(object filepaths_or_buffers, cdef json_reader_options opts = move( json_reader_options.builder(make_source_info(filepaths_or_buffers)) .compression(c_compression) - .lines(c_lines) - .byte_range_offset(c_range_offset) - .byte_range_size(c_range_size) + .lines(lines) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) .recovery_mode(_get_json_recovery_mode(on_bad_lines)) .build() ) @@ -141,6 +133,38 @@ cpdef read_json(object filepaths_or_buffers, opts.enable_mixed_types_as_string(mixed_types_as_string) opts.enable_prune_columns(prune_columns) + return opts + +cpdef read_json(object 
filepaths_or_buffers, + object dtype, + bool lines, + object compression, + object byte_range, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + object on_bad_lines): + """ + Cython function to call into libcudf API, see `read_json`. + + See Also + -------- + cudf.io.json.read_json + cudf.io.json.to_json + """ + # Determine byte read offsets if applicable + cdef size_type c_range_offset = ( + byte_range[0] if byte_range is not None else 0 + ) + cdef size_type c_range_size = ( + byte_range[1] if byte_range is not None else 0 + ) + cdef json_reader_options opts = _setup_json_reader_options( + filepaths_or_buffers, dtype, compression, keep_quotes, + mixed_types_as_string, prune_columns, on_bad_lines, + lines, c_range_offset, c_range_size + ) + # Read JSON cdef cudf_io_types.table_with_metadata c_result @@ -157,6 +181,71 @@ cpdef read_json(object filepaths_or_buffers, return df +cpdef chunked_read_json(object filepaths_or_buffers, + object dtype, + object compression, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + object on_bad_lines, + int chunk_size=100_000_000): + """ + Cython function to call into libcudf API, see `read_json`. 
+ + See Also + -------- + cudf.io.json.read_json + cudf.io.json.to_json + """ + cdef size_type c_range_size = ( + chunk_size if chunk_size is not None else 0 + ) + cdef json_reader_options opts = _setup_json_reader_options( + filepaths_or_buffers, dtype, compression, keep_quotes, + mixed_types_as_string, prune_columns, on_bad_lines, + True, 0, c_range_size + ) + + # Read JSON + cdef cudf_io_types.table_with_metadata c_result + final_columns = [] + meta_names = None + i = 0 + while True: + opts.set_byte_range_offset(c_range_size * i) + opts.set_byte_range_size(c_range_size) + + try: + with nogil: + c_result = move(libcudf_read_json(opts)) + except OverflowError: + break + if meta_names is None: + meta_names = [info.name.decode() for info in c_result.metadata.schema_info] + new_chunk = columns_from_unique_ptr(move(c_result.tbl)) + if len(final_columns) == 0: + final_columns = new_chunk + else: + for col_idx in range(len(meta_names)): + final_columns[col_idx] = concat_columns( + [final_columns[col_idx], new_chunk[col_idx]] + ) + # Must drop any residual GPU columns to save memory + new_chunk[col_idx] = None + i += 1 + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_table( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in final_columns] + ), + column_names=meta_names, + index_names=None + ) + ) + update_struct_field_names(df, c_result.metadata.schema_info) + + return df + @acquire_spill_lock() def write_json( diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index fc3387d5117..f830de3a468 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -94,17 +94,28 @@ def read_json( else: filepaths_or_buffers.append(tmp_source) - df = libjson.read_json( - filepaths_or_buffers=filepaths_or_buffers, - dtype=dtype, - lines=lines, - compression=compression, - byte_range=byte_range, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - on_bad_lines=on_bad_lines, - ) + 
if cudf.get_option("mode.pandas_compatible") and lines: + df = libjson.chunked_read_json( + filepaths_or_buffers=filepaths_or_buffers, + dtype=dtype, + compression=compression, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + on_bad_lines=on_bad_lines, + ) + else: + df = libjson.read_json( + filepaths_or_buffers=filepaths_or_buffers, + dtype=dtype, + lines=lines, + compression=compression, + byte_range=byte_range, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + on_bad_lines=on_bad_lines, + ) else: warnings.warn( "Using CPU via Pandas to read JSON dataset, this may " diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 297040b6d95..7260a429ba8 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1269,44 +1269,44 @@ def test_json_array_of_arrays(data, lines): [ # simple list with mixed types """{"a":[123, {}], "b":1.1}""", - """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", - """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", - """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", - """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", - """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", - # nested list with mixed types - """{"a":[123, [{"0": 123}, {}]], 
"b":1.0} - {"b":1.1} - {"a":[]} - {"a":[123]} - {"a":[[123], []]}""", - """{"a":[], "b":1.0} - {"a":[[[456]]]} - {"a":[[123]]} - {"a":[123]}""", - """{"a":[123], "b":1.0} - {"b":1.1} - {"b":2.1} - {"a":[[[[[[]]]]]]}""", - """{"a":[123], "b":1.0} - {"a":[[[[[[]]]]]]} - {"a":[[[[[[]]]]], [[[[[]]]]]]} - {"a":[[[[[[]]]], [[[[]]]]]]} - {"a":[[[[[[]]], [[[]]]]]]} - {"a":[[[[[[]], [[]]]]]]} - {"a":[[[[[[], 123, []]]]]]}""", - # mixed elements in multiple columns - """{"a":[123, {"0": 123}], "b":1.0} - {"c": ["abc"], "b":1.1} - {"c": ["abc", []] }""", + # """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", + # """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", + # """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", + # """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + # """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", + # """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + # """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", + # # nested list with mixed types + # """{"a":[123, [{"0": 123}, {}]], "b":1.0} + # {"b":1.1} + # {"a":[]} + # {"a":[123]} + # {"a":[[123], []]}""", + # """{"a":[], "b":1.0} + # {"a":[[[456]]]} + # {"a":[[123]]} + # {"a":[123]}""", + # """{"a":[123], "b":1.0} + # {"b":1.1} + # {"b":2.1} + # {"a":[[[[[[]]]]]]}""", + # """{"a":[123], "b":1.0} + # {"a":[[[[[[]]]]]]} + # {"a":[[[[[[]]]]], [[[[[]]]]]]} + # {"a":[[[[[[]]]], [[[[]]]]]]} + # {"a":[[[[[[]]], [[[]]]]]]} + # {"a":[[[[[[]], [[]]]]]]} + # {"a":[[[[[[], 123, 
[]]]]]]}""", + # # mixed elements in multiple columns + # """{"a":[123, {"0": 123}], "b":1.0} + # {"c": ["abc"], "b":1.1} + # {"c": ["abc", []] }""", ], ) def test_json_nested_mixed_types_in_list(jsonl_string): From 8427e4a27e9bb694bc87b02ea92117c135a03220 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 6 Jul 2024 00:03:12 +0000 Subject: [PATCH 2/8] add tests --- cpp/src/io/json/read_json.cu | 3 +- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/tests/test_json.py | 92 +++++++++++++++++------------ 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 74001e5e01a..9cd39038348 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -193,7 +193,8 @@ datasource::owning_buffer> get_record_range_raw_input( size_t chunk_size = reader_opts.get_byte_range_size(); CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, - "Invalid offsetting"); + "Invalid offsetting", + std::invalid_argument); auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? 
total_source_size - chunk_offset : chunk_size; diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 57870e60a3f..bba6e5d83e0 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -204,7 +204,7 @@ cpdef chunked_read_json(object filepaths_or_buffers, try: with nogil: c_result = move(libcudf_read_json(opts)) - except OverflowError: + except (ValueError, OverflowError): break if meta_names is None: meta_names = [info.name.decode() for info in c_result.metadata.schema_info] diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 7260a429ba8..4cd3ca9821d 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1269,44 +1269,44 @@ def test_json_array_of_arrays(data, lines): [ # simple list with mixed types """{"a":[123, {}], "b":1.1}""", - # """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", - # """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", - # """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", - # """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - # """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", - # """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - # """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", - # # nested list with mixed types - # """{"a":[123, [{"0": 123}, {}]], "b":1.0} - # {"b":1.1} - # {"a":[]} - # {"a":[123]} - # {"a":[[123], []]}""", - # """{"a":[], "b":1.0} - # {"a":[[[456]]]} - # 
{"a":[[123]]} - # {"a":[123]}""", - # """{"a":[123], "b":1.0} - # {"b":1.1} - # {"b":2.1} - # {"a":[[[[[[]]]]]]}""", - # """{"a":[123], "b":1.0} - # {"a":[[[[[[]]]]]]} - # {"a":[[[[[[]]]]], [[[[[]]]]]]} - # {"a":[[[[[[]]]], [[[[]]]]]]} - # {"a":[[[[[[]]], [[[]]]]]]} - # {"a":[[[[[[]], [[]]]]]]} - # {"a":[[[[[[], 123, []]]]]]}""", - # # mixed elements in multiple columns - # """{"a":[123, {"0": 123}], "b":1.0} - # {"c": ["abc"], "b":1.1} - # {"c": ["abc", []] }""", + """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", + """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", + """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", + """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", + """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", + # nested list with mixed types + """{"a":[123, [{"0": 123}, {}]], "b":1.0} + {"b":1.1} + {"a":[]} + {"a":[123]} + {"a":[[123], []]}""", + """{"a":[], "b":1.0} + {"a":[[[456]]]} + {"a":[[123]]} + {"a":[123]}""", + """{"a":[123], "b":1.0} + {"b":1.1} + {"b":2.1} + {"a":[[[[[[]]]]]]}""", + """{"a":[123], "b":1.0} + {"a":[[[[[[]]]]]]} + {"a":[[[[[[]]]]], [[[[[]]]]]]} + {"a":[[[[[[]]]], [[[[]]]]]]} + {"a":[[[[[[]]], [[[]]]]]]} + {"a":[[[[[[]], [[]]]]]]} + {"a":[[[[[[], 123, []]]]]]}""", + # mixed elements in multiple columns + """{"a":[123, {"0": 123}], "b":1.0} + {"c": ["abc"], "b":1.1} + {"c": ["abc", []] }""", ], ) def 
test_json_nested_mixed_types_in_list(jsonl_string): @@ -1423,3 +1423,19 @@ def test_json_reader_on_bad_lines(on_bad_lines): orient="records", on_bad_lines=on_bad_lines, ) + + +def test_chunked_json_reader(): + df = cudf.DataFrame( + { + "a": ["aaaa"] * 9_00_00_00, + "b": list(range(0, 9_00_00_00)), + } + ) + buf = BytesIO() + df.to_json(buf, lines=True, orient="records", engine="cudf") + buf.seek(0) + df = df.to_pandas() + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.read_json(buf, lines=True) + assert_eq(df, gdf) From a00776130dd5e247e8c78e86fa6620349e4f1e70 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 10 Jul 2024 22:10:12 +0000 Subject: [PATCH 3/8] fix syntax --- python/cudf/cudf/_lib/pylibcudf/io/json.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index ac1a78dcd0b..96d640ae1a3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -36,7 +36,7 @@ cpdef void write_json( str false_value = * ) -cdef tuple chunked_read_json( +cpdef tuple chunked_read_json( SourceInfo source_info, list dtypes = *, compression_type compression = *, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 9bb73e3ed66..b211dfc6b7d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -50,7 +50,7 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes): schema_map[c_name] = s_elem return schema_map -cdef tuple chunked_read_json( +cpdef tuple chunked_read_json( SourceInfo source_info, list dtypes = None, compression_type compression = compression_type.AUTO, @@ -143,7 +143,7 @@ cdef tuple chunked_read_json( c_result.metadata.schema_info ) new_chunk = [ - col.to_pylibcudf(mode="read") for col in 
TableWithMetadata.from_libcudf( + col for col in TableWithMetadata.from_libcudf( c_result).columns ] From 1bf55699c9cff66264e7b203fd233d5391f0883c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 10 Jul 2024 23:07:29 +0000 Subject: [PATCH 4/8] move common code together --- python/cudf/cudf/_lib/json.pyx | 3 - python/cudf/cudf/_lib/pylibcudf/io/json.pxd | 3 - python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 101 +++++++++++--------- 3 files changed, 58 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 5a84af506de..853dd431099 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -104,9 +104,6 @@ cpdef read_json(object filepaths_or_buffers, plc.io.SourceInfo(filepaths_or_buffers), processed_dtypes, c_compression, - True, - byte_range_offset = 0, - byte_range_size = 0, keep_quotes = keep_quotes, mixed_types_as_string = mixed_types_as_string, prune_columns = prune_columns, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index 96d640ae1a3..2e0e92a054f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -40,9 +40,6 @@ cpdef tuple chunked_read_json( SourceInfo source_info, list dtypes = *, compression_type compression = *, - bool lines = *, - size_type byte_range_offset = *, - size_type byte_range_size = *, bool keep_quotes = *, bool mixed_types_as_string = *, bool prune_columns = *, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index b211dfc6b7d..a63897904b3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -50,13 +50,46 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes): schema_map[c_name] = s_elem return schema_map +cdef json_reader_options _setup_json_reader_options( + SourceInfo source_info, + list dtypes, + compression_type 
compression, + bool lines, + size_type byte_range_offset, + size_type byte_range_size, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + json_recovery_mode_t recovery_mode): + cdef vector[data_type] types_vec + cdef json_reader_options opts = move( + json_reader_options.builder(source_info.c_obj) + .compression(compression) + .lines(lines) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .recovery_mode(recovery_mode) + .build() + ) + + if dtypes is not None: + if isinstance(dtypes[0], tuple): + opts.set_dtypes(move(_generate_schema_map(dtypes))) + else: + for dtype in dtypes: + types_vec.push_back((dtype).c_obj) + opts.set_dtypes(types_vec) + + opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) + opts.enable_prune_columns(prune_columns) + return opts + + cpdef tuple chunked_read_json( SourceInfo source_info, list dtypes = None, compression_type compression = compression_type.AUTO, - bool lines = False, - size_type byte_range_offset = 0, - size_type byte_range_size = 0, bool keep_quotes = False, bool mixed_types_as_string = False, bool prune_columns = False, @@ -99,27 +132,19 @@ cpdef tuple chunked_read_json( cdef size_type c_range_size = ( chunk_size if chunk_size is not None else 0 ) - cdef vector[data_type] types_vec - cdef json_reader_options opts = move( - json_reader_options.builder(source_info.c_obj) - .compression(compression) - .lines(lines) - .recovery_mode(recovery_mode) - .build() + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=True, + byte_range_offset=0, + byte_range_size=0, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, ) - if dtypes is not None: - if isinstance(dtypes[0], tuple): - opts.set_dtypes(move(_generate_schema_map(dtypes))) - else: - for dtype in dtypes: - 
types_vec.push_back((dtype).c_obj) - opts.set_dtypes(types_vec) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - # Read JSON cdef table_with_metadata c_result @@ -205,29 +230,19 @@ cpdef TableWithMetadata read_json( TableWithMetadata The Table and its corresponding metadata (column names) that were read in. """ - cdef vector[data_type] types_vec - cdef json_reader_options opts = move( - json_reader_options.builder(source_info.c_obj) - .compression(compression) - .lines(lines) - .byte_range_offset(byte_range_offset) - .byte_range_size(byte_range_size) - .recovery_mode(recovery_mode) - .build() + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=lines, + byte_range_offset=byte_range_offset, + byte_range_size=byte_range_size, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, ) - if dtypes is not None: - if isinstance(dtypes[0], tuple): - opts.set_dtypes(move(_generate_schema_map(dtypes))) - else: - for dtype in dtypes: - types_vec.push_back((dtype).c_obj) - opts.set_dtypes(types_vec) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - # Read JSON cdef table_with_metadata c_result From 872a1fe11d0d5139c258cc618f0014b7ae73d6c7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 10 Jul 2024 23:07:54 +0000 Subject: [PATCH 5/8] move common code together --- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index a63897904b3..f4bbc0dc4d4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -50,6 +50,7 @@ cdef map[string, 
schema_element] _generate_schema_map(list dtypes): schema_map[c_name] = s_elem return schema_map + cdef json_reader_options _setup_json_reader_options( SourceInfo source_info, list dtypes, @@ -61,6 +62,7 @@ cdef json_reader_options _setup_json_reader_options( bool mixed_types_as_string, bool prune_columns, json_recovery_mode_t recovery_mode): + cdef vector[data_type] types_vec cdef json_reader_options opts = move( json_reader_options.builder(source_info.c_obj) From bf2578edc0e11352100e12e16373f667a8cd9758 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 11 Jul 2024 10:29:55 -0500 Subject: [PATCH 6/8] update docstring --- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index f4bbc0dc4d4..3c250cde4a0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -112,24 +112,25 @@ cpdef tuple chunked_read_json( the list of child dtypes is an empty list if the child is not a nested type (list or struct dtype), and is of format (column_child_name, column_child_type, list of grandchild dtypes). - compression_type: CompressionType, default CompressionType.AUTO + compression: CompressionType, default CompressionType.AUTO The compression format of the JSON source. - byte_range_offset : size_type, default 0 - Number of bytes to skip from source start. - byte_range_size : size_type, default 0 - Number of bytes to read. By default, will read all bytes. keep_quotes : bool, default False Whether the reader should keep quotes of string values. + mixed_types_as_string : bool, default False + If True, mixed type columns are returned as string columns. + If `False` parsing mixed type columns will thrown an error. prune_columns : bool, default False Whether to only read columns specified in dtypes. 
     recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
         Whether to raise an error or set corresponding values to null
         when encountering an invalid JSON line.
+    chunk_size : int, default 100_000_000 bytes.
+        The number of bytes to be read in chunks.
 
     Returns
     -------
-    TableWithMetadata
-    The Table and its corresponding metadata (column names) that were read in.
+    tuple
+    A tuple of (columns, column_name, child_names)
     """
     cdef size_type c_range_size = (
         chunk_size if chunk_size is not None else 0
@@ -213,7 +214,7 @@ cpdef TableWithMetadata read_json(
         the list of child dtypes is an empty list if the child is not a nested
         type (list or struct dtype), and is of format
         (column_child_name, column_child_type, list of grandchild dtypes).
-    compression_type: CompressionType, default CompressionType.AUTO
+    compression: CompressionType, default CompressionType.AUTO
         The compression format of the JSON source.
     byte_range_offset : size_type, default 0
         Number of bytes to skip from source start.
     byte_range_size : size_type, default 0
         Number of bytes to read. By default, will read all bytes.
     keep_quotes : bool, default False
         Whether the reader should keep quotes of string values.
+    mixed_types_as_string : bool, default False
+        If True, mixed type columns are returned as string columns.
+        If `False` parsing mixed type columns will throw an error.
     prune_columns : bool, default False
         Whether to only read columns specified in dtypes. 
recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL From 64992f8fbf244e658371f3412595dd5b79a49f69 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 12 Jul 2024 16:04:54 -0500 Subject: [PATCH 7/8] Update python/cudf/cudf/_lib/pylibcudf/io/json.pyx Co-authored-by: Shruti Shivakumar --- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 3c250cde4a0..42aeceb9d35 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -125,7 +125,7 @@ cpdef tuple chunked_read_json( Whether to raise an error or set corresponding values to null when encountering an invalid JSON line. chunk_size : int, default 100_000_000 bytes. - The number of bytes to be read in chunks. + The number of bytes to be read in chunks. The chunk_size should be set to at least row_size. Returns ------- From a70d841bd8c89f859f263b3a00fb0362e13b2183 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 12 Jul 2024 16:31:55 -0500 Subject: [PATCH 8/8] Update json.pyx --- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 42aeceb9d35..2710ee60075 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -125,7 +125,8 @@ cpdef tuple chunked_read_json( Whether to raise an error or set corresponding values to null when encountering an invalid JSON line. chunk_size : int, default 100_000_000 bytes. - The number of bytes to be read in chunks. The chunk_size should be set to at least row_size. + The number of bytes to be read in chunks. + The chunk_size should be set to at least row_size. Returns -------