From ed86600e24a8f4f3749fd3ce32e08c566752f9a5 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 5 Jul 2024 22:07:05 +0000 Subject: [PATCH 1/8] Implement chunked json reader --- cpp/src/io/utilities/datasource.cpp | 2 +- python/cudf/cudf/_lib/json.pyx | 151 ++++++++++++++++++++++------ python/cudf/cudf/io/json.py | 33 ++++-- python/cudf/cudf/tests/test_json.py | 76 +++++++------- 4 files changed, 181 insertions(+), 81 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index ca8932322bf..98593a65f5d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -216,7 +216,7 @@ class memory_mapped_source : public file_source { void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file"); + CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index a8fef907bad..b4201c3b5a1 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -40,7 +40,15 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport ( from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_pylibcudf_table, + data_from_unique_ptr, + table_view_from_table, +) + +from cudf._lib import pylibcudf +from cudf._lib.concat import concat_columns cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): @@ -51,25 +59,17 @@ cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): else: raise TypeError(f"Invalid parameter for {on_bad_lines=}") - -cpdef 
read_json(object filepaths_or_buffers, - object dtype, - bool lines, - object compression, - object byte_range, - bool keep_quotes, - bool mixed_types_as_string, - bool prune_columns, - object on_bad_lines): - """ - Cython function to call into libcudf API, see `read_json`. - - See Also - -------- - cudf.io.json.read_json - cudf.io.json.to_json - """ - +cdef json_reader_options _setup_json_reader_options( + object filepaths_or_buffers, + object dtype, + object compression, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + object on_bad_lines, + bool lines, + size_type byte_range_offset, + size_type byte_range_size): # If input data is a JSON string (or StringIO), hold a reference to # the encoded memoryview externally to ensure the encoded buffer # isn't destroyed before calling libcudf `read_json()` @@ -85,14 +85,6 @@ cpdef read_json(object filepaths_or_buffers, cdef vector[data_type] c_dtypes_list cdef map[string, schema_element] c_dtypes_schema_map cdef cudf_io_types.compression_type c_compression - # Determine byte read offsets if applicable - cdef size_type c_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_type c_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef bool c_lines = lines if compression is not None: if compression == 'gzip': @@ -126,9 +118,9 @@ cpdef read_json(object filepaths_or_buffers, cdef json_reader_options opts = move( json_reader_options.builder(make_source_info(filepaths_or_buffers)) .compression(c_compression) - .lines(c_lines) - .byte_range_offset(c_range_offset) - .byte_range_size(c_range_size) + .lines(lines) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) .recovery_mode(_get_json_recovery_mode(on_bad_lines)) .build() ) @@ -141,6 +133,38 @@ cpdef read_json(object filepaths_or_buffers, opts.enable_mixed_types_as_string(mixed_types_as_string) opts.enable_prune_columns(prune_columns) + return opts + +cpdef read_json(object 
filepaths_or_buffers, + object dtype, + bool lines, + object compression, + object byte_range, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + object on_bad_lines): + """ + Cython function to call into libcudf API, see `read_json`. + + See Also + -------- + cudf.io.json.read_json + cudf.io.json.to_json + """ + # Determine byte read offsets if applicable + cdef size_type c_range_offset = ( + byte_range[0] if byte_range is not None else 0 + ) + cdef size_type c_range_size = ( + byte_range[1] if byte_range is not None else 0 + ) + cdef json_reader_options opts = _setup_json_reader_options( + filepaths_or_buffers, dtype, compression, keep_quotes, + mixed_types_as_string, prune_columns, on_bad_lines, + lines, c_range_offset, c_range_size + ) + # Read JSON cdef cudf_io_types.table_with_metadata c_result @@ -157,6 +181,71 @@ cpdef read_json(object filepaths_or_buffers, return df +cpdef chunked_read_json(object filepaths_or_buffers, + object dtype, + object compression, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + object on_bad_lines, + int chunk_size=100_000_000): + """ + Cython function to call into libcudf API, see `read_json`. 
+ + See Also + -------- + cudf.io.json.read_json + cudf.io.json.to_json + """ + cdef size_type c_range_size = ( + chunk_size if chunk_size is not None else 0 + ) + cdef json_reader_options opts = _setup_json_reader_options( + filepaths_or_buffers, dtype, compression, keep_quotes, + mixed_types_as_string, prune_columns, on_bad_lines, + True, 0, c_range_size + ) + + # Read JSON + cdef cudf_io_types.table_with_metadata c_result + final_columns = [] + meta_names = None + i = 0 + while True: + opts.set_byte_range_offset(c_range_size * i) + opts.set_byte_range_size(c_range_size) + + try: + with nogil: + c_result = move(libcudf_read_json(opts)) + except OverflowError: + break + if meta_names is None: + meta_names = [info.name.decode() for info in c_result.metadata.schema_info] + new_chunk = columns_from_unique_ptr(move(c_result.tbl)) + if len(final_columns) == 0: + final_columns = new_chunk + else: + for col_idx in range(len(meta_names)): + final_columns[col_idx] = concat_columns( + [final_columns[col_idx], new_chunk[col_idx]] + ) + # Must drop any residual GPU columns to save memory + new_chunk[col_idx] = None + i += 1 + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_table( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in final_columns] + ), + column_names=meta_names, + index_names=None + ) + ) + update_struct_field_names(df, c_result.metadata.schema_info) + + return df + @acquire_spill_lock() def write_json( diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index fc3387d5117..f830de3a468 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -94,17 +94,28 @@ def read_json( else: filepaths_or_buffers.append(tmp_source) - df = libjson.read_json( - filepaths_or_buffers=filepaths_or_buffers, - dtype=dtype, - lines=lines, - compression=compression, - byte_range=byte_range, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - on_bad_lines=on_bad_lines, - ) + 
if cudf.get_option("mode.pandas_compatible") and lines: + df = libjson.chunked_read_json( + filepaths_or_buffers=filepaths_or_buffers, + dtype=dtype, + compression=compression, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + on_bad_lines=on_bad_lines, + ) + else: + df = libjson.read_json( + filepaths_or_buffers=filepaths_or_buffers, + dtype=dtype, + lines=lines, + compression=compression, + byte_range=byte_range, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + on_bad_lines=on_bad_lines, + ) else: warnings.warn( "Using CPU via Pandas to read JSON dataset, this may " diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 297040b6d95..7260a429ba8 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1269,44 +1269,44 @@ def test_json_array_of_arrays(data, lines): [ # simple list with mixed types """{"a":[123, {}], "b":1.1}""", - """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", - """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", - """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", - """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", - """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", - # nested list with mixed types - """{"a":[123, [{"0": 123}, {}]], 
"b":1.0} - {"b":1.1} - {"a":[]} - {"a":[123]} - {"a":[[123], []]}""", - """{"a":[], "b":1.0} - {"a":[[[456]]]} - {"a":[[123]]} - {"a":[123]}""", - """{"a":[123], "b":1.0} - {"b":1.1} - {"b":2.1} - {"a":[[[[[[]]]]]]}""", - """{"a":[123], "b":1.0} - {"a":[[[[[[]]]]]]} - {"a":[[[[[[]]]]], [[[[[]]]]]]} - {"a":[[[[[[]]]], [[[[]]]]]]} - {"a":[[[[[[]]], [[[]]]]]]} - {"a":[[[[[[]], [[]]]]]]} - {"a":[[[[[[], 123, []]]]]]}""", - # mixed elements in multiple columns - """{"a":[123, {"0": 123}], "b":1.0} - {"c": ["abc"], "b":1.1} - {"c": ["abc", []] }""", + # """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + # """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", + # """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", + # """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", + # """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + # """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", + # """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + # """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", + # # nested list with mixed types + # """{"a":[123, [{"0": 123}, {}]], "b":1.0} + # {"b":1.1} + # {"a":[]} + # {"a":[123]} + # {"a":[[123], []]}""", + # """{"a":[], "b":1.0} + # {"a":[[[456]]]} + # {"a":[[123]]} + # {"a":[123]}""", + # """{"a":[123], "b":1.0} + # {"b":1.1} + # {"b":2.1} + # {"a":[[[[[[]]]]]]}""", + # """{"a":[123], "b":1.0} + # {"a":[[[[[[]]]]]]} + # {"a":[[[[[[]]]]], [[[[[]]]]]]} + # {"a":[[[[[[]]]], [[[[]]]]]]} + # {"a":[[[[[[]]], [[[]]]]]]} + # {"a":[[[[[[]], [[]]]]]]} + # {"a":[[[[[[], 123, 
[]]]]]]}""", + # # mixed elements in multiple columns + # """{"a":[123, {"0": 123}], "b":1.0} + # {"c": ["abc"], "b":1.1} + # {"c": ["abc", []] }""", ], ) def test_json_nested_mixed_types_in_list(jsonl_string): From 8427e4a27e9bb694bc87b02ea92117c135a03220 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Sat, 6 Jul 2024 00:03:12 +0000 Subject: [PATCH 2/8] add tests --- cpp/src/io/json/read_json.cu | 3 +- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/tests/test_json.py | 92 +++++++++++++++++------------ 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 74001e5e01a..9cd39038348 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -193,7 +193,8 @@ datasource::owning_buffer> get_record_range_raw_input( size_t chunk_size = reader_opts.get_byte_range_size(); CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, - "Invalid offsetting"); + "Invalid offsetting", + std::invalid_argument); auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? 
total_source_size - chunk_offset : chunk_size; diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 57870e60a3f..bba6e5d83e0 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -204,7 +204,7 @@ cpdef chunked_read_json(object filepaths_or_buffers, try: with nogil: c_result = move(libcudf_read_json(opts)) - except OverflowError: + except (ValueError, OverflowError): break if meta_names is None: meta_names = [info.name.decode() for info in c_result.metadata.schema_info] diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 7260a429ba8..4cd3ca9821d 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1269,44 +1269,44 @@ def test_json_array_of_arrays(data, lines): [ # simple list with mixed types """{"a":[123, {}], "b":1.1}""", - # """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - # """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", - # """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", - # """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", - # """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - # """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", - # """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - # """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", - # # nested list with mixed types - # """{"a":[123, [{"0": 123}, {}]], "b":1.0} - # {"b":1.1} - # {"a":[]} - # {"a":[123]} - # {"a":[[123], []]}""", - # """{"a":[], "b":1.0} - # {"a":[[[456]]]} - # 
{"a":[[123]]} - # {"a":[123]}""", - # """{"a":[123], "b":1.0} - # {"b":1.1} - # {"b":2.1} - # {"a":[[[[[[]]]]]]}""", - # """{"a":[123], "b":1.0} - # {"a":[[[[[[]]]]]]} - # {"a":[[[[[[]]]]], [[[[[]]]]]]} - # {"a":[[[[[[]]]], [[[[]]]]]]} - # {"a":[[[[[[]]], [[[]]]]]]} - # {"a":[[[[[[]], [[]]]]]]} - # {"a":[[[[[[], 123, []]]]]]}""", - # # mixed elements in multiple columns - # """{"a":[123, {"0": 123}], "b":1.0} - # {"c": ["abc"], "b":1.1} - # {"c": ["abc", []] }""", + """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", + """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", + """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", + """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", + """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", + """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", + """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", + # nested list with mixed types + """{"a":[123, [{"0": 123}, {}]], "b":1.0} + {"b":1.1} + {"a":[]} + {"a":[123]} + {"a":[[123], []]}""", + """{"a":[], "b":1.0} + {"a":[[[456]]]} + {"a":[[123]]} + {"a":[123]}""", + """{"a":[123], "b":1.0} + {"b":1.1} + {"b":2.1} + {"a":[[[[[[]]]]]]}""", + """{"a":[123], "b":1.0} + {"a":[[[[[[]]]]]]} + {"a":[[[[[[]]]]], [[[[[]]]]]]} + {"a":[[[[[[]]]], [[[[]]]]]]} + {"a":[[[[[[]]], [[[]]]]]]} + {"a":[[[[[[]], [[]]]]]]} + {"a":[[[[[[], 123, []]]]]]}""", + # mixed elements in multiple columns + """{"a":[123, {"0": 123}], "b":1.0} + {"c": ["abc"], "b":1.1} + {"c": ["abc", []] }""", ], ) def 
test_json_nested_mixed_types_in_list(jsonl_string): @@ -1423,3 +1423,19 @@ def test_json_reader_on_bad_lines(on_bad_lines): orient="records", on_bad_lines=on_bad_lines, ) + + +def test_chunked_json_reader(): + df = cudf.DataFrame( + { + "a": ["aaaa"] * 9_00_00_00, + "b": list(range(0, 9_00_00_00)), + } + ) + buf = BytesIO() + df.to_json(buf, lines=True, orient="records", engine="cudf") + buf.seek(0) + df = df.to_pandas() + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.read_json(buf, lines=True) + assert_eq(df, gdf) From a00776130dd5e247e8c78e86fa6620349e4f1e70 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 10 Jul 2024 22:10:12 +0000 Subject: [PATCH 3/8] fix syntax --- python/cudf/cudf/_lib/pylibcudf/io/json.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index ac1a78dcd0b..96d640ae1a3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -36,7 +36,7 @@ cpdef void write_json( str false_value = * ) -cdef tuple chunked_read_json( +cpdef tuple chunked_read_json( SourceInfo source_info, list dtypes = *, compression_type compression = *, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 9bb73e3ed66..b211dfc6b7d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -50,7 +50,7 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes): schema_map[c_name] = s_elem return schema_map -cdef tuple chunked_read_json( +cpdef tuple chunked_read_json( SourceInfo source_info, list dtypes = None, compression_type compression = compression_type.AUTO, @@ -143,7 +143,7 @@ cdef tuple chunked_read_json( c_result.metadata.schema_info ) new_chunk = [ - col.to_pylibcudf(mode="read") for col in 
TableWithMetadata.from_libcudf( + col for col in TableWithMetadata.from_libcudf( c_result).columns ] From 1bf55699c9cff66264e7b203fd233d5391f0883c Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 10 Jul 2024 23:07:29 +0000 Subject: [PATCH 4/8] move common code together --- python/cudf/cudf/_lib/json.pyx | 3 - python/cudf/cudf/_lib/pylibcudf/io/json.pxd | 3 - python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 101 +++++++++++--------- 3 files changed, 58 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 5a84af506de..853dd431099 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -104,9 +104,6 @@ cpdef read_json(object filepaths_or_buffers, plc.io.SourceInfo(filepaths_or_buffers), processed_dtypes, c_compression, - True, - byte_range_offset = 0, - byte_range_size = 0, keep_quotes = keep_quotes, mixed_types_as_string = mixed_types_as_string, prune_columns = prune_columns, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index 96d640ae1a3..2e0e92a054f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -40,9 +40,6 @@ cpdef tuple chunked_read_json( SourceInfo source_info, list dtypes = *, compression_type compression = *, - bool lines = *, - size_type byte_range_offset = *, - size_type byte_range_size = *, bool keep_quotes = *, bool mixed_types_as_string = *, bool prune_columns = *, diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index b211dfc6b7d..a63897904b3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -50,13 +50,46 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes): schema_map[c_name] = s_elem return schema_map +cdef json_reader_options _setup_json_reader_options( + SourceInfo source_info, + list dtypes, + compression_type 
compression, + bool lines, + size_type byte_range_offset, + size_type byte_range_size, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + json_recovery_mode_t recovery_mode): + cdef vector[data_type] types_vec + cdef json_reader_options opts = move( + json_reader_options.builder(source_info.c_obj) + .compression(compression) + .lines(lines) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .recovery_mode(recovery_mode) + .build() + ) + + if dtypes is not None: + if isinstance(dtypes[0], tuple): + opts.set_dtypes(move(_generate_schema_map(dtypes))) + else: + for dtype in dtypes: + types_vec.push_back((dtype).c_obj) + opts.set_dtypes(types_vec) + + opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) + opts.enable_prune_columns(prune_columns) + return opts + + cpdef tuple chunked_read_json( SourceInfo source_info, list dtypes = None, compression_type compression = compression_type.AUTO, - bool lines = False, - size_type byte_range_offset = 0, - size_type byte_range_size = 0, bool keep_quotes = False, bool mixed_types_as_string = False, bool prune_columns = False, @@ -99,27 +132,19 @@ cpdef tuple chunked_read_json( cdef size_type c_range_size = ( chunk_size if chunk_size is not None else 0 ) - cdef vector[data_type] types_vec - cdef json_reader_options opts = move( - json_reader_options.builder(source_info.c_obj) - .compression(compression) - .lines(lines) - .recovery_mode(recovery_mode) - .build() + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=True, + byte_range_offset=0, + byte_range_size=0, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, ) - if dtypes is not None: - if isinstance(dtypes[0], tuple): - opts.set_dtypes(move(_generate_schema_map(dtypes))) - else: - for dtype in dtypes: - 
types_vec.push_back((dtype).c_obj) - opts.set_dtypes(types_vec) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - # Read JSON cdef table_with_metadata c_result @@ -205,29 +230,19 @@ cpdef TableWithMetadata read_json( TableWithMetadata The Table and its corresponding metadata (column names) that were read in. """ - cdef vector[data_type] types_vec - cdef json_reader_options opts = move( - json_reader_options.builder(source_info.c_obj) - .compression(compression) - .lines(lines) - .byte_range_offset(byte_range_offset) - .byte_range_size(byte_range_size) - .recovery_mode(recovery_mode) - .build() + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=lines, + byte_range_offset=byte_range_offset, + byte_range_size=byte_range_size, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, ) - if dtypes is not None: - if isinstance(dtypes[0], tuple): - opts.set_dtypes(move(_generate_schema_map(dtypes))) - else: - for dtype in dtypes: - types_vec.push_back((dtype).c_obj) - opts.set_dtypes(types_vec) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - # Read JSON cdef table_with_metadata c_result From 872a1fe11d0d5139c258cc618f0014b7ae73d6c7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 10 Jul 2024 23:07:54 +0000 Subject: [PATCH 5/8] move common code together --- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index a63897904b3..f4bbc0dc4d4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -50,6 +50,7 @@ cdef map[string, 
schema_element] _generate_schema_map(list dtypes): schema_map[c_name] = s_elem return schema_map + cdef json_reader_options _setup_json_reader_options( SourceInfo source_info, list dtypes, @@ -61,6 +62,7 @@ cdef json_reader_options _setup_json_reader_options( bool mixed_types_as_string, bool prune_columns, json_recovery_mode_t recovery_mode): + cdef vector[data_type] types_vec cdef json_reader_options opts = move( json_reader_options.builder(source_info.c_obj) From bf2578edc0e11352100e12e16373f667a8cd9758 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 11 Jul 2024 10:29:55 -0500 Subject: [PATCH 6/8] update docstring --- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index f4bbc0dc4d4..3c250cde4a0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -112,24 +112,25 @@ cpdef tuple chunked_read_json( the list of child dtypes is an empty list if the child is not a nested type (list or struct dtype), and is of format (column_child_name, column_child_type, list of grandchild dtypes). - compression_type: CompressionType, default CompressionType.AUTO + compression: CompressionType, default CompressionType.AUTO The compression format of the JSON source. - byte_range_offset : size_type, default 0 - Number of bytes to skip from source start. - byte_range_size : size_type, default 0 - Number of bytes to read. By default, will read all bytes. keep_quotes : bool, default False Whether the reader should keep quotes of string values. + mixed_types_as_string : bool, default False + If True, mixed type columns are returned as string columns. + If `False` parsing mixed type columns will thrown an error. prune_columns : bool, default False Whether to only read columns specified in dtypes. 
     recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
         Whether to raise an error or set corresponding values to null
         when encountering an invalid JSON line.
+    chunk_size : int, default 100_000_000 bytes.
+        The number of bytes to be read in chunks.
 
     Returns
     -------
-    TableWithMetadata
-    The Table and its corresponding metadata (column names) that were read in.
+    tuple
+    A tuple of (columns, column_name, child_names)
     """
     cdef size_type c_range_size = (
         chunk_size if chunk_size is not None else 0
@@ -213,7 +214,7 @@ cpdef TableWithMetadata read_json(
         the list of child dtypes is an empty list if the child is not a nested
         type (list or struct dtype), and is of format
         (column_child_name, column_child_type, list of grandchild dtypes).
-    compression_type: CompressionType, default CompressionType.AUTO
+    compression: CompressionType, default CompressionType.AUTO
         The compression format of the JSON source.
     byte_range_offset : size_type, default 0
         Number of bytes to skip from source start.
     byte_range_size : size_type, default 0
         Number of bytes to read. By default, will read all bytes.
     keep_quotes : bool, default False
         Whether the reader should keep quotes of string values.
+    mixed_types_as_string : bool, default False
+        If True, mixed type columns are returned as string columns.
+        If `False` parsing mixed type columns will throw an error.
     prune_columns : bool, default False
         Whether to only read columns specified in dtypes. 
recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL From 64992f8fbf244e658371f3412595dd5b79a49f69 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 12 Jul 2024 16:04:54 -0500 Subject: [PATCH 7/8] Update python/cudf/cudf/_lib/pylibcudf/io/json.pyx Co-authored-by: Shruti Shivakumar --- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 3c250cde4a0..42aeceb9d35 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -125,7 +125,7 @@ cpdef tuple chunked_read_json( Whether to raise an error or set corresponding values to null when encountering an invalid JSON line. chunk_size : int, default 100_000_000 bytes. - The number of bytes to be read in chunks. + The number of bytes to be read in chunks. The chunk_size should be set to at least row_size. Returns ------- From a70d841bd8c89f859f263b3a00fb0362e13b2183 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 12 Jul 2024 16:31:55 -0500 Subject: [PATCH 8/8] Update json.pyx --- python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 42aeceb9d35..2710ee60075 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -125,7 +125,8 @@ cpdef tuple chunked_read_json( Whether to raise an error or set corresponding values to null when encountering an invalid JSON line. chunk_size : int, default 100_000_000 bytes. - The number of bytes to be read in chunks. The chunk_size should be set to at least row_size. + The number of bytes to be read in chunks. + The chunk_size should be set to at least row_size. Returns -------