From b52c5e8116a1bfdb99509168cb1f824c4ca30d83 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 17 Dec 2024 10:51:47 -0800 Subject: [PATCH 1/5] Add JSON writer options structs to pylibcudf --- python/cudf/cudf/io/json.py | 46 +- python/cudf_polars/cudf_polars/dsl/ir.py | 10 +- python/pylibcudf/pylibcudf/io/json.pxd | 66 ++- python/pylibcudf/pylibcudf/io/json.pyi | 53 +- python/pylibcudf/pylibcudf/io/json.pyx | 500 +++++++++++++----- .../pylibcudf/pylibcudf/tests/io/test_json.py | 57 +- 6 files changed, 506 insertions(+), 226 deletions(-) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 4f0709ec985..410a31a7ea0 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -161,13 +161,15 @@ def read_json( if cudf.get_option("io.json.low_memory") and lines: res_cols, res_col_names, res_child_names = ( plc.io.json.chunked_read_json( - plc.io.SourceInfo(filepaths_or_buffers), - processed_dtypes, - c_compression, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - recovery_mode=c_on_bad_lines, + plc.io.json._setup_json_reader_options( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=c_on_bad_lines, + ) ) ) df = cudf.DataFrame._from_data( @@ -181,19 +183,23 @@ def read_json( return df else: table_w_meta = plc.io.json.read_json( - plc.io.SourceInfo(filepaths_or_buffers), - processed_dtypes, - c_compression, - lines, - byte_range_offset=byte_range[0] - if byte_range is not None - else 0, - byte_range_size=byte_range[1] if byte_range is not None else 0, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - recovery_mode=c_on_bad_lines, - extra_parameters=kwargs, + plc.io.json._setup_json_reader_options( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + 
c_compression, + lines, + byte_range_offset=byte_range[0] + if byte_range is not None + else 0, + byte_range_size=byte_range[1] + if byte_range is not None + else 0, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=c_on_bad_lines, + extra_parameters=kwargs, + ) ) df = cudf.DataFrame._from_data( diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index b5af3bb80bf..1c1d4860eec 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -604,10 +604,12 @@ def slice_skip(tbl: plc.Table): (name, typ, []) for name, typ in schema.items() ] plc_tbl_w_meta = plc.io.json.read_json( - plc.io.SourceInfo(paths), - lines=True, - dtypes=json_schema, - prune_columns=True, + plc.io.json._setup_json_reader_options( + plc.io.SourceInfo(paths), + lines=True, + dtypes=json_schema, + prune_columns=True, + ) ) # TODO: I don't think cudf-polars supports nested types in general right now # (but when it does, we should pass child column names from nested columns in) diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd index d7726971351..1cb3fcf1dd5 100644 --- a/python/pylibcudf/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -6,23 +6,53 @@ from pylibcudf.io.types cimport ( TableWithMetadata, compression_type, ) -from pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from pylibcudf.libcudf.io.json cimport ( + json_recovery_mode_t, + json_reader_options, + json_reader_options_builder, +) from pylibcudf.libcudf.types cimport size_type -cpdef TableWithMetadata read_json( - SourceInfo source_info, - list dtypes = *, - compression_type compression = *, - bool lines = *, - size_t byte_range_offset = *, - size_t byte_range_size = *, - bool keep_quotes = *, - bool mixed_types_as_string = *, - bool prune_columns = *, - json_recovery_mode_t recovery_mode = *, - dict 
extra_parameters = *, -) +cdef class JsonReaderOptions: + cdef json_reader_options c_obj + cdef SourceInfo source + # cpdef void set_dtypes_list(self, list types) + # cpdef void set_dtypes_map(self, dict types) + cpdef void set_dtypes(self, list types) + cpdef void enable_keep_quotes(self, bool keep_quotes) + cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string) + cpdef void enable_prune_columns(self, bool prune_columns) + cpdef void set_byte_range_offset(self, size_t offset) + cpdef void set_byte_range_size(self, size_t size) + cpdef void enable_lines(self, bool val) + # These hidden options are subjected to change without deprecation cycle. + # These are used to test libcudf JSON reader features, not used in cuDF. + cpdef void set_delimiter(self, str val) + cpdef void enable_dayfirst(self, bool val) + cpdef void enable_experimental(self, bool val) + cpdef void enable_normalize_single_quotes(self, bool val) + cpdef void enable_normalize_whitespace(self, bool val) + cpdef void set_strict_validation(self, bool val) + cpdef void allow_unquoted_control_chars(self, bool val) + cpdef void allow_numeric_leading_zeros(self, bool val) + cpdef void allow_nonnumeric_numbers(self, bool val) + cpdef void set_na_values(self, list vals) + +cdef class JsonReaderOptionsBuilder: + cdef json_reader_options_builder c_obj + cdef SourceInfo source + cpdef JsonReaderOptionsBuilder compression(self, compression_type compression) + cpdef JsonReaderOptionsBuilder lines(self, bool val) + cpdef JsonReaderOptionsBuilder keep_quotes(self, bool val) + cpdef JsonReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset) + cpdef JsonReaderOptionsBuilder byte_range_size(self, size_t byte_range_size) + cpdef JsonReaderOptionsBuilder recovery_mode( + self, json_recovery_mode_t recovery_mode + ) + cpdef build(self) + +cpdef TableWithMetadata read_json(JsonReaderOptions options) cpdef void write_json( @@ -37,12 +67,6 @@ cpdef void write_json( ) cpdef tuple 
chunked_read_json( - SourceInfo source_info, - list dtypes = *, - compression_type compression = *, - bool keep_quotes = *, - bool mixed_types_as_string = *, - bool prune_columns = *, - json_recovery_mode_t recovery_mode = *, + JsonReaderOptions options, int chunk_size= *, ) diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi index b2bc6a43700..641def67fd8 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyi +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -2,6 +2,8 @@ from collections.abc import Mapping from typing import TypeAlias +from typing_extensions import Self + from pylibcudf.column import Column from pylibcudf.io.types import ( CompressionType, @@ -16,18 +18,37 @@ ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]] -def read_json( - source_info: SourceInfo, - dtypes: list[NameAndType] | None = None, - compression: CompressionType = CompressionType.AUTO, - lines: bool = False, - byte_range_offset: int = 0, - byte_range_size: int = 0, - keep_quotes: bool = False, - mixed_types_as_string: bool = False, - prune_columns: bool = False, - recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, -) -> TableWithMetadata: ... +class JsonReaderOptions: + def set_dtypes( + self, types: list[DataType] | list[NameAndType] + ) -> None: ... + def enable_keep_quotes(self, keep_quotes: bool) -> None: ... + def enable_mixed_types_as_string( + self, mixed_types_as_string: bool + ) -> None: ... + def enable_prune_columns(self, prune_columns: bool) -> None: ... + def set_delimiter(self, val: str) -> None: ... + def enable_dayfirst(self, val: bool) -> None: ... + def enable_experimental(self, val: bool) -> None: ... + def enable_normalize_single_quotes(self, val: bool) -> None: ... + def enable_normalize_whitespace(self, val: bool) -> None: ... + def set_strict_validation(self, val: bool) -> None: ... 
+ def allow_unquoted_control_chars(self, val: bool) -> None: ... + def allow_numeric_leading_zeros(self, val: bool) -> None: ... + def allow_nonnumeric_numbers(self, val: bool) -> None: ... + def set_na_values(self, vals: list[str]) -> None: ... + @staticmethod + def builder(source: SourceInfo) -> JsonReaderOptionsBuilder: ... + +class JsonReaderOptionsBuilder: + def compression(self, compression: CompressionType) -> Self: ... + def lines(self, lines: bool) -> Self: ... + def byte_range_offset(self, byte_range_offset: int) -> Self: ... + def byte_range_size(self, byte_range_size: int) -> Self: ... + def recovery_mode(self, recovery_mode: JSONRecoveryMode) -> Self: ... + def build(self) -> JsonReaderOptions: ... + +def read_json(options: JsonReaderOptions) -> TableWithMetadata: ... def write_json( sink_info: SinkInfo, table_w_meta: TableWithMetadata, @@ -39,12 +60,6 @@ def write_json( false_value: str = "false", ) -> None: ... def chunked_read_json( - source_info: SourceInfo, - dtypes: list[NameAndType] | None = None, - compression: CompressionType = CompressionType.AUTO, - keep_quotes: bool = False, - mixed_types_as_string: bool = False, - prune_columns: bool = False, - recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, + options: JsonReaderOptions, chunk_size: int = 100_000_000, ) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ... 
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 32f737fbff4..97eeee856db 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -23,7 +23,13 @@ from pylibcudf.libcudf.io.types cimport ( from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType -__all__ = ["chunked_read_json", "read_json", "write_json"] +__all__ = [ + "chunked_read_json", + "read_json", + "write_json", + "JsonReaderOptions", + "JsonReaderOptionsBuilder" +] cdef map[string, schema_element] _generate_schema_map(list dtypes): cdef map[string, schema_element] schema_map @@ -47,23 +53,21 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes): return schema_map -cdef json_reader_options _setup_json_reader_options( +cpdef JsonReaderOptions _setup_json_reader_options( SourceInfo source_info, list dtypes, - compression_type compression, - bool lines, - size_t byte_range_offset, - size_t byte_range_size, - bool keep_quotes, - bool mixed_types_as_string, - bool prune_columns, - json_recovery_mode_t recovery_mode, - dict extra_parameters=None): - - cdef vector[string] na_vec - cdef vector[data_type] types_vec - cdef json_reader_options opts = ( - json_reader_options.builder(source_info.c_obj) + compression_type compression = compression_type.AUTO, + bool lines = False, + size_t byte_range_offset = 0, + size_t byte_range_size = 0, + bool keep_quotes = False, + bool mixed_types_as_string = False, + bool prune_columns = False, + json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, + dict extra_parameters=None, +): + options = ( + JsonReaderOptions.builder(source_info) .compression(compression) .lines(lines) .byte_range_offset(byte_range_offset) @@ -73,88 +77,358 @@ cdef json_reader_options _setup_json_reader_options( ) if dtypes is not None: - if isinstance(dtypes[0], tuple): - opts.set_dtypes(move(_generate_schema_map(dtypes))) - else: - for dtype in 
dtypes: - types_vec.push_back((dtype).c_obj) - opts.set_dtypes(types_vec) + options.set_dtypes(dtypes) - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) + options.enable_keep_quotes(keep_quotes) + options.enable_mixed_types_as_string(mixed_types_as_string) + options.enable_prune_columns(prune_columns) # These hidden options are subjected to change without deprecation cycle. # These are used to test libcudf JSON reader features, not used in cuDF. if extra_parameters is not None: for key, value in extra_parameters.items(): if key == 'delimiter': - opts.set_delimiter(ord(value)) + options.set_delimiter(value) elif key == 'dayfirst': - opts.enable_dayfirst(value) + options.enable_dayfirst(value) elif key == 'experimental': - opts.enable_experimental(value) + options.enable_experimental(value) elif key == 'normalize_single_quotes': - opts.enable_normalize_single_quotes(value) + options.enable_normalize_single_quotes(value) elif key == 'normalize_whitespace': - opts.enable_normalize_whitespace(value) + options.enable_normalize_whitespace(value) elif key == 'strict_validation': - opts.set_strict_validation(value) + options.set_strict_validation(value) elif key == 'allow_unquoted_control_chars': - opts.allow_unquoted_control_chars(value) + options.allow_unquoted_control_chars(value) elif key == 'allow_numeric_leading_zeros': - opts.allow_numeric_leading_zeros(value) + options.allow_numeric_leading_zeros(value) elif key == 'allow_nonnumeric_numbers': - opts.allow_nonnumeric_numbers(value) + options.allow_nonnumeric_numbers(value) elif key == 'na_values': - for na_val in value: - if isinstance(na_val, str): - na_vec.push_back(na_val.encode()) - opts.set_na_values(na_vec) + options.set_na_values(value) else: raise ValueError( "cudf engine doesn't support the " f"'{key}' keyword argument for read_json" ) - return opts + return options + + +cdef class JsonReaderOptions: + """ + The 
settings to use for ``read_json`` + + For details, see :cpp:class:`cudf::io::json_reader_options` + """ + @staticmethod + def builder(SourceInfo source): + """ + Create a JsonReaderOptionsBuilder object + + For details, see :cpp:func:`cudf::io::json_reader_options::builder` + + Parameters + ---------- + source : SourceInfo + The source to read the JSON file from. + + Returns + ------- + JsonReaderOptionsBuilder + Builder to build JsonReaderOptions + """ + cdef JsonReaderOptionsBuilder json_builder = ( + JsonReaderOptionsBuilder.__new__(JsonReaderOptionsBuilder) + ) + json_builder.c_obj = json_reader_options.builder(source.c_obj) + json_builder.source = source + return json_builder + + cpdef void set_dtypes(self, list types): + """ + Set data types for columns to be read. + + Parameters + ---------- + types : list + List of dtypes or a list of tuples of + column names, dtypes, and list of tuples + (to support nested column hierarchy) + + Returns + ------- + None + """ + cdef vector[data_type] types_vec + if isinstance(types[0], tuple): + self.c_obj.set_dtypes(_generate_schema_map(types)) + else: + for dtype in types: + types_vec.push_back((dtype).c_obj) + self.c_obj.set_dtypes(types_vec) + + cpdef void enable_keep_quotes(self, bool keep_quotes): + """ + Set whether the reader should keep quotes of string values. + + Parameters + ---------- + keep_quotes : bool + Boolean value to indicate whether the reader should + keep quotes of string values + + Returns + ------- + None + """ + self.c_obj.enable_keep_quotes(keep_quotes) + + cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string): + """ + Set whether to parse mixed types as a string column. + Also enables forcing to read a struct as string column using schema. 
+ + Parameters + ---------- + mixed_types_as_string : bool + Boolean value to enable/disable parsing mixed types + as a string column + + Returns + ------- + None + """ + self.c_obj.enable_mixed_types_as_string(mixed_types_as_string) + + cpdef void enable_prune_columns(self, bool prune_columns): + """ + Set whether to prune columns on read, selected + based on the ``set_dtypes`` option. + + Parameters + ---------- + prune_columns : bool + When set as true, if the reader options include + ``set_dtypes``, then the reader will only return those + columns which are mentioned in ``set_dtypes``. If false, + then all columns are returned, independent of the + ``set_dtypes`` setting. + + Returns + ------- + None + """ + self.c_obj.enable_prune_columns(prune_columns) + + cpdef void set_byte_range_offset(self, size_t offset): + """ + Set number of bytes to skip from source start. + + Parameters + ---------- + offset : size_t + Number of bytes of offset + + Returns + ------- + None + """ + self.c_obj.set_byte_range_offset(offset) + + cpdef void set_byte_range_size(self, size_t size): + """ + Set number of bytes to read. + + Parameters + ---------- + size : size_t + Number of bytes to read + + Returns + ------- + None + """ + self.c_obj.set_byte_range_size(size) + + cpdef void enable_lines(self, bool val): + """ + Set whether to read the file as a json object per line. + + Parameters + ---------- + val : bool + Boolean value to enable/disable the option + to read each line as a json object + + Returns + ------- + None + """ + self.c_obj.enable_lines(val) + + # These hidden options are subjected to change without deprecation cycle. + # These are used to test libcudf JSON reader features, not used in cuDF. 
+ + cpdef void set_delimiter(self, str val): + self.c_obj.set_delimiter(val.encode()) + + cpdef void enable_dayfirst(self, bool val): + self.c_obj.enable_dayfirst(val) + + cpdef void enable_experimental(self, bool val): + self.c_obj.enable_experimental(val) + + cpdef void enable_normalize_single_quotes(self, bool val): + self.c_obj.enable_normalize_single_quotes(val) + + cpdef void enable_normalize_whitespace(self, bool val): + self.c_obj.enable_normalize_whitespace(val) + + cpdef void set_strict_validation(self, bool val): + self.c_obj.set_strict_validation(val) + + cpdef void allow_unquoted_control_chars(self, bool val): + self.c_obj.allow_unquoted_control_chars(val) + + cpdef void allow_numeric_leading_zeros(self, bool val): + self.c_obj.allow_numeric_leading_zeros(val) + + cpdef void allow_nonnumeric_numbers(self, bool val): + self.c_obj.allow_nonnumeric_numbers(val) + + cpdef void set_na_values(self, list vals): + cdef vector[string] vec + for val in vals: + if isinstance(val, str): + vec.push_back(val.encode()) + self.c_obj.set_na_values(vec) + + +cdef class JsonReaderOptionsBuilder: + cpdef JsonReaderOptionsBuilder compression(self, compression_type compression): + """ + Sets compression type. + + Parameters + ---------- + compression : CompressionType + The compression type to use + + Returns + ------- + Self + """ + self.c_obj.compression(compression) + return self + + cpdef JsonReaderOptionsBuilder lines(self, bool val): + """ + Set whether to read the file as a json object per line. + + Parameters + ---------- + val : bool + Boolean value to enable/disable the option + to read each line as a json object + + Returns + ------- + Self + """ + self.c_obj.lines(val) + return self + + cpdef JsonReaderOptionsBuilder keep_quotes(self, bool val): + """ + Set whether the reader should keep quotes of string values. 
+ + Parameters + ---------- + val : bool + Boolean value to indicate whether the + reader should keep quotes of string values + + Returns + ------- + Self + """ + self.c_obj.keep_quotes(val) + return self + + cpdef JsonReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset): + """ + Set number of bytes to skip from source start. + + Parameters + ---------- + byte_range_offset : size_t + Number of bytes of offset + + Returns + ------- + Self + """ + self.c_obj.byte_range_offset(byte_range_offset) + return self + + cpdef JsonReaderOptionsBuilder byte_range_size(self, size_t byte_range_size): + """ + Set number of bytes to read. + + Parameters + ---------- + byte_range_size : size_t + Number of bytes to read + + Returns + ------- + Self + """ + self.c_obj.byte_range_size(byte_range_size) + return self + + cpdef JsonReaderOptionsBuilder recovery_mode( + self, + json_recovery_mode_t recovery_mode + ): + """ + Specifies the JSON reader's behavior on invalid JSON lines. + + Parameters + ---------- + recovery_mode : json_recovery_mode_t + An enum value to indicate the JSON reader's + behavior on invalid JSON lines. + + Returns + ------- + Self + """ + self.c_obj.recovery_mode(recovery_mode) + return self + + cpdef build(self): + """Create a JsonReaderOptions object""" + cdef JsonReaderOptions json_options = JsonReaderOptions.__new__( + JsonReaderOptions + ) + json_options.c_obj = move(self.c_obj.build()) + json_options.source = self.source + return json_options cpdef tuple chunked_read_json( - SourceInfo source_info, - list dtypes = None, - compression_type compression = compression_type.AUTO, - bool keep_quotes = False, - bool mixed_types_as_string = False, - bool prune_columns = False, - json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, + JsonReaderOptions options, int chunk_size=100_000_000, ): - """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. 
+ """ + Reads chunks of a JSON file into a :py:class:`~.types.TableWithMetadata`. Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the JSON file from. - dtypes : list, default None - Set data types for the columns in the JSON file. - - Each element of the list has the format - (column_name, column_dtype, list of child dtypes), where - the list of child dtypes is an empty list if the child is not - a nested type (list or struct dtype), and is of format - (column_child_name, column_child_type, list of grandchild dtypes). - compression: CompressionType, default CompressionType.AUTO - The compression format of the JSON source. - keep_quotes : bool, default False - Whether the reader should keep quotes of string values. - mixed_types_as_string : bool, default False - If True, mixed type columns are returned as string columns. - If `False` parsing mixed type columns will thrown an error. - prune_columns : bool, default False - Whether to only read columns specified in dtypes. - recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL - Whether to raise an error or set corresponding values to null - when encountering an invalid JSON line. + options : JsonReaderOptions + Settings for controlling reading behavior chunk_size : int, default 100_000_000 bytes. The number of bytes to be read in chunks. The chunk_size should be set to at least row_size. 
@@ -167,20 +441,6 @@ cpdef tuple chunked_read_json( cdef size_type c_range_size = ( chunk_size if chunk_size is not None else 0 ) - cdef json_reader_options opts = _setup_json_reader_options( - source_info=source_info, - dtypes=dtypes, - compression=compression, - lines=True, - byte_range_offset=0, - byte_range_size=0, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - recovery_mode=recovery_mode, - ) - - # Read JSON cdef table_with_metadata c_result final_columns = [] @@ -188,12 +448,13 @@ cpdef tuple chunked_read_json( child_names = None i = 0 while True: - opts.set_byte_range_offset(c_range_size * i) - opts.set_byte_range_size(c_range_size) + options.enable_lines(True) + options.set_byte_range_offset(c_range_size * i) + options.set_byte_range_size(c_range_size) try: with nogil: - c_result = move(cpp_read_json(opts)) + c_result = move(cpp_read_json(options.c_obj)) except (ValueError, OverflowError): break if meta_names is None: @@ -221,75 +482,30 @@ cpdef tuple chunked_read_json( cpdef TableWithMetadata read_json( - SourceInfo source_info, - list dtypes = None, - compression_type compression = compression_type.AUTO, - bool lines = False, - size_t byte_range_offset = 0, - size_t byte_range_size = 0, - bool keep_quotes = False, - bool mixed_types_as_string = False, - bool prune_columns = False, - json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, - dict extra_parameters = None, + JsonReaderOptions options ): - """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. + """ + Read from JSON format. + + The source to read from and options are encapsulated + by the `options` object. + + For details, see :cpp:func:`read_json`. Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the JSON file from. - dtypes : list, default None - Set data types for the columns in the JSON file. 
- - Each element of the list has the format - (column_name, column_dtype, list of child dtypes), where - the list of child dtypes is an empty list if the child is not - a nested type (list or struct dtype), and is of format - (column_child_name, column_child_type, list of grandchild dtypes). - compression: CompressionType, default CompressionType.AUTO - The compression format of the JSON source. - byte_range_offset : size_t, default 0 - Number of bytes to skip from source start. - byte_range_size : size_t, default 0 - Number of bytes to read. By default, will read all bytes. - keep_quotes : bool, default False - Whether the reader should keep quotes of string values. - mixed_types_as_string : bool, default False - If True, mixed type columns are returned as string columns. - If `False` parsing mixed type columns will thrown an error. - prune_columns : bool, default False - Whether to only read columns specified in dtypes. - recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL - Whether to raise an error or set corresponding values to null - when encountering an invalid JSON line. - extra_parameters : dict, default None - Additional hidden parameters to pass to the JSON reader. + options: JsonReaderOptions + Settings for controlling reading behavior Returns ------- TableWithMetadata The Table and its corresponding metadata (column names) that were read in. 
""" - cdef json_reader_options opts = _setup_json_reader_options( - source_info=source_info, - dtypes=dtypes, - compression=compression, - lines=lines, - byte_range_offset=byte_range_offset, - byte_range_size=byte_range_size, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - recovery_mode=recovery_mode, - extra_parameters=extra_parameters, - ) - - # Read JSON cdef table_with_metadata c_result with nogil: - c_result = move(cpp_read_json(opts)) + c_result = move(cpp_read_json(options.c_obj)) return TableWithMetadata.from_libcudf(c_result) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 453e5ce32a8..b9cb4965acd 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -150,9 +150,12 @@ def test_read_json_basic( source.seek(0) res = plc.io.json.read_json( - plc.io.SourceInfo([source]), - compression=compression_type, - lines=lines, + ( + plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(compression_type) + .lines(lines) + .build() + ) ) # Adjustments to correct for the fact orient=records is lossy @@ -226,9 +229,14 @@ def get_child_types(typ): new_schema = pa.schema(new_fields) - res = plc.io.json.read_json( - plc.io.SourceInfo([source]), dtypes=dtypes, lines=True + options = ( + plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source])) + .lines(True) + .build() ) + options.set_dtypes(dtypes) + + res = plc.io.json.read_json(options) new_table = pa_table.cast(new_schema) # orient=records is lossy @@ -252,10 +260,15 @@ def test_read_json_lines_byte_range(source_or_sink, chunk_size): for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size): tbls_w_meta.append( plc.io.json.read_json( - plc.io.SourceInfo([source]), - lines=True, - byte_range_offset=chunk_start, - byte_range_size=chunk_start + chunk_size, + ( + 
plc.io.json.JsonReaderOptions.builder( + plc.io.SourceInfo([source]) + ) + .lines(True) + .byte_range_offset(chunk_start) + .byte_range_size(chunk_start + chunk_size) + .build() + ) ) ) @@ -285,7 +298,12 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink): write_source_str(source, json_bytes) tbl_w_meta = plc.io.json.read_json( - plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes + ( + plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source])) + .lines(True) + .keep_quotes(keep_quotes) + .build() + ) ) template = "{0}" @@ -313,20 +331,19 @@ def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink): json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' write_source_str(source, json_str) + options = ( + plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source])) + .lines(True) + .recovery_mode(recovery_mode) + .build() + ) + if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL: with pytest.raises(RuntimeError): - plc.io.json.read_json( - plc.io.SourceInfo([source]), - lines=True, - recovery_mode=recovery_mode, - ) + plc.io.json.read_json(options) else: # Recover case (bad values replaced with nulls) - tbl_w_meta = plc.io.json.read_json( - plc.io.SourceInfo([source]), - lines=True, - recovery_mode=recovery_mode, - ) + tbl_w_meta = plc.io.json.read_json(options) exp = pa.Table.from_arrays( [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"] ) From bc7a600486c74ab2002a8cb6b22a2a9ef85b8150 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 17 Dec 2024 10:54:21 -0800 Subject: [PATCH 2/5] add missing type stubs --- python/pylibcudf/pylibcudf/io/json.pyi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi index 641def67fd8..46125ff9416 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyi +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -27,6 +27,9 @@ class JsonReaderOptions: self, mixed_types_as_string: bool ) -> 
None: ... def enable_prune_columns(self, prune_columns: bool) -> None: ... + def set_byte_range_offset(self, offset: int) -> None: ... + def set_byte_range_size(self, size: int) -> None: ... + def enable_lines(self, val: bool) -> None: ... def set_delimiter(self, val: str) -> None: ... def enable_dayfirst(self, val: bool) -> None: ... def enable_experimental(self, val: bool) -> None: ... From c3995975e798859cb1bdf164bd06a03b881ee5e3 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:18:11 -0500 Subject: [PATCH 3/5] clean up --- python/pylibcudf/pylibcudf/io/json.pxd | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd index 1cb3fcf1dd5..6e548a67937 100644 --- a/python/pylibcudf/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -17,8 +17,6 @@ from pylibcudf.libcudf.types cimport size_type cdef class JsonReaderOptions: cdef json_reader_options c_obj cdef SourceInfo source - # cpdef void set_dtypes_list(self, list types) - # cpdef void set_dtypes_map(self, dict types) cpdef void set_dtypes(self, list types) cpdef void enable_keep_quotes(self, bool keep_quotes) cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string) From 218056e8f4f496aa8adb1ee401f9e793cfdce575 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:17:25 -0500 Subject: [PATCH 4/5] reserve memory for vector --- python/pylibcudf/pylibcudf/io/json.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 97eeee856db..d8437e8bad7 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -164,6 +164,7 @@ cdef class JsonReaderOptions: if isinstance(types[0], tuple): self.c_obj.set_dtypes(_generate_schema_map(types)) else: + 
types_vec.reserve(len(types)) for dtype in types: types_vec.push_back((dtype).c_obj) self.c_obj.set_dtypes(types_vec) From b5444a38a8d8753756180e0ed378612214b2b5d7 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 18 Dec 2024 15:24:55 +0000 Subject: [PATCH 5/5] clean up --- python/pylibcudf/pylibcudf/io/json.pyi | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi index 58cd1bfd9d6..b84b437a3a2 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyi +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -69,7 +69,6 @@ class JsonWriterOptionsBuilder: def build(self) -> JsonWriterOptions: ... def write_json(options: JsonWriterOptions) -> None: ... - def chunked_read_json( options: JsonReaderOptions, chunk_size: int = 100_000_000,