From e940e30c1d07274b2832af4ddc6f85525ac5a2c0 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 27 Jun 2024 21:44:41 +0000 Subject: [PATCH] Address code review Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/_lib/pylibcudf/io/json.pxd | 2 - python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 6 +- python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 76 ++++++++--------- .../cudf/cudf/pylibcudf_tests/common/utils.py | 10 +-- python/cudf/cudf/pylibcudf_tests/conftest.py | 81 +++++++------------ .../cudf/cudf/pylibcudf_tests/io/test_json.py | 2 +- 6 files changed, 71 insertions(+), 106 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index 27e3db1e9a9..a91d574131f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -1,10 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.string cimport string from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata -from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type from cudf._lib.pylibcudf.libcudf.types cimport size_type diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 54b899ab6ed..522f07c0a69 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -48,8 +48,6 @@ cpdef void write_json( """ cdef table_metadata tbl_meta = table_w_meta.metadata cdef string na_rep_c = na_rep.encode() - cdef string true_value_c = true_value.encode() - cdef string false_value_c = false_value.encode() cdef json_writer_options options = move( json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view()) @@ -63,9 +61,9 @@ cpdef void write_json( if rows_per_chunk != numeric_limits[size_type].max(): options.set_rows_per_chunk(rows_per_chunk) if true_value != "true": - options.set_true_value(true_value_c) + options.set_true_value(true_value.encode()) if false_value != "false": - options.set_false_value(false_value_c) + options.set_false_value(false_value.encode()) with nogil: cpp_write_json(options) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index acfb02bf6be..64081c15d66 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -55,7 +55,7 @@ cdef class TableWithMetadata: if not isinstance(name, str): raise ValueError("Column name must be a string!") - info.name = move( name.encode()) + info.name = name.encode() info.children = move(self._make_column_info(child_names)) col_name_infos.push_back(info) @@ -200,60 +200,48 @@ cdef class SinkInfo: def __init__(self, list sinks): cdef vector[data_sink *] data_sinks - cdef unique_ptr[data_sink] sink - cdef vector[string] paths if not sinks: raise ValueError("Need to pass at least one sink") - if isinstance(sinks[0], io.StringIO): - data_sinks.reserve(len(sinks)) - for s in sinks: - if not isinstance(s, io.StringIO): - raise ValueError("All sinks must be of the same type!") - self.sink_storage.push_back( - unique_ptr[data_sink](new iobase_data_sink(s)) - ) - data_sinks.push_back(self.sink_storage.back().get()) - self.c_obj = sink_info(data_sinks) - elif isinstance(sinks[0], io.IOBase): - data_sinks.reserve(len(sinks)) - for s in sinks: - # Files opened in text mode expect writes to be str rather than - # bytes, which requires conversion from utf-8. If the underlying - # buffer is utf-8, we can bypass this conversion by writing - # directly to it. - if isinstance(s, io.TextIOBase): + cdef object initial_sink_cls = type(sinks[0]) + + for s in sinks: + if not isinstance(s, initial_sink_cls): + raise ValueError("All sinks must be of the same type!") + if isinstance(s, str): + paths.reserve(len(sinks)) + paths.push_back( s.encode()) + elif isinstance(s, os.PathLike): + paths.reserve(len(sinks)) + paths.push_back( os.path.expanduser(s).encode()) + else: + data_sinks.reserve(len(sinks)) + if isinstance(s, (io.StringIO, io.BytesIO)): + self.sink_storage.push_back( + unique_ptr[data_sink](new iobase_data_sink(s)) + ) + elif isinstance(s, io.TextIOBase): if codecs.lookup(s.encoding).name not in { "utf-8", "ascii", }: - raise NotImplementedError(f"Unsupported encoding {s.encoding}") - sink = move(unique_ptr[data_sink](new iobase_data_sink(s.buffer))) - elif isinstance(s, io.BytesIO): - sink = move(unique_ptr[data_sink](new iobase_data_sink(s))) + raise NotImplementedError( + f"Unsupported encoding {s.encoding}" + ) + self.sink_storage.push_back( + unique_ptr[data_sink](new iobase_data_sink(s.buffer)) + ) else: - raise ValueError("All sinks must be of the same type!") + raise TypeError( + "Unrecognized input type: {}".format(type(sinks[0])) + ) - self.sink_storage.push_back( - move(sink) - ) data_sinks.push_back(self.sink_storage.back().get()) + + if data_sinks.size() > 0: self.c_obj = sink_info(data_sinks) - elif isinstance(sinks[0], str): - paths.reserve(len(sinks)) - for s in sinks: - if not isinstance(s, str): - raise ValueError("All sinks must be of the same type!") - paths.push_back( s.encode()) - self.c_obj = sink_info(move(paths)) - elif isinstance(sinks[0], os.PathLike): - paths.reserve(len(sinks)) - for s in sinks: - if not isinstance(s, os.PathLike): - raise ValueError("All sinks must be of the same type!") - paths.push_back( os.path.expanduser(s).encode()) - self.c_obj = sink_info(move(paths)) else: - raise TypeError("Unrecognized input type: {}".format(type(sinks[0]))) + # we don't have sinks so we must have paths to sinks + self.c_obj = sink_info(paths) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index d5693750002..f8bfe340ae5 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -145,24 +145,24 @@ def is_fixed_width(plc_dtype: plc.DataType): ) -def nesting(typ) -> tuple[int, int]: +def nesting_level(typ) -> tuple[int, int]: """Return list and struct nesting of a pyarrow type.""" if isinstance(typ, pa.ListType): - list_, struct = nesting(typ.value_type) + list_, struct = nesting_level(typ.value_type) return list_ + 1, struct elif isinstance(typ, pa.StructType): - lists, structs = map(max, zip(*(nesting(t.type) for t in typ))) + lists, structs = map(max, zip(*(nesting_level(t.type) for t in typ))) return lists, structs + 1 else: return 0, 0 def is_nested_struct(typ): - return nesting(typ)[1] > 1 + return nesting_level(typ)[1] > 1 def is_nested_list(typ): - return nesting(typ)[0] > 1 + return nesting_level(typ)[0] > 1 def sink_to_str(sink): diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index ee2debed579..e4760ea7ac8 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -52,7 +52,7 @@ def table_data(request): """ nrows = request.param - table_dict = dict() + table_dict = {} # Colnames in the format expected by # plc.io.TableWithMetadata colnames = [] @@ -63,55 +63,36 @@ def table_data(request): rand_vals = np.random.randint(0, nrows, nrows) child_colnames = [] - if isinstance(typ, pa.ListType): - - def _generate_list_data(typ): - child_colnames = [] - if isinstance(typ, pa.ListType): - # recurse to get vals - rand_arrs, grandchild_colnames = _generate_list_data( - typ.value_type - ) - pa_array = pa.array( - [list(row_vals) for row_vals in zip(rand_arrs)], - type=typ, - ) - child_colnames.append(("", grandchild_colnames)) - else: - # typ is scalar type - pa_array = pa.array(rand_vals).cast(typ) - child_colnames.append(("", [])) - return pa_array, child_colnames - - rand_arr, child_colnames = _generate_list_data(typ) - elif isinstance(typ, pa.StructType): - - def _generate_struct_data(typ): - child_colnames = [] - if isinstance(typ, pa.StructType): - # recurse to get vals - rand_arrs = [] - for i in range(typ.num_fields): - rand_arr, grandchild_colnames = _generate_struct_data( - typ.field(i).type - ) - rand_arrs.append(rand_arr) - child_colnames.append( - (typ.field(i).name, grandchild_colnames) - ) - - pa_array = pa.StructArray.from_arrays( - [rand_arr for rand_arr in rand_arrs], - names=[ - typ.field(i).name for i in range(typ.num_fields) - ], - ) - else: - # typ is scalar type - pa_array = pa.array(rand_vals).cast(typ) - return pa_array, child_colnames - - rand_arr, child_colnames = _generate_struct_data(typ) + def _generate_nested_data(typ): + child_colnames = [] + + # recurse to get vals for children + rand_arrs = [] + for i in range(typ.num_fields): + rand_arr, grandchild_colnames = _generate_nested_data( + typ.field(i).type + ) + rand_arrs.append(rand_arr) + child_colnames.append((typ.field(i).name, grandchild_colnames)) + + if isinstance(typ, pa.StructType): + pa_array = pa.StructArray.from_arrays( + [rand_arr for rand_arr in rand_arrs], + names=[typ.field(i).name for i in range(typ.num_fields)], + ) + elif isinstance(typ, pa.ListType): + pa_array = pa.array( + [list(row_vals) for row_vals in zip(rand_arrs[0])], + type=typ, + ) + child_colnames.append(("", grandchild_colnames)) + else: + # typ is scalar type + pa_array = pa.array(rand_vals).cast(typ) + return pa_array, child_colnames + + if isinstance(typ, (pa.ListType, pa.StructType)): + rand_arr, child_colnames = _generate_nested_data(typ) else: rand_arr = pa.array(rand_vals).cast(typ) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py index a24424cd2a3..552baa5ee51 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py @@ -14,7 +14,7 @@ def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk): plc_table_w_meta, pa_table = table_data sink = source_or_sink - kwargs = dict() + kwargs = {} if rows_per_chunk <= plc_table_w_meta.tbl.num_rows(): kwargs["rows_per_chunk"] = rows_per_chunk