diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index c64296eb7da..2cbdf76030b 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -375,13 +375,19 @@ cdef class ORCWriter: cdef object index cdef table_input_metadata tbl_meta cdef object cols_as_map_type + cdef object stripe_size_bytes + cdef object stripe_size_rows + cdef object row_index_stride def __cinit__(self, object path, object index=None, object compression="snappy", object statistics="ROWGROUP", - object cols_as_map_type=None): + object cols_as_map_type=None, + object stripe_size_bytes=None, + object stripe_size_rows=None, + object row_index_stride=None): self.sink = make_sink_info(path, self._data_sink) self.stat_freq = _get_orc_stat_freq(statistics) @@ -389,6 +395,9 @@ cdef class ORCWriter: self.index = index self.cols_as_map_type = cols_as_map_type \ if cols_as_map_type is None else set(cols_as_map_type) + self.stripe_size_bytes = stripe_size_bytes + self.stripe_size_rows = stripe_size_rows + self.row_index_stride = row_index_stride self.initialized = False def write_table(self, table): @@ -456,9 +465,7 @@ cdef class ORCWriter: pandas_metadata = generate_pandas_metadata(table, self.index) user_data[str.encode("pandas")] = str.encode(pandas_metadata) - cdef chunked_orc_writer_options args - with nogil: - args = move( + cdef chunked_orc_writer_options c_opts = move( chunked_orc_writer_options.builder(self.sink) .metadata(self.tbl_meta) .key_value_metadata(move(user_data)) @@ -466,7 +473,15 @@ cdef class ORCWriter: .enable_statistics(self.stat_freq) .build() ) - self.writer.reset(new orc_chunked_writer(args)) + if self.stripe_size_bytes is not None: + c_opts.set_stripe_size_bytes(self.stripe_size_bytes) + if self.stripe_size_rows is not None: + c_opts.set_stripe_size_rows(self.stripe_size_rows) + if self.row_index_stride is not None: + c_opts.set_row_index_stride(self.row_index_stride) + + with nogil: + self.writer.reset(new orc_chunked_writer(c_opts)) self.initialized = True diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 4630b6eef0a..6b7f86098a0 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1911,3 +1911,25 @@ def test_orc_reader_empty_deeply_nested_level(datadir): got = cudf.read_orc(path) assert_eq(expect, got) + + +def test_orc_chunked_writer_stripe_size(datadir): + from pyarrow import orc + + df = cudf.DataFrame({"col": gen_rand_series("int", 100000)}) + + buffer = BytesIO() + writer = ORCWriter(buffer, stripe_size_bytes=64 * 1024) + writer.write_table(df) + writer.close() + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 10) + + buffer = BytesIO() + writer = ORCWriter(buffer, stripe_size_rows=20000) + writer.write_table(df) + writer.close() + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 5)