Skip to content

Commit

Permalink
Add row index and stripe size options to Python ORC chunked writer (#…
Browse files Browse the repository at this point in the history
…14785)

Adds the APIs that control the stripe/row group size when using the chunked writer. These options are already available in to_orc (the non-chunked version of the same API).

These options are added to facilitate smaller unit tests.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #14785
  • Loading branch information
vuule authored Jan 24, 2024
1 parent 807318b commit 258d9ee
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 5 deletions.
25 changes: 20 additions & 5 deletions python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -375,20 +375,29 @@ cdef class ORCWriter:
cdef object index
cdef table_input_metadata tbl_meta
cdef object cols_as_map_type
cdef object stripe_size_bytes
cdef object stripe_size_rows
cdef object row_index_stride

# Record the writer configuration on the instance and mark the writer as
# not yet initialized.
#
# Parameters, as used in the body below:
#   path: passed to make_sink_info (with self._data_sink) to build the sink.
#   index: stored unchanged.
#   compression: translated by _get_comp_type.
#   statistics: translated by _get_orc_stat_freq.
#   cols_as_map_type: None is kept as None; anything else becomes a set.
#   stripe_size_bytes, stripe_size_rows, row_index_stride: stored unchanged.
def __cinit__(self,
object path,
object index=None,
object compression="snappy",
object statistics="ROWGROUP",
# NOTE(review): the next two lines look like the removed and added sides of
# the same diff line (old end of signature vs. new parameter) - confirm.
object cols_as_map_type=None):
object cols_as_map_type=None,
object stripe_size_bytes=None,
object stripe_size_rows=None,
object row_index_stride=None):

self.sink = make_sink_info(path, self._data_sink)
self.stat_freq = _get_orc_stat_freq(statistics)
self.comp_type = _get_comp_type(compression)
self.index = index
# None stays None; any other value is converted to a set.
self.cols_as_map_type = cols_as_map_type \
if cols_as_map_type is None else set(cols_as_map_type)
# Kept as given. Where the chunked writer options are built, a None value
# means the matching set_* call is skipped.
self.stripe_size_bytes = stripe_size_bytes
self.stripe_size_rows = stripe_size_rows
self.row_index_stride = row_index_stride
# No table has been written yet.
self.initialized = False

def write_table(self, table):
Expand Down Expand Up @@ -456,17 +465,23 @@ cdef class ORCWriter:
pandas_metadata = generate_pandas_metadata(table, self.index)
user_data[str.encode("pandas")] = str.encode(pandas_metadata)

cdef chunked_orc_writer_options args
with nogil:
args = move(
cdef chunked_orc_writer_options c_opts = move(
chunked_orc_writer_options.builder(self.sink)
.metadata(self.tbl_meta)
.key_value_metadata(move(user_data))
.compression(self.comp_type)
.enable_statistics(self.stat_freq)
.build()
)
self.writer.reset(new orc_chunked_writer(args))
if self.stripe_size_bytes is not None:
c_opts.set_stripe_size_bytes(self.stripe_size_bytes)
if self.stripe_size_rows is not None:
c_opts.set_stripe_size_rows(self.stripe_size_rows)
if self.row_index_stride is not None:
c_opts.set_row_index_stride(self.row_index_stride)

with nogil:
self.writer.reset(new orc_chunked_writer(c_opts))

self.initialized = True

Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1911,3 +1911,25 @@ def test_orc_reader_empty_deeply_nested_level(datadir):
got = cudf.read_orc(path)

assert_eq(expect, got)


def test_orc_chunked_writer_stripe_size(datadir):
    """Check the stripe count produced by the chunked ORC writer.

    A single 100000-row integer column is written twice: once with a
    64 KiB stripe byte limit (10 stripes expected) and once with a
    20000-row stripe limit (5 stripes expected). The output is read back
    with pyarrow to count the stripes.
    """
    from pyarrow import orc

    df = cudf.DataFrame({"col": gen_rand_series("int", 100000)})

    # Each case: (keyword arguments for ORCWriter, expected stripe count).
    cases = (
        ({"stripe_size_bytes": 64 * 1024}, 10),
        ({"stripe_size_rows": 20000}, 5),
    )

    for writer_kwargs, expected_stripes in cases:
        # A fresh in-memory sink per case so the outputs do not mix.
        sink = BytesIO()
        chunked_writer = ORCWriter(sink, **writer_kwargs)
        chunked_writer.write_table(df)
        chunked_writer.close()

        written_file = orc.ORCFile(sink)
        assert_eq(written_file.nstripes, expected_stripes)

0 comments on commit 258d9ee

Please sign in to comment.