Skip to content

Commit

Permalink
Convert compression and io to string axis type in IO benchmarks (#14347)
Browse files Browse the repository at this point in the history
Addresses issue: [#12739](#12739)


This PR converts the compression and IO benchmark axes into string axes so that individual values can be selected via the CLI, eliminating the need for automated runs to execute every value when only some are required. Additionally, this PR introduces two new functions, `retrieve_io_type_enum` and `retrieve_compression_type_enum`, which convert the string input into the corresponding enum value used by the benchmarking functions.

IO Benchmarks:
- [x] PARQUET READER 


For example:
`./PARQUET_READER_NVBENCH -b parquet_read_io_compression --axis io_type=[HOST_BUFFER] --axis compression_type=[NONE]`

Authors:
  - Suraj Aralihalli (https://github.com/SurajAralihalli)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14347
  • Loading branch information
SurajAralihalli authored Dec 11, 2023
1 parent 759a1c8 commit 3c32e5d
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 54 deletions.
27 changes: 27 additions & 0 deletions cpp/benchmarks/io/cuio_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,30 @@ void try_drop_l3_cache()
[](auto& cmd) { return exec_cmd(cmd).empty(); }),
"Failed to execute the drop cache command");
}

// Maps the textual name of an io_type (as passed on a benchmark string axis) to its enumerator.
// The comparison is exact and case-sensitive; any unrecognized name is reported via CUDF_FAIL.
cudf::io::io_type retrieve_io_type_enum(std::string_view io_string)
{
  using cudf::io::io_type;

  // Pairing of each accepted spelling with the enumerator it denotes.
  struct named_io_type {
    std::string_view name;
    io_type value;
  };

  static constexpr named_io_type known_io_types[] = {
    {"FILEPATH", io_type::FILEPATH},
    {"HOST_BUFFER", io_type::HOST_BUFFER},
    {"DEVICE_BUFFER", io_type::DEVICE_BUFFER},
    {"VOID", io_type::VOID},
    {"USER_IMPLEMENTED", io_type::USER_IMPLEMENTED},
  };

  for (auto const& candidate : known_io_types) {
    if (io_string == candidate.name) { return candidate.value; }
  }

  // No table entry matched the requested name.
  CUDF_FAIL("Unsupported io_type.");
}

// Maps the textual name of a compression_type (as passed on a benchmark string axis) to its
// enumerator. The comparison is exact and case-sensitive; any unrecognized name is reported via
// CUDF_FAIL.
cudf::io::compression_type retrieve_compression_type_enum(std::string_view compression_string)
{
  using cudf::io::compression_type;

  // Pairing of each accepted spelling with the enumerator it denotes.
  struct named_compression {
    std::string_view name;
    compression_type value;
  };

  static constexpr named_compression known_compressions[] = {
    {"NONE", compression_type::NONE},
    {"AUTO", compression_type::AUTO},
    {"SNAPPY", compression_type::SNAPPY},
    {"GZIP", compression_type::GZIP},
    {"BZIP2", compression_type::BZIP2},
    {"BROTLI", compression_type::BROTLI},
    {"ZIP", compression_type::ZIP},
    {"XZ", compression_type::XZ},
    {"ZLIB", compression_type::ZLIB},
    {"LZ4", compression_type::LZ4},
    {"LZO", compression_type::LZO},
    {"ZSTD", compression_type::ZSTD},
  };

  for (auto const& candidate : known_compressions) {
    if (compression_string == candidate.name) { return candidate.value; }
  }

  // No table entry matched the requested name.
  CUDF_FAIL("Unsupported compression_type.");
}
24 changes: 24 additions & 0 deletions cpp/benchmarks/io/cuio_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,27 @@ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks,
* @throw cudf::logic_error if the environment variable is set and the command fails
*/
void try_drop_l3_cache();

/**
* @brief Convert a string to the corresponding io_type enum value.
*
* This function takes a string and returns the matching io_type enum value. It allows you to
* convert a string representation of an io_type into its corresponding enum value, e.g. a value
* read from a benchmark string axis.
*
* The match is exact and case-sensitive. Accepted strings are "FILEPATH", "HOST_BUFFER",
* "DEVICE_BUFFER", "VOID" and "USER_IMPLEMENTED".
*
* @param io_string The input string representing the io_type
*
* @throw cudf::logic_error if `io_string` is not one of the accepted strings
*
* @return The io_type enum value
*/
cudf::io::io_type retrieve_io_type_enum(std::string_view io_string);

/**
* @brief Convert a string to the corresponding compression_type enum value.
*
* This function takes a string and returns the matching compression_type enum value. It allows you
* to convert a string representation of a compression_type into its corresponding enum value, e.g.
* a value read from a benchmark string axis.
*
* The match is exact and case-sensitive. Accepted strings are "NONE", "AUTO", "SNAPPY", "GZIP",
* "BZIP2", "BROTLI", "ZIP", "XZ", "ZLIB", "LZ4", "LZO" and "ZSTD".
*
* @param compression_string The input string representing the compression_type
*
* @throw cudf::logic_error if `compression_string` is not one of the accepted strings
*
* @return The compression_type enum value
*/
cudf::io::compression_type retrieve_compression_type_enum(std::string_view compression_string);
91 changes: 37 additions & 54 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,34 +56,30 @@ void parquet_read_common(cudf::io::parquet_writer_options const& write_opts,
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

template <data_type DataType, cudf::io::io_type IOType>
void BM_parquet_read_data(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
template <data_type DataType>
void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
auto const compression = cudf::io::compression_type::SNAPPY;
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
auto const compression = cudf::io::compression_type::SNAPPY;

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(IOType);
cuio_source_sink_pair source_sink(source_type);
cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression);

parquet_read_common(write_opts, source_sink, state);
}

template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
void BM_parquet_read_io_compression(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
void BM_parquet_read_io_compression(nvbench::state& state)
{
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
Expand All @@ -94,10 +90,10 @@ void BM_parquet_read_io_compression(
static_cast<int32_t>(data_type::LIST),
static_cast<int32_t>(data_type::STRUCT)});

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
auto const compression = Compression;
auto const source_type = IOType;
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
auto const compression = retrieve_compression_type_enum(state.get_string("compression_type"));

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
Expand All @@ -113,17 +109,15 @@ void BM_parquet_read_io_compression(
parquet_read_common(write_opts, source_sink, state);
}

template <cudf::io::io_type IOType>
void BM_parquet_read_io_small_mixed(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<IOType>>)
void BM_parquet_read_io_small_mixed(nvbench::state& state)
{
auto const d_type =
std::pair<cudf::type_id, cudf::type_id>{cudf::type_id::STRING, cudf::type_id::INT32};

cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const num_strings = state.get_int64("num_string_cols");
auto const source_type = IOType;
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const num_strings = static_cast<cudf::size_type>(state.get_int64("num_string_cols"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));

// want 80 pages total, across 4 columns, so 20 pages per column
cudf::size_type constexpr n_col = 4;
Expand All @@ -145,24 +139,23 @@ void BM_parquet_read_io_small_mixed(nvbench::state& state,
parquet_read_common(write_opts, source_sink, state);
}

template <data_type DataType, cudf::io::io_type IOType>
void BM_parquet_read_chunks(
nvbench::state& state,
nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
template <data_type DataType>
void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
{
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
cudf::size_type const cardinality = state.get_int64("cardinality");
cudf::size_type const run_length = state.get_int64("run_length");
cudf::size_type const byte_limit = state.get_int64("byte_limit");
auto const compression = cudf::io::compression_type::SNAPPY;
auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));
auto const byte_limit = static_cast<cudf::size_type>(state.get_int64("byte_limit"));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
auto const compression = cudf::io::compression_type::SNAPPY;

auto const tbl =
create_random_table(cycle_dtypes(d_type, num_cols),
table_size_bytes{data_size},
data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
auto const view = tbl->view();

cuio_source_sink_pair source_sink(IOType);
cuio_source_sink_pair source_sink(source_type);
cudf::io::parquet_writer_options write_opts =
cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
.compression(compression);
Expand Down Expand Up @@ -202,43 +195,33 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::LIST,
data_type::STRUCT>;

using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
cudf::io::io_type::HOST_BUFFER,
cudf::io::io_type::DEVICE_BUFFER>;

using compression_list =
nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;

NVBENCH_BENCH_TYPES(BM_parquet_read_data,
NVBENCH_TYPE_AXES(d_type_list,
nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
NVBENCH_BENCH_TYPES(BM_parquet_read_data, NVBENCH_TYPE_AXES(d_type_list))
.set_name("parquet_read_decode")
.set_type_axes_names({"data_type", "io"})
.set_type_axes_names({"data_type"})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH_TYPES(BM_parquet_read_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list))
NVBENCH_BENCH(BM_parquet_read_io_compression)
.set_name("parquet_read_io_compression")
.set_type_axes_names({"io", "compression"})
.add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "DEVICE_BUFFER"})
.add_string_axis("compression_type", {"SNAPPY", "NONE"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32});

NVBENCH_BENCH_TYPES(BM_parquet_read_chunks,
NVBENCH_TYPE_AXES(d_type_list,
nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
NVBENCH_BENCH_TYPES(BM_parquet_read_chunks, NVBENCH_TYPE_AXES(d_type_list))
.set_name("parquet_read_chunks")
.set_type_axes_names({"data_type", "io"})
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("byte_limit", {0, 500'000});

NVBENCH_BENCH_TYPES(BM_parquet_read_io_small_mixed,
NVBENCH_TYPE_AXES(nvbench::enum_type_list<cudf::io::io_type::FILEPATH>))
NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
.set_name("parquet_read_io_small_mixed")
.set_type_axes_names({"io"})
.add_string_axis("io_type", {"FILEPATH"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
Expand Down

0 comments on commit 3c32e5d

Please sign in to comment.