Skip to content

Commit

Permalink
orc multithreaded benchmark (#16009)
Browse files Browse the repository at this point in the history
Addresses: #15973

Adds multithreaded benchmarks for the ORC reader, based on the Parquet equivalent in #15585.

```
# Benchmark Results

## orc_multithreaded_read_decode_mixed

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |    338x | 44.348 ms | 1.18% | 44.343 ms | 1.18% |      12107185968 |       939.341 MiB |        39.557 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |     80x | 77.634 ms | 0.65% | 77.629 ms | 0.65% |      13831742649 |         1.834 GiB |        79.072 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |    341x | 43.921 ms | 1.20% | 43.916 ms | 1.20% |      12224889363 |       825.333 MiB |        39.568 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |     80x | 75.418 ms | 0.70% | 75.414 ms | 0.70% |      14237999015 |         1.611 GiB |        79.113 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |     80x | 42.682 ms | 1.18% | 42.678 ms | 1.18% |      12579566132 |       883.436 MiB |        39.587 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |      9x | 74.056 ms | 0.48% | 74.052 ms | 0.48% |      14499873867 |         1.724 GiB |        79.136 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |     25x | 42.198 ms | 0.50% | 42.194 ms | 0.49% |      12723960975 |       940.562 MiB |        39.600 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |      8x | 73.933 ms | 0.49% | 73.929 ms | 0.49% |      14524042443 |         1.781 GiB |        79.175 MiB |

## orc_multithreaded_read_decode_fixed_width

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |     13x | 40.149 ms | 0.04% | 40.144 ms | 0.04% |      13373482726 |       643.390 MiB |        59.821 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |    211x | 71.216 ms | 0.67% | 71.211 ms | 0.67% |      15078297784 |         1.257 GiB |       119.650 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |    378x | 39.662 ms | 1.31% | 39.658 ms | 1.31% |      13537590893 |       643.392 MiB |        59.833 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |    209x | 71.693 ms | 0.71% | 71.688 ms | 0.71% |      14978085376 |         1.257 GiB |       119.642 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |    377x | 39.731 ms | 1.30% | 39.726 ms | 1.30% |      13514305239 |       643.394 MiB |        59.856 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |      8x | 70.766 ms | 0.08% | 70.761 ms | 0.08% |      15174115364 |         1.030 GiB |       119.665 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |    379x | 39.486 ms | 1.27% | 39.482 ms | 1.27% |      13597888468 |       647.399 MiB |        59.928 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |    207x | 72.686 ms | 2.04% | 72.681 ms | 2.04% |      14773317833 |         1.143 GiB |       119.711 MiB |

## orc_multithreaded_read_decode_string

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |     80x | 22.933 ms | 2.13% | 22.928 ms | 2.13% |      23415352877 |       661.948 MiB |        10.879 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |    160x | 34.167 ms | 1.41% | 34.162 ms | 1.41% |      31430436877 |         1.293 GiB |        21.757 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |    560x | 22.533 ms | 2.18% | 22.528 ms | 2.18% |      23830839172 |       609.407 MiB |        10.941 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |     80x | 34.311 ms | 1.54% | 34.307 ms | 1.54% |      31298288990 |         1.188 GiB |        21.758 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |     23x | 22.179 ms | 0.11% | 22.175 ms | 0.11% |      24211151047 |       624.177 MiB |        10.947 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |     15x | 33.793 ms | 0.08% | 33.789 ms | 0.08% |      31777989791 |         1.190 GiB |        21.881 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |    679x | 22.006 ms | 1.74% | 22.002 ms | 1.74% |      24401381631 |       624.524 MiB |        10.951 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |    160x | 33.320 ms | 1.57% | 33.316 ms | 1.57% |      32229227026 |         1.207 GiB |        21.894 MiB |

## orc_multithreaded_read_decode_list

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | Samples |  CPU Time  | Noise  |  GPU Time  | Noise  | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |     96x |  74.437 ms |  0.68% |  74.433 ms |  0.68% |       7212831148 |       600.751 MiB |        60.245 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |      7x |  80.994 ms |  0.49% |  80.990 ms |  0.49% |      13257745936 |         1.173 GiB |       120.549 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |     80x |  79.234 ms |  4.57% |  79.229 ms |  4.57% |       6776190522 |       600.950 MiB |        60.250 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |    166x |  90.437 ms | 17.19% |  90.432 ms | 17.19% |      11873413959 |         1.173 GiB |       120.489 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |     80x |  78.613 ms |  2.98% |  78.608 ms |  2.98% |       6829702014 |       602.764 MiB |        60.323 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |    127x | 118.629 ms | 22.67% | 118.624 ms | 22.67% |       9051644873 |         1.174 GiB |       120.499 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |    112x | 133.950 ms |  4.45% | 133.945 ms |  4.45% |       4008135293 |       603.471 MiB |        60.353 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |     90x | 167.850 ms | 15.93% | 167.844 ms | 15.93% |       6397248426 |         1.177 GiB |       120.646 MiB |

## orc_multithreaded_read_decode_chunked_mixed

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |   671088640 |    671088640 |    333x | 45.009 ms | 1.10% | 45.005 ms | 1.10% |      11929261073 |       939.341 MiB |        39.557 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |   671088640 |    671088640 |     96x | 81.524 ms | 0.61% | 81.519 ms | 0.61% |      13171640865 |         1.834 GiB |        79.072 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |   671088640 |    671088640 |    339x | 44.183 ms | 0.96% | 44.179 ms | 0.96% |      12152252271 |       825.333 MiB |        39.568 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |   671088640 |    671088640 |      7x | 79.051 ms | 0.02% | 79.046 ms | 0.02% |      13583676002 |         1.611 GiB |        79.113 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |   671088640 |    671088640 |     12x | 43.276 ms | 0.09% | 43.272 ms | 0.09% |      12407024794 |       883.436 MiB |        39.587 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |   671088640 |    671088640 |     19x | 78.019 ms | 0.49% | 78.014 ms | 0.49% |      13763433041 |         1.724 GiB |        79.136 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |   671088640 |    671088640 |     80x | 42.803 ms | 1.22% | 42.799 ms | 1.22% |      12543864010 |       911.993 MiB |        39.600 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |   671088640 |    671088640 |    193x | 77.856 ms | 0.59% | 77.852 ms | 0.59% |      13792063986 |         1.837 GiB |        79.175 MiB |

## orc_multithreaded_read_decode_chunked_fixed_width

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |   671088640 |    671088640 |    112x | 40.497 ms | 1.23% | 40.493 ms | 1.23% |      13258480947 |       643.390 MiB |        59.821 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |   671088640 |    671088640 |      7x | 75.440 ms | 0.09% | 75.435 ms | 0.09% |      14234033611 |         1.648 GiB |       119.651 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |   671088640 |    671088640 |     80x | 39.793 ms | 1.36% | 39.789 ms | 1.36% |      13493067216 |       643.392 MiB |        59.833 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |   671088640 |    671088640 |     69x | 74.499 ms | 0.50% | 74.494 ms | 0.50% |      14413864845 |         1.336 GiB |       119.642 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |   671088640 |    671088640 |    381x | 39.273 ms | 1.11% | 39.269 ms | 1.11% |      13671742653 |       643.394 MiB |        59.856 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |   671088640 |    671088640 |    204x | 73.755 ms | 0.60% | 73.751 ms | 0.60% |      14559012350 |         1.648 GiB |       119.665 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |   671088640 |    671088640 |     80x | 39.490 ms | 1.31% | 39.486 ms | 1.31% |      13596333864 |       631.980 MiB |        59.928 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |   671088640 |    671088640 |    203x | 73.907 ms | 1.34% | 73.903 ms | 1.34% |      14529071322 |         1.454 GiB |       119.711 MiB |

## orc_multithreaded_read_decode_chunked_string

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |   671088640 |    671088640 |     80x | 23.022 ms | 1.96% | 23.017 ms | 1.96% |      23324556592 |       661.948 MiB |        10.879 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |   671088640 |    671088640 |     80x | 37.687 ms | 1.37% | 37.682 ms | 1.37% |      28494755419 |         1.659 GiB |        21.757 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |   671088640 |    671088640 |     80x | 22.703 ms | 2.30% | 22.699 ms | 2.30% |      23652118769 |       609.407 MiB |        10.941 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |   671088640 |    671088640 |     80x | 37.581 ms | 1.42% | 37.577 ms | 1.42% |      28574723179 |         1.658 GiB |        21.758 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |   671088640 |    671088640 |    544x | 22.296 ms | 1.56% | 22.293 ms | 1.56% |      24082840350 |       631.319 MiB |        10.947 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |   671088640 |    671088640 |     14x | 36.990 ms | 0.14% | 36.985 ms | 0.14% |      29031484389 |         1.554 GiB |        21.881 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |   671088640 |    671088640 |    676x | 22.114 ms | 1.22% | 22.110 ms | 1.22% |      24281965280 |       627.616 MiB |        10.951 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |   671088640 |    671088640 |     80x | 37.409 ms | 1.40% | 37.405 ms | 1.40% |      28706077426 |         1.562 GiB |        21.894 MiB |

## orc_multithreaded_read_decode_chunked_list

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples |  CPU Time  | Noise  |  GPU Time  | Noise  | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |   671088640 |    671088640 |     80x |  74.780 ms |  0.67% |  74.776 ms |  0.67% |       7179747067 |       600.751 MiB |        60.245 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |   671088640 |    671088640 |    175x |  86.040 ms |  0.56% |  86.035 ms |  0.56% |      12480222210 |         1.576 GiB |       120.549 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |   671088640 |    671088640 |    186x |  80.668 ms |  4.14% |  80.664 ms |  4.14% |       6655685080 |       600.951 MiB |        60.250 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |   671088640 |    671088640 |    143x | 105.217 ms | 21.56% | 105.212 ms | 21.56% |      10205531345 |         1.576 GiB |       120.489 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |   671088640 |    671088640 |    128x |  80.087 ms |  3.05% |  80.082 ms |  3.05% |       6704042147 |       602.764 MiB |        60.323 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |   671088640 |    671088640 |    135x | 111.556 ms | 21.88% | 111.551 ms | 21.88% |       9625546746 |         1.489 GiB |       120.499 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |   671088640 |    671088640 |    112x | 134.677 ms |  4.14% | 134.672 ms |  4.14% |       3986513604 |       603.471 MiB |        60.353 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |   671088640 |    671088640 |     80x | 178.735 ms | 14.17% | 178.730 ms | 14.17% |       6007630497 |         1.520 GiB |       120.646 MiB |

```

Authors:
  - Zach Puller (https://github.com/zpuller)
  - Vukasin Milovanovic (https://github.com/vuule)
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - MithunR (https://github.com/mythrocks)

URL: #16009
  • Loading branch information
zpuller authored Jun 14, 2024
1 parent 829b3a9 commit 34227d3
Show file tree
Hide file tree
Showing 2 changed files with 340 additions and 0 deletions.
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,11 @@ ConfigureNVBench(PARQUET_MULTITHREAD_READER_NVBENCH io/parquet/parquet_reader_mu
# * orc reader benchmark --------------------------------------------------------------------------
ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp)

# ##################################################################################################
# * orc multithreaded benchmark
# --------------------------------------------------------------------------
# Single-source target: all benchmarks are defined in io/orc/orc_reader_multithreaded.cpp.
ConfigureNVBench(ORC_MULTITHREADED_NVBENCH io/orc/orc_reader_multithreaded.cpp)

# ##################################################################################################
# * csv reader benchmark --------------------------------------------------------------------------
ConfigureNVBench(CSV_READER_NVBENCH io/csv/csv_reader_input.cpp io/csv/csv_reader_options.cpp)
Expand Down
335 changes: 335 additions & 0 deletions cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,335 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/io/cuio_common.hpp>
#include <benchmarks/io/nvbench_helpers.hpp>

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/stream_pool.hpp>
#include <cudf/io/orc.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/pinned_memory.hpp>
#include <cudf/utilities/thread_pool.hpp>

#include <nvbench/nvbench.cuh>

#include <vector>

/**
 * @brief Reads the "num_threads" axis of the current benchmark configuration.
 *
 * @param state nvbench state holding the axis values
 * @return Value of the "num_threads" axis, converted to size_t
 */
size_t get_num_read_threads(nvbench::state const& state)
{
  auto const thread_count = state.get_int64("num_threads");
  return thread_count;
}

/**
 * @brief Computes the amount of data assigned to each read.
 *
 * The "total_data_size" axis is split evenly across the number of read threads
 * (integer division).
 *
 * @param state nvbench state holding the axis values
 * @return "total_data_size" divided by the number of read threads
 */
size_t get_read_size(nvbench::state const& state)
{
  auto const total_bytes = state.get_int64("total_data_size");
  return total_bytes / get_num_read_threads(state);
}

/**
 * @brief Builds a human-readable label describing the benchmark configuration.
 *
 * The label has the form
 * "<test_name>, <num_cols> columns, <num_threads> threads  (<size> MB each)",
 * where <size> is the per-read data size in units of 1024 * 1024 bytes.
 *
 * @param test_name Short name of the benchmark variant
 * @param state nvbench state holding the axis values
 * @return The assembled label
 */
std::string get_label(std::string const& test_name, nvbench::state const& state)
{
  size_t const mb_per_read = get_read_size(state) / (1024 * 1024);
  auto const column_count  = state.get_int64("num_cols");

  std::string label = test_name;
  label += ", ";
  label += std::to_string(column_count);
  label += " columns, ";
  label += std::to_string(get_num_read_threads(state));
  label += " threads ";
  label += " (";
  label += std::to_string(mb_per_read);
  label += " MB each)";
  return label;
}

/**
 * @brief Generates the benchmark input: one SNAPPY-compressed ORC file per read thread.
 *
 * For each file a random table is created from `d_types` (cycled to "num_cols" columns)
 * using the "cardinality" and "run_length" axes, then written with `cudf::io::write_orc`
 * into a `cuio_source_sink_pair` constructed with `io_type::HOST_BUFFER`.
 *
 * @param state nvbench state holding the axis values
 * @param d_types Column types to cycle through when generating each table
 * @return Tuple of (source/sink pairs holding the written files, sum of the written file
 * sizes as reported by `cuio_source_sink_pair::size()`, number of files written)
 */
std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
  nvbench::state& state, std::vector<cudf::type_id> const& d_types)
{
  auto const cardinality          = state.get_int64("cardinality");
  auto const run_length           = state.get_int64("run_length");
  auto const num_cols             = state.get_int64("num_cols");
  size_t const num_files          = get_num_read_threads(state);
  size_t const per_file_data_size = get_read_size(state);

  std::vector<cuio_source_sink_pair> source_sink_vector;
  // The number of files is known up front, so allocate once instead of growing (and moving
  // the already-written source/sink pairs) on every push_back.
  source_sink_vector.reserve(num_files);

  size_t total_file_size = 0;

  for (size_t i = 0; i < num_files; ++i) {
    cuio_source_sink_pair source_sink{io_type::HOST_BUFFER};

    auto const tbl = create_random_table(
      cycle_dtypes(d_types, num_cols),
      table_size_bytes{per_file_data_size},
      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
    auto const view = tbl->view();

    cudf::io::orc_writer_options const write_opts =
      cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view)
        .compression(cudf::io::compression_type::SNAPPY);

    cudf::io::write_orc(write_opts);
    // Accumulate the encoded size so callers can report it alongside the timings
    total_file_size += source_sink.size();

    source_sink_vector.push_back(std::move(source_sink));
  }

  return {std::move(source_sink_vector), total_file_size, num_files};
}

/**
 * @brief Shared body of the non-chunked multithreaded ORC read benchmarks.
 *
 * Writes one ORC file per thread, then times reading all of the files with
 * `cudf::io::read_orc`, one read task per file submitted to a thread pool.
 * After the run it reports "bytes_per_second", "peak_memory_usage" and
 * "encoded_file_size" on the nvbench state.
 *
 * @param state nvbench state holding the axis values and receiving the results
 * @param d_types Column types used to generate the input files
 * @param label Label used for the profiling range around the timed section
 */
void BM_orc_multithreaded_read_common(nvbench::state& state,
std::vector<cudf::type_id> const& d_types,
std::string const& label)
{
auto const data_size = state.get_int64("total_data_size");
auto const num_threads = state.get_int64("num_threads");

// Request num_threads streams from the default stream and a pool with as many workers
auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
cudf::detail::thread_pool threads(num_threads);

// Input generation happens here, outside of the timed region below
auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
std::vector<cudf::io::source_info> source_info_vector;
std::transform(source_sink_vector.begin(),
source_sink_vector.end(),
std::back_inserter(source_info_vector),
[](auto& source_sink) { return source_sink.make_source_info(); });

// Created after the input files are written and before the reads start
auto mem_stats_logger = cudf::memory_stats_logger();

{
cudf::scoped_range range{("(read) " + label).c_str()};
state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
[&](nvbench::launch& launch, auto& timer) {
// Reads file `index` in full on one of the forked streams; the result is discarded
auto read_func = [&](int index) {
auto const stream = streams[index % num_threads];
cudf::io::orc_reader_options read_opts =
cudf::io::orc_reader_options::builder(source_info_vector[index]);
cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource());
};

// Queue every task while the pool is paused and only unpause after the timer has started
// (presumably so that task submission is not part of the measurement - confirm against
// the thread_pool implementation)
threads.paused = true;
for (size_t i = 0; i < num_files; ++i) {
threads.submit(read_func, i);
}
timer.start();
threads.paused = false;
threads.wait_for_tasks();
// Join the forked streams back into the default stream before stopping the timer
cudf::detail::join_streams(streams, cudf::get_default_stream());
timer.stop();
});
}

// Derive throughput from the total (decoded) data size and the "nv/cold/time/gpu/mean" summary
auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
state.add_buffer_size(
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
}

/**
 * @brief Multithreaded ORC read benchmark over INT32, DECIMAL64 and STRING columns.
 *
 * @param state nvbench state holding the axis values and receiving the results
 */
void BM_orc_multithreaded_read_mixed(nvbench::state& state)
{
  std::vector<cudf::type_id> const column_types{
    cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING};
  std::string const range_label = get_label("mixed", state);

  // Range spanning the whole benchmark body, input generation included
  cudf::scoped_range range{range_label.c_str()};
  BM_orc_multithreaded_read_common(state, column_types, range_label);
}

/**
 * @brief Multithreaded ORC read benchmark over INT32 columns only.
 *
 * @param state nvbench state holding the axis values and receiving the results
 */
void BM_orc_multithreaded_read_fixed_width(nvbench::state& state)
{
  std::vector<cudf::type_id> const column_types{cudf::type_id::INT32};
  std::string const range_label = get_label("fixed width", state);

  // Range spanning the whole benchmark body, input generation included
  cudf::scoped_range range{range_label.c_str()};
  BM_orc_multithreaded_read_common(state, column_types, range_label);
}

/**
 * @brief Multithreaded ORC read benchmark over STRING columns only.
 *
 * @param state nvbench state holding the axis values and receiving the results
 */
void BM_orc_multithreaded_read_string(nvbench::state& state)
{
  std::vector<cudf::type_id> const column_types{cudf::type_id::STRING};
  std::string const range_label = get_label("string", state);

  // Range spanning the whole benchmark body, input generation included
  cudf::scoped_range range{range_label.c_str()};
  BM_orc_multithreaded_read_common(state, column_types, range_label);
}

/**
 * @brief Multithreaded ORC read benchmark over LIST columns only.
 *
 * @param state nvbench state holding the axis values and receiving the results
 */
void BM_orc_multithreaded_read_list(nvbench::state& state)
{
  std::vector<cudf::type_id> const column_types{cudf::type_id::LIST};
  std::string const range_label = get_label("list", state);

  // Range spanning the whole benchmark body, input generation included
  cudf::scoped_range range{range_label.c_str()};
  BM_orc_multithreaded_read_common(state, column_types, range_label);
}

/**
 * @brief Shared body of the chunked multithreaded ORC read benchmarks.
 *
 * Writes one ORC file per thread, then times reading every file chunk by chunk with
 * `cudf::io::chunked_orc_reader`, one read task per file submitted to a thread pool.
 * The "output_limit" and "input_limit" axes are divided by the number of threads before
 * being handed to each reader. After the run it reports "bytes_per_second",
 * "peak_memory_usage" and "encoded_file_size" on the nvbench state.
 *
 * @param state nvbench state holding the axis values and receiving the results
 * @param d_types Column types used to generate the input files
 * @param label Label used for the profiling range around the timed section
 */
void BM_orc_multithreaded_read_chunked_common(nvbench::state& state,
                                              std::vector<cudf::type_id> const& d_types,
                                              std::string const& label)
{
  size_t const data_size    = state.get_int64("total_data_size");
  auto const num_threads    = state.get_int64("num_threads");
  size_t const input_limit  = state.get_int64("input_limit");
  size_t const output_limit = state.get_int64("output_limit");

  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
  cudf::detail::thread_pool threads(num_threads);
  // Input generation happens here, outside of the timed region below
  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
  std::vector<cudf::io::source_info> source_info_vector;
  std::transform(source_sink_vector.begin(),
                 source_sink_vector.end(),
                 std::back_inserter(source_info_vector),
                 [](auto& source_sink) { return source_sink.make_source_info(); });

  auto mem_stats_logger = cudf::memory_stats_logger();

  {
    cudf::scoped_range range{("(read) " + label).c_str()};
    state.exec(
      nvbench::exec_tag::sync | nvbench::exec_tag::timer,
      [&](nvbench::launch& launch, auto& timer) {
        auto read_func = [&](int index) {
          auto const stream = streams[index % num_threads];
          cudf::io::orc_reader_options read_opts =
            cudf::io::orc_reader_options::builder(source_info_vector[index]);
          // Divide the chunk limits by the number of threads so that the number of chunks
          // produced is the same for all cases. This seems better than the alternative,
          // which is to keep the limits the same: if we did that, the number of chunks
          // would go down as the number of threads goes up, and it is unclear whether we
          // would still be benchmarking the same thing.
          auto reader = cudf::io::chunked_orc_reader(
            output_limit / num_threads, input_limit / num_threads, read_opts, stream);

          // Read all the chunks; each chunk is dropped as soon as it has been produced
          do {
            auto table = reader.read_chunk();
          } while (reader.has_next());
        };

        // Queue every task while the pool is paused and only unpause after the timer has
        // started
        threads.paused = true;
        for (size_t i = 0; i < num_files; ++i) {
          threads.submit(read_func, i);
        }
        timer.start();
        threads.paused = false;
        threads.wait_for_tasks();
        // Join the forked streams back into the default stream before stopping the timer
        cudf::detail::join_streams(streams, cudf::get_default_stream());
        timer.stop();
      });
  }

  // Derive throughput from the total (decoded) data size and the "nv/cold/time/gpu/mean"
  // summary
  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
  state.add_buffer_size(
    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
  state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
}

/**
 * @brief Chunked multithreaded ORC read benchmark over INT32, DECIMAL64 and STRING columns.
 *
 * @param state nvbench state holding the axis values and receiving the results
 */
void BM_orc_multithreaded_read_chunked_mixed(nvbench::state& state)
{
  std::vector<cudf::type_id> const column_types{
    cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING};
  std::string const range_label = get_label("mixed", state);

  // Range spanning the whole benchmark body, input generation included
  cudf::scoped_range range{range_label.c_str()};
  BM_orc_multithreaded_read_chunked_common(state, column_types, range_label);
}

/**
 * @brief Chunked multithreaded ORC read benchmark over INT32 columns only.
 *
 * @param state nvbench state holding the axis values and receiving the results
 */
void BM_orc_multithreaded_read_chunked_fixed_width(nvbench::state& state)
{
  std::vector<cudf::type_id> const column_types{cudf::type_id::INT32};
  std::string const range_label = get_label("fixed width", state);

  // Range spanning the whole benchmark body, input generation included
  cudf::scoped_range range{range_label.c_str()};
  BM_orc_multithreaded_read_chunked_common(state, column_types, range_label);
}

/**
 * @brief Chunked multithreaded ORC read benchmark over STRING columns only.
 *
 * @param state nvbench state holding the axis values and receiving the results
 */
void BM_orc_multithreaded_read_chunked_string(nvbench::state& state)
{
  std::vector<cudf::type_id> const column_types{cudf::type_id::STRING};
  std::string const range_label = get_label("string", state);

  // Range spanning the whole benchmark body, input generation included
  cudf::scoped_range range{range_label.c_str()};
  BM_orc_multithreaded_read_chunked_common(state, column_types, range_label);
}

/**
 * @brief Chunked multithreaded ORC read benchmark over LIST columns only.
 *
 * @param state nvbench state holding the axis values and receiving the results
 */
void BM_orc_multithreaded_read_chunked_list(nvbench::state& state)
{
  std::vector<cudf::type_id> const column_types{cudf::type_id::LIST};
  std::string const range_label = get_label("list", state);

  // Range spanning the whole benchmark body, input generation included
  cudf::scoped_range range{range_label.c_str()};
  BM_orc_multithreaded_read_chunked_common(state, column_types, range_label);
}
auto const thread_range = std::vector<nvbench::int64_t>{1, 2, 4, 8};
auto const total_data_size = std::vector<nvbench::int64_t>{512 * 1024 * 1024, 1024 * 1024 * 1024};

// Non-chunked read, mixed data types (INT32, DECIMAL64 and STRING columns).
// Sweeps total_data_size x num_threads; every other axis has a single value.
NVBENCH_BENCH(BM_orc_multithreaded_read_mixed)
.set_name("orc_multithreaded_read_decode_mixed")
.set_min_samples(4)
.add_int64_axis("cardinality", {1000})
.add_int64_axis("total_data_size", total_data_size)
.add_int64_axis("num_threads", thread_range)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8});

// Non-chunked read, fixed-width data only (INT32 columns).
// Sweeps total_data_size x num_threads; every other axis has a single value.
NVBENCH_BENCH(BM_orc_multithreaded_read_fixed_width)
.set_name("orc_multithreaded_read_decode_fixed_width")
.set_min_samples(4)
.add_int64_axis("cardinality", {1000})
.add_int64_axis("total_data_size", total_data_size)
.add_int64_axis("num_threads", thread_range)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8});

// Non-chunked read, STRING columns only.
// Sweeps total_data_size x num_threads; every other axis has a single value.
NVBENCH_BENCH(BM_orc_multithreaded_read_string)
.set_name("orc_multithreaded_read_decode_string")
.set_min_samples(4)
.add_int64_axis("cardinality", {1000})
.add_int64_axis("total_data_size", total_data_size)
.add_int64_axis("num_threads", thread_range)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8});

// Non-chunked read, LIST columns only.
// Sweeps total_data_size x num_threads; every other axis has a single value.
NVBENCH_BENCH(BM_orc_multithreaded_read_list)
.set_name("orc_multithreaded_read_decode_list")
.set_min_samples(4)
.add_int64_axis("cardinality", {1000})
.add_int64_axis("total_data_size", total_data_size)
.add_int64_axis("num_threads", thread_range)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8});

// Chunked read, mixed data types (INT32, DECIMAL64 and STRING columns).
// input_limit and output_limit are both 640 MiB; the benchmark body divides each by
// num_threads before constructing the per-file chunked reader.
NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_mixed)
.set_name("orc_multithreaded_read_decode_chunked_mixed")
.set_min_samples(4)
.add_int64_axis("cardinality", {1000})
.add_int64_axis("total_data_size", total_data_size)
.add_int64_axis("num_threads", thread_range)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8})
.add_int64_axis("input_limit", {640 * 1024 * 1024})
.add_int64_axis("output_limit", {640 * 1024 * 1024});

// Chunked read, fixed-width data only (INT32 columns).
// input_limit and output_limit are both 640 MiB; the benchmark body divides each by
// num_threads before constructing the per-file chunked reader.
NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_fixed_width)
.set_name("orc_multithreaded_read_decode_chunked_fixed_width")
.set_min_samples(4)
.add_int64_axis("cardinality", {1000})
.add_int64_axis("total_data_size", total_data_size)
.add_int64_axis("num_threads", thread_range)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8})
.add_int64_axis("input_limit", {640 * 1024 * 1024})
.add_int64_axis("output_limit", {640 * 1024 * 1024});

// Chunked read, STRING columns only.
// input_limit and output_limit are both 640 MiB; the benchmark body divides each by
// num_threads before constructing the per-file chunked reader.
NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_string)
.set_name("orc_multithreaded_read_decode_chunked_string")
.set_min_samples(4)
.add_int64_axis("cardinality", {1000})
.add_int64_axis("total_data_size", total_data_size)
.add_int64_axis("num_threads", thread_range)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8})
.add_int64_axis("input_limit", {640 * 1024 * 1024})
.add_int64_axis("output_limit", {640 * 1024 * 1024});

// Chunked read, LIST columns only.
// input_limit and output_limit are both 640 MiB; the benchmark body divides each by
// num_threads before constructing the per-file chunked reader.
NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_list)
.set_name("orc_multithreaded_read_decode_chunked_list")
.set_min_samples(4)
.add_int64_axis("cardinality", {1000})
.add_int64_axis("total_data_size", total_data_size)
.add_int64_axis("num_threads", thread_range)
.add_int64_axis("num_cols", {4})
.add_int64_axis("run_length", {8})
.add_int64_axis("input_limit", {640 * 1024 * 1024})
.add_int64_axis("output_limit", {640 * 1024 * 1024});

0 comments on commit 34227d3

Please sign in to comment.