diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index f0f695ef83..9f8696e645 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -897,6 +897,43 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL } } +void launch_kernel(bool exec_row_per_thread, + cudf::column_device_view const& input, + cudf::device_span<json_path_processing_data> query_data, + rmm::cuda_stream_view stream) +{ + auto const get_SM_count = []() { + int device_id{}; + cudaDeviceProp props{}; + CUDF_CUDA_TRY(cudaGetDevice(&device_id)); + CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, device_id)); + return props.multiProcessorCount; + }; + + // We explicitly set the minBlocksPerMultiprocessor parameter in the launch bounds to avoid + // spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between + // the maximum number of registers used by a kernel and the parallelism of the kernel. + // If lots of registers are used the parallelism may suffer. But in our case + // NVCC gets this wrong and we want to avoid spilling all the time or else + // the performance is really bad. This essentially tells NVCC to prefer using lots + // of registers over spilling.
+ if (exec_row_per_thread) { + constexpr int block_size = 256; + constexpr int min_block_per_sm = 1; + constexpr int block_count_multiplier = 1; + static auto const num_blocks = get_SM_count() * block_count_multiplier; + get_json_object_kernel_row_per_thread<block_size, min_block_per_sm> + <<<num_blocks, block_size, 0, stream.value()>>>(input, query_data); + } else { + constexpr int block_size = 512; + constexpr int min_block_per_sm = 2; + constexpr int block_count_multiplier = 8; + static auto const num_blocks = get_SM_count() * block_count_multiplier; + get_json_object_kernel_row_per_warp<block_size, min_block_per_sm> + <<<num_blocks, block_size, 0, stream.value()>>>(input, query_data); + } +} + std::tuple>, std::unique_ptr>>, cudf::string_scalar, @@ -962,43 +999,6 @@ construct_path_commands( std::move(h_inst_names)}; } -void launch_kernel(bool exec_row_per_thread, - cudf::column_device_view const& input, - cudf::device_span<json_path_processing_data> query_data, - rmm::cuda_stream_view stream) -{ - auto const get_SM_count = []() { - int device_id{}; - cudaDeviceProp props{}; - CUDF_CUDA_TRY(cudaGetDevice(&device_id)); - CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, device_id)); - return props.multiProcessorCount; - }; - - // We explicitly set the minBlocksPerMultiprocessor parameter in the launch bounds to avoid - // spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between - // the maximum number of registers used by a kernel and the parallelism of the kernel. - // If lots of registers are used the parallelism may suffer. But in our case - // NVCC gets this wrong and we want to avoid spilling all the time or else - // the performance is really bad. This essentially tells NVCC to prefer using lots - // of registers over spilling.
- if (exec_row_per_thread) { - constexpr int block_size = 256; - constexpr int min_block_per_sm = 1; - constexpr int block_count_multiplier = 1; - static auto const num_blocks = get_SM_count() * block_count_multiplier; - get_json_object_kernel_row_per_thread<block_size, min_block_per_sm> - <<<num_blocks, block_size, 0, stream.value()>>>(input, query_data); - } else { - constexpr int block_size = 512; - constexpr int min_block_per_sm = 2; - constexpr int block_count_multiplier = 8; - static auto const num_blocks = get_SM_count() * block_count_multiplier; - get_json_object_kernel_row_per_warp<block_size, min_block_per_sm> - <<<num_blocks, block_size, 0, stream.value()>>>(input, query_data); - } -} - std::vector<std::unique_ptr<cudf::column>> get_json_object( cudf::strings_column_view const& input, std::vector<std::vector<std::tuple<path_instruction_type, std::string, int32_t>>> const&