Skip to content

Commit

Permalink
Move function
Browse files Browse the repository at this point in the history
Signed-off-by: Nghia Truong <nghiat@nvidia.com>
  • Loading branch information
ttnghia committed Jul 17, 2024
1 parent e4d1352 commit 750fcca
Showing 1 changed file with 37 additions and 37 deletions.
74 changes: 37 additions & 37 deletions src/main/cpp/src/get_json_object.cu
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,43 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL
}
}

void launch_kernel(bool exec_row_per_thread,
cudf::column_device_view const& input,
cudf::device_span<json_path_query_data> query_data,
rmm::cuda_stream_view stream)
{
auto const get_SM_count = []() {
int device_id{};
cudaDeviceProp props{};
CUDF_CUDA_TRY(cudaGetDevice(&device_id));
CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, device_id));
return props.multiProcessorCount;
};

// We explicitly set the minBlocksPerMultiprocessor parameter in the launch bounds to avoid
// spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between
// the maximum number of registers used by a kernel and the parallelism of the kernel.
// If lots of registers are used the parallelism may suffer. But in our case
// NVCC gets this wrong and we want to avoid spilling all the time or else
// the performance is really bad. This essentially tells NVCC to prefer using lots
// of registers over spilling.
if (exec_row_per_thread) {
constexpr int block_size = 256;
constexpr int min_block_per_sm = 1;
constexpr int block_count_multiplier = 1;
static auto const num_blocks = get_SM_count() * block_count_multiplier;
get_json_object_kernel_row_per_thread<block_size, min_block_per_sm>
<<<num_blocks, block_size, 0, stream.value()>>>(input, query_data);
} else {
constexpr int block_size = 512;
constexpr int min_block_per_sm = 2;
constexpr int block_count_multiplier = 8;
static auto const num_blocks = get_SM_count() * block_count_multiplier;
get_json_object_kernel_row_per_warp<block_size, min_block_per_sm>
<<<num_blocks, block_size, 0, stream.value()>>>(input, query_data);
}
}

std::tuple<std::vector<rmm::device_uvector<path_instruction>>,
std::unique_ptr<std::vector<std::vector<path_instruction>>>,
cudf::string_scalar,
Expand Down Expand Up @@ -962,43 +999,6 @@ construct_path_commands(
std::move(h_inst_names)};
}

void launch_kernel(bool exec_row_per_thread,
cudf::column_device_view const& input,
cudf::device_span<json_path_query_data> query_data,
rmm::cuda_stream_view stream)
{
auto const get_SM_count = []() {
int device_id{};
cudaDeviceProp props{};
CUDF_CUDA_TRY(cudaGetDevice(&device_id));
CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, device_id));
return props.multiProcessorCount;
};

// We explicitly set the minBlocksPerMultiprocessor parameter in the launch bounds to avoid
// spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between
// the maximum number of registers used by a kernel and the parallelism of the kernel.
// If lots of registers are used the parallelism may suffer. But in our case
// NVCC gets this wrong and we want to avoid spilling all the time or else
// the performance is really bad. This essentially tells NVCC to prefer using lots
// of registers over spilling.
if (exec_row_per_thread) {
constexpr int block_size = 256;
constexpr int min_block_per_sm = 1;
constexpr int block_count_multiplier = 1;
static auto const num_blocks = get_SM_count() * block_count_multiplier;
get_json_object_kernel_row_per_thread<block_size, min_block_per_sm>
<<<num_blocks, block_size, 0, stream.value()>>>(input, query_data);
} else {
constexpr int block_size = 512;
constexpr int min_block_per_sm = 2;
constexpr int block_count_multiplier = 8;
static auto const num_blocks = get_SM_count() * block_count_multiplier;
get_json_object_kernel_row_per_warp<block_size, min_block_per_sm>
<<<num_blocks, block_size, 0, stream.value()>>>(input, query_data);
}
}

std::vector<std::unique_ptr<cudf::column>> get_json_object(
cudf::strings_column_view const& input,
std::vector<std::vector<std::tuple<path_instruction_type, std::string, int64_t>>> const&
Expand Down

0 comments on commit 750fcca

Please sign in to comment.