Move function

Signed-off-by: Nghia Truong <nghiat@nvidia.com>
NVIDIA · Jul 17, 2024 · 750fcca · 750fcca
1 parent e4d1352
commit 750fcca
Showing 1 changed file with 37 additions and 37 deletions.
diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu
@@ -897,6 +897,43 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL
   }
 }
 
+void launch_kernel(bool exec_row_per_thread,
+                   cudf::column_device_view const& input,
+                   cudf::device_span<json_path_query_data> query_data,
+                   rmm::cuda_stream_view stream)
+{
+  auto const get_SM_count = []() {
+    int device_id{};
+    cudaDeviceProp props{};
+    CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+    CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, device_id));
+    return props.multiProcessorCount;
+  };
+
+  // We explicitly set the minBlocksPerMultiprocessor parameter in the launch bounds to avoid
+  // spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between
+  // the maximum number of registers used by a kernel and the parallelism of the kernel.
+  // If lots of registers are used the parallelism may suffer. But in our case
+  // NVCC gets this wrong and we want to avoid spilling all the time or else
+  // the performance is really bad. This essentially tells NVCC to prefer using lots
+  // of registers over spilling.
+  if (exec_row_per_thread) {
+    constexpr int block_size             = 256;
+    constexpr int min_block_per_sm       = 1;
+    constexpr int block_count_multiplier = 1;
+    static auto const num_blocks         = get_SM_count() * block_count_multiplier;
+    get_json_object_kernel_row_per_thread<block_size, min_block_per_sm>
+      <<<num_blocks, block_size, 0, stream.value()>>>(input, query_data);
+  } else {
+    constexpr int block_size             = 512;
+    constexpr int min_block_per_sm       = 2;
+    constexpr int block_count_multiplier = 8;
+    static auto const num_blocks         = get_SM_count() * block_count_multiplier;
+    get_json_object_kernel_row_per_warp<block_size, min_block_per_sm>
+      <<<num_blocks, block_size, 0, stream.value()>>>(input, query_data);
+  }
+}
+
 std::tuple<std::vector<rmm::device_uvector<path_instruction>>,
            std::unique_ptr<std::vector<std::vector<path_instruction>>>,
            cudf::string_scalar,
@@ -962,43 +999,6 @@ construct_path_commands(
           std::move(h_inst_names)};
 }
 
-void launch_kernel(bool exec_row_per_thread,
-                   cudf::column_device_view const& input,
-                   cudf::device_span<json_path_query_data> query_data,
-                   rmm::cuda_stream_view stream)
-{
-  auto const get_SM_count = []() {
-    int device_id{};
-    cudaDeviceProp props{};
-    CUDF_CUDA_TRY(cudaGetDevice(&device_id));
-    CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, device_id));
-    return props.multiProcessorCount;
-  };
-
-  // We explicitly set the minBlocksPerMultiprocessor parameter in the launch bounds to avoid
-  // spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between
-  // the maximum number of registers used by a kernel and the parallelism of the kernel.
-  // If lots of registers are used the parallelism may suffer. But in our case
-  // NVCC gets this wrong and we want to avoid spilling all the time or else
-  // the performance is really bad. This essentially tells NVCC to prefer using lots
-  // of registers over spilling.
-  if (exec_row_per_thread) {
-    constexpr int block_size             = 256;
-    constexpr int min_block_per_sm       = 1;
-    constexpr int block_count_multiplier = 1;
-    static auto const num_blocks         = get_SM_count() * block_count_multiplier;
-    get_json_object_kernel_row_per_thread<block_size, min_block_per_sm>
-      <<<num_blocks, block_size, 0, stream.value()>>>(input, query_data);
-  } else {
-    constexpr int block_size             = 512;
-    constexpr int min_block_per_sm       = 2;
-    constexpr int block_count_multiplier = 8;
-    static auto const num_blocks         = get_SM_count() * block_count_multiplier;
-    get_json_object_kernel_row_per_warp<block_size, min_block_per_sm>
-      <<<num_blocks, block_size, 0, stream.value()>>>(input, query_data);
-  }
-}
-
 std::vector<std::unique_ptr<cudf::column>> get_json_object(
   cudf::strings_column_view const& input,
   std::vector<std::vector<std::tuple<path_instruction_type, std::string, int64_t>>> const&