Unify template parameter dispatch for FIL inference and shared memory footprint estimation #4013

Merged
Merged 38 commits on Oct 27, 2021
Changes from 16 commits
Commits
38 commits
53e0683
try 1
levsnv Apr 17, 2021
31b885f
draft of set-and-launch
levsnv May 25, 2021
589f9ad
Merge remote-tracking branch 'rapidsai/branch-21.06' into extra-share…
levsnv May 25, 2021
26480b0
set carveout and occupancy-affecting preferred cache config before ev…
levsnv May 26, 2021
a037f16
other review comments
levsnv May 26, 2021
2a1d622
DRY: rewrote in terms of dispatch_on_FIL_template_params<func, storag…
levsnv Jun 12, 2021
bd3c505
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into extra-sh…
levsnv Jun 12, 2021
5cf38d3
style, clean up diff
levsnv Jun 12, 2021
7959f4d
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into extra-sh…
levsnv Jun 14, 2021
258e674
fixed bugs and linker issues
levsnv Jun 15, 2021
e0f53ea
removed unnecessary specialization in dispatch
levsnv Jun 15, 2021
ace36c0
simplified code to template-based dispatch
levsnv Jun 24, 2021
98509b0
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into dispatch
levsnv Jun 24, 2021
bcd05bc
reverted max_shm changes
levsnv Jun 24, 2021
36b27f1
simplified templates, fixed bug
levsnv Jun 24, 2021
06650f8
remove unnecessary forward declarations
levsnv Jun 25, 2021
4dd4f8a
halfway change
levsnv Jun 29, 2021
1931928
recording template lambda-based attempt
levsnv Jun 29, 2021
ca473db
Revert "recording template lambda-based attempt"
levsnv Jun 29, 2021
971b19a
wrapped template params into a struct
levsnv Jul 2, 2021
12dfaba
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into dispatch
levsnv Jul 2, 2021
d8505df
moved runtime args to constructor, separated ::run(...), added defaul…
levsnv Jul 3, 2021
1a0014d
Merge remote-tracking branch 'rapidsai/branch-21.10' into dispatch
levsnv Sep 29, 2021
5f2458f
KernelParams::inc_*
levsnv Sep 29, 2021
1b87aa0
dispatch_on_cats_present
levsnv Sep 30, 2021
5a03998
fix several issues
levsnv Oct 7, 2021
293ec2a
Merge branch 'branch-21.12' of github.com:rapidsai/cuml into dispatch
levsnv Oct 7, 2021
dbcaa7b
extern template void dispatch_on_fil_template_params(compute_smem_foo…
levsnv Oct 8, 2021
798171f
change-by-reference into accept-and-return-by-value
levsnv Oct 9, 2021
7e78203
variable renames
levsnv Oct 9, 2021
6ae0914
unnecessary changes
levsnv Oct 9, 2021
0978fc0
finish case adjustments
levsnv Oct 9, 2021
46eb819
NextLeafAlgo
levsnv Oct 9, 2021
4614ca2
MAX_N_ITEMS
levsnv Oct 9, 2021
9528148
stray changes
levsnv Oct 9, 2021
0b6cea8
removed decltype
levsnv Oct 20, 2021
a5a6ca6
next_leaf_algo, LEAF_ALGO_INVALID->MAX_LEAF_ALGO, ...
levsnv Oct 20, 2021
2b4555f
misc
levsnv Oct 20, 2021
88 changes: 85 additions & 3 deletions cpp/src/fil/common.cuh
@@ -135,6 +135,8 @@ struct shmem_size_params {
/// n_items is how many input samples (items) any thread processes. If 0 is given,
/// choose the largest reasonable value (<= 4) that fits into shared memory. See init_n_items()
int n_items = 0;
// blockdim_x is the CUDA block size
int blockdim_x = 0;
/// shm_sz is the associated shared memory footprint
int shm_sz = INT_MAX;

@@ -146,9 +148,6 @@
? sizeof(float) * sdata_stride() * n_items << log2_threads_per_tree
: 0;
}
void compute_smem_footprint();
template <int NITEMS>
size_t get_smem_footprint();
template <int NITEMS, leaf_algo_t leaf_algo>
size_t get_smem_footprint();
};
@@ -173,6 +172,89 @@ struct predict_params : shmem_size_params {
int num_blocks;
};

namespace dispatch {

template <template <bool, leaf_algo_t, int> class Func, typename storage_type,
bool cols_in_shmem, leaf_algo_t leaf_algo, int n_items,
typename... Args>
void dispatch_on_n_items(predict_params& params, Args... args) {
ASSERT(params.n_items <= 4, "internal error: n_items > 4");
if (params.n_items == n_items) {
Func<cols_in_shmem, leaf_algo, n_items>::template run<storage_type>(
params, args...);
} else if constexpr (n_items < 4) {
dispatch_on_n_items<Func, storage_type, cols_in_shmem, leaf_algo,
n_items + 1>(params, args...);
}
}

template <template <bool, leaf_algo_t, int> class Func, typename storage_type,
bool cols_in_shmem, typename... Args>
void dispatch_on_leaf_algo(predict_params& params, Args... args) {
switch (params.leaf_algo) {
case FLOAT_UNARY_BINARY:
params.blockdim_x = FIL_TPB;
dispatch_on_n_items<Func, storage_type, cols_in_shmem, FLOAT_UNARY_BINARY,
1>(params, args...);
break;
case GROVE_PER_CLASS:
if (params.num_classes > FIL_TPB) {
params.blockdim_x = FIL_TPB;
dispatch_on_n_items<Func, storage_type, cols_in_shmem,
GROVE_PER_CLASS_MANY_CLASSES, 1>(params, args...);
} else {
params.blockdim_x = FIL_TPB - FIL_TPB % params.num_classes;
dispatch_on_n_items<Func, storage_type, cols_in_shmem,
GROVE_PER_CLASS_FEW_CLASSES, 1>(params, args...);
}
break;
case CATEGORICAL_LEAF:
params.blockdim_x = FIL_TPB;
dispatch_on_n_items<Func, storage_type, cols_in_shmem, CATEGORICAL_LEAF,
1>(params, args...);
break;
case VECTOR_LEAF:
params.blockdim_x = FIL_TPB;
dispatch_on_n_items<Func, storage_type, cols_in_shmem, VECTOR_LEAF, 1>(
params, args...);
break;
default:
ASSERT(false, "internal error: dispatch: invalid leaf_algo %d",
params.leaf_algo);
}
}

template <template <bool, leaf_algo_t, int> class Func, typename storage_type,
typename... Args>
void dispatch_on_cols_in_shmem(predict_params& params, Args... args) {
if (params.cols_in_shmem)
dispatch_on_leaf_algo<Func, storage_type, true>(params, args...);
else
dispatch_on_leaf_algo<Func, storage_type, false>(params, args...);
}

} // namespace dispatch

template <template <bool, leaf_algo_t, int> class Func, typename storage_type,
typename... Args>
void dispatch_on_fil_template_params(predict_params& params, Args... args) {
dispatch::dispatch_on_cols_in_shmem<Func, storage_type>(params, args...);
}

// All get_smem_footprint instantiations must be generated in infer.cu, where those
// methods are defined. The only way to guarantee that is to explicitly instantiate
// dispatch_on_fil_template_params<compute_smem_footprint, ...> in infer.cu. That, in
// turn, requires this struct (with at least its run method declared) to be visible
// wherever the dispatch is called, and its full definition to be visible where the
// explicit instantiation occurs. We simply define it here in common.cuh.
template <bool cols_in_shmem, leaf_algo_t leaf_algo, int n_items>
struct compute_smem_footprint {
template <typename storage_type>
static void run(predict_params& ssp) {
ssp.shm_sz = ssp.get_smem_footprint<n_items, leaf_algo>();
}
};

// infer() calls the inference kernel with the parameters on the stream
template <typename storage_type>
void infer(storage_type forest, predict_params params, cudaStream_t stream);
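
To illustrate the interface this dispatcher expects (a sketch, not code from this PR; log_chosen_params is a hypothetical name): any Func is a class template over <cols_in_shmem, leaf_algo, n_items> exposing a static run<storage_type>() method, exactly like compute_smem_footprint above. A minimal functor that merely reports which combination the dispatch chain selected might look like this, assuming the declarations from common.cuh and <cstdio>:

// hypothetical functor, for illustration only
template <bool cols_in_shmem, leaf_algo_t leaf_algo, int n_items>
struct log_chosen_params {
  template <typename storage_type>
  static void run(predict_params& params)
  {
    // cols_in_shmem, leaf_algo and n_items are compile-time constants here,
    // while blockdim_x was filled in at runtime by dispatch_on_leaf_algo
    printf("cols_in_shmem=%d leaf_algo=%d n_items=%d blockdim_x=%d\n",
           int(cols_in_shmem), int(leaf_algo), n_items, params.blockdim_x);
  }
};

// usage: dispatch_on_fil_template_params<log_chosen_params, dense_storage>(params);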
11 changes: 8 additions & 3 deletions cpp/src/fil/fil.cu
@@ -74,6 +74,9 @@ __global__ void transform_k(float* preds, size_t n, output_t output,
preds[i] = result;
}

extern template void dispatch_on_fil_template_params<
compute_smem_footprint, dense_storage>(predict_params&);

struct forest {
void init_n_items(int device) {
int max_shm_std = 48 * 1024; // 48 KiB
@@ -99,7 +102,7 @@ struct forest {
for (bool predict_proba : {false, true}) {
shmem_size_params& ssp_ = predict_proba ? proba_ssp_ : class_ssp_;
ssp_.predict_proba = predict_proba;
shmem_size_params ssp = ssp_;
predict_params ssp = ssp_;
// if n_items was not provided, try from 1 to 4. Otherwise, use as-is.
int min_n_items = ssp.n_items == 0 ? 1 : ssp.n_items;
int max_n_items = ssp.n_items == 0
@@ -109,7 +112,8 @@
ssp.cols_in_shmem = cols_in_shmem;
for (ssp.n_items = min_n_items; ssp.n_items <= max_n_items;
++ssp.n_items) {
ssp.compute_smem_footprint();
dispatch_on_fil_template_params<compute_smem_footprint,
dense_storage>(ssp);
if (ssp.shm_sz < max_shm) ssp_ = ssp;
}
}
@@ -276,7 +280,8 @@ struct forest {
global_bias != 0.0f;
break;
default:
ASSERT(false, "internal error: invalid leaf_algo_");
ASSERT(false, "internal error: predict: invalid leaf_algo %d",
params.leaf_algo);
}
} else {
if (params.leaf_algo == leaf_algo_t::FLOAT_UNARY_BINARY) {
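
The extern template declaration in fil.cu above, paired with the explicit instantiation in infer.cu below, is the standard C++ split between an explicit instantiation declaration and its definition: fil.cu can call the dispatcher without compiling its body, and the single instantiation is emitted in infer.cu, where the get_smem_footprint bodies live. A generic sketch of the pattern with hypothetical names (Footprint, dispatch) rather than the FIL types:

// dispatch.h — template definition, included by both translation units
struct Footprint { void run(); };  // stand-in for compute_smem_footprint + predict_params
template <typename T>
void dispatch(T& t) { t.run(); }

// caller.cpp (fil.cu's role) — uses the template; the extern declaration
// suppresses implicit instantiation, so the body is not compiled here
extern template void dispatch<Footprint>(Footprint&);
void use(Footprint& fp) { dispatch(fp); }

// provider.cpp (infer.cu's role) — the one explicit instantiation the linker resolves to
template void dispatch<Footprint>(Footprint&);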
124 changes: 19 additions & 105 deletions cpp/src/fil/infer.cu
@@ -750,118 +750,32 @@ size_t shmem_size_params::get_smem_footprint() {
tree_aggregator_t<NITEMS, leaf_algo>::smem_accumulate_footprint(
num_classes) +
cols_shmem_size();

return std::max(accumulate_footprint, finalize_footprint);
}

template <int NITEMS>
size_t shmem_size_params::get_smem_footprint() {
switch (leaf_algo) {
case FLOAT_UNARY_BINARY:
return get_smem_footprint<NITEMS, FLOAT_UNARY_BINARY>();
case CATEGORICAL_LEAF:
return get_smem_footprint<NITEMS, CATEGORICAL_LEAF>();
case GROVE_PER_CLASS:
if (num_classes > FIL_TPB)
return get_smem_footprint<NITEMS, GROVE_PER_CLASS_MANY_CLASSES>();
return get_smem_footprint<NITEMS, GROVE_PER_CLASS_FEW_CLASSES>();
case VECTOR_LEAF:
return get_smem_footprint<NITEMS, VECTOR_LEAF>();
default:
ASSERT(false, "internal error: unexpected leaf_algo_t");
// make sure to instantiate all possible get_smem_footprint instantiations
template void dispatch_on_fil_template_params<compute_smem_footprint,
dense_storage>(predict_params&);

template <bool cols_in_shmem, leaf_algo_t leaf_algo, int n_items>
struct infer_k_launcher {
template <typename storage_type>
static void run(predict_params& params, storage_type forest,
cudaStream_t stream) {
params.num_blocks = params.num_blocks != 0
? params.num_blocks
: raft::ceildiv(int(params.num_rows), params.n_items);
infer_k<n_items, leaf_algo, cols_in_shmem, storage_type>
<<<params.num_blocks, params.blockdim_x, params.shm_sz, stream>>>(forest,
params);
CUDA_CHECK(cudaPeekAtLastError());
}
}

void shmem_size_params::compute_smem_footprint() {
switch (n_items) {
case 1:
shm_sz = get_smem_footprint<1>();
break;
case 2:
shm_sz = get_smem_footprint<2>();
break;
case 3:
shm_sz = get_smem_footprint<3>();
break;
case 4:
shm_sz = get_smem_footprint<4>();
break;
default:
ASSERT(false, "internal error: n_items > 4");
}
}

template <leaf_algo_t leaf_algo, bool cols_in_shmem, typename storage_type>
void infer_k_nitems_launcher(storage_type forest, predict_params params,
cudaStream_t stream, int block_dim_x) {
switch (params.n_items) {
case 1:
infer_k<1, leaf_algo, cols_in_shmem>
<<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
params);
break;
case 2:
infer_k<2, leaf_algo, cols_in_shmem>
<<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
params);
break;
case 3:
infer_k<3, leaf_algo, cols_in_shmem>
<<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
params);
break;
case 4:
infer_k<4, leaf_algo, cols_in_shmem>
<<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
params);
break;
default:
ASSERT(false, "internal error: nitems > 4");
}
CUDA_CHECK(cudaPeekAtLastError());
}

template <leaf_algo_t leaf_algo, typename storage_type>
void infer_k_launcher(storage_type forest, predict_params params,
cudaStream_t stream, int blockdim_x) {
params.num_blocks = params.num_blocks != 0
? params.num_blocks
: raft::ceildiv(int(params.num_rows), params.n_items);
if (params.cols_in_shmem) {
infer_k_nitems_launcher<leaf_algo, true>(forest, params, stream,
blockdim_x);
} else {
infer_k_nitems_launcher<leaf_algo, false>(forest, params, stream,
blockdim_x);
}
}
};

template <typename storage_type>
void infer(storage_type forest, predict_params params, cudaStream_t stream) {
switch (params.leaf_algo) {
case FLOAT_UNARY_BINARY:
infer_k_launcher<FLOAT_UNARY_BINARY>(forest, params, stream, FIL_TPB);
break;
case GROVE_PER_CLASS:
if (params.num_classes > FIL_TPB) {
params.leaf_algo = GROVE_PER_CLASS_MANY_CLASSES;
infer_k_launcher<GROVE_PER_CLASS_MANY_CLASSES>(forest, params, stream,
FIL_TPB);
} else {
params.leaf_algo = GROVE_PER_CLASS_FEW_CLASSES;
infer_k_launcher<GROVE_PER_CLASS_FEW_CLASSES>(
forest, params, stream, FIL_TPB - FIL_TPB % params.num_classes);
}
break;
case CATEGORICAL_LEAF:
infer_k_launcher<CATEGORICAL_LEAF>(forest, params, stream, FIL_TPB);
break;
case VECTOR_LEAF:
infer_k_launcher<VECTOR_LEAF>(forest, params, stream, FIL_TPB);
break;
default:
ASSERT(false, "internal error: invalid leaf_algo");
}
dispatch_on_fil_template_params<infer_k_launcher, storage_type>(
params, forest, stream);
}

template void infer<dense_storage>(dense_storage forest, predict_params params,
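
The core trick in dispatch_on_n_items is turning a runtime n_items in [1, 4] into a template argument by a linear recursion bounded with if constexpr, so only four specializations are ever instantiated. A self-contained sketch of the same technique, with hypothetical names and a generic lambda instead of the struct-based Func the PR settled on (the commit history shows a lambda-based attempt was tried and reverted):

#include <type_traits>

// try each candidate N in turn; `if constexpr (N < 4)` bounds the recursion, so
// exactly the specializations for N = 1..4 are instantiated
template <int N = 1, typename F>
void with_constexpr_n_items(int n_items, F f)
{
  if (n_items == N) {
    f(std::integral_constant<int, N>{});        // hand N over as a compile-time value
  } else if constexpr (N < 4) {
    with_constexpr_n_items<N + 1>(n_items, f);  // keep searching
  }
  // n_items outside [1, 4] silently does nothing here; the real code ASSERTs instead
}

// usage (launch_kernel is hypothetical):
//   with_constexpr_n_items(n_items, [](auto n) { launch_kernel<decltype(n)::value>(); });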