Unify template parameter dispatch for FIL inference and shared memory footprint estimation #4013

Merged: 38 commits, merged on Oct 27, 2021
Changes shown below are from 7 commits

Commits (38)
53e0683
try 1
levsnv Apr 17, 2021
31b885f
draft of set-and-launch
levsnv May 25, 2021
589f9ad
Merge remote-tracking branch 'rapidsai/branch-21.06' into extra-share…
levsnv May 25, 2021
26480b0
set carveout and occupancy-affecting preferred cache config before ev…
levsnv May 26, 2021
a037f16
other review comments
levsnv May 26, 2021
2a1d622
DRY: rewrote in terms of dispatch_on_FIL_template_params<func, storag…
levsnv Jun 12, 2021
bd3c505
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into extra-sh…
levsnv Jun 12, 2021
5cf38d3
style, clean up diff
levsnv Jun 12, 2021
7959f4d
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into extra-sh…
levsnv Jun 14, 2021
258e674
fixed bugs and linker issues
levsnv Jun 15, 2021
e0f53ea
removed unnecessary specialization in dispatch
levsnv Jun 15, 2021
ace36c0
simplified code to template-based dispatch
levsnv Jun 24, 2021
98509b0
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into dispatch
levsnv Jun 24, 2021
bcd05bc
reverted max_shm changes
levsnv Jun 24, 2021
36b27f1
simplified templates, fixed bug
levsnv Jun 24, 2021
06650f8
remove unnecessary forward declarations
levsnv Jun 25, 2021
4dd4f8a
halfway change
levsnv Jun 29, 2021
1931928
recording template lambda-based attempt
levsnv Jun 29, 2021
ca473db
Revert "recording template lambda-based attempt"
levsnv Jun 29, 2021
971b19a
wrapped template params into a struct
levsnv Jul 2, 2021
12dfaba
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into dispatch
levsnv Jul 2, 2021
d8505df
moved runtime args to constructor, separated ::run(...), added defaul…
levsnv Jul 3, 2021
1a0014d
Merge remote-tracking branch 'rapidsai/branch-21.10' into dispatch
levsnv Sep 29, 2021
5f2458f
KernelParams::inc_*
levsnv Sep 29, 2021
1b87aa0
dispatch_on_cats_present
levsnv Sep 30, 2021
5a03998
fix several issues
levsnv Oct 7, 2021
293ec2a
Merge branch 'branch-21.12' of github.com:rapidsai/cuml into dispatch
levsnv Oct 7, 2021
dbcaa7b
extern template void dispatch_on_fil_template_params(compute_smem_foo…
levsnv Oct 8, 2021
798171f
change-by-reference into accept-and-return-by-value
levsnv Oct 9, 2021
7e78203
variable renames
levsnv Oct 9, 2021
6ae0914
unnecessary changes
levsnv Oct 9, 2021
0978fc0
finish case adjustments
levsnv Oct 9, 2021
46eb819
NextLeafAlgo
levsnv Oct 9, 2021
4614ca2
MAX_N_ITEMS
levsnv Oct 9, 2021
9528148
stray changes
levsnv Oct 9, 2021
0b6cea8
removed decltype
levsnv Oct 20, 2021
a5a6ca6
next_leaf_algo, LEAF_ALGO_INVALID->MAX_LEAF_ALGO, ...
levsnv Oct 20, 2021
2b4555f
misc
levsnv Oct 20, 2021
4 changes: 3 additions & 1 deletion cpp/include/cuml/fil/fil.h
@@ -72,6 +72,8 @@ struct forest;
/** forest_t is the predictor handle */
typedef forest* forest_t;

constexpr int MAX_N_ITEMS = 4;
Comment (Contributor):
Is it needed externally? Or can it be moved to internal.cuh or common.cuh?

Reply (levsnv, Contributor Author):
I'd keep it external so the maximum n_items a caller can provide is easy to check from C++ (the Python API doesn't check such ranges).
Actually, the n_items assert is not an internal error, it's an API error. I fixed it.
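
(Editorial aside, not part of the PR: the kind of caller-side range check the reply above has in mind, which keeping MAX_N_ITEMS in the public header makes possible. The helper name, the include set, and the use of std::invalid_argument are illustrative assumptions; namespace qualification of MAX_N_ITEMS is omitted.)

#include <stdexcept>

// Validate the user-supplied n_items before filling treelite_params_t.
// 0 asks FIL to pick n_items automatically; any other value must lie in [1, MAX_N_ITEMS].
void check_n_items_range(int n_items)
{
  if (n_items != 0 && (n_items < 1 || n_items > MAX_N_ITEMS)) {
    throw std::invalid_argument("n_items must be 0 or in [1, MAX_N_ITEMS]");
  }
}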


/** treelite_params_t are parameters for importing treelite models */
struct treelite_params_t {
// algo is the inference algorithm
@@ -94,7 +96,7 @@ struct treelite_params_t {
// can only be a power of 2
int threads_per_tree;
// n_items is how many input samples (items) any thread processes. If 0 is given,
// choose most (up to 4) that fit into shared memory.
// choose most (up to MAX_N_ITEMS) that fit into shared memory.
int n_items;
// if non-nullptr, *pforest_shape_str will be set to caller-owned string that
// contains forest shape
110 changes: 50 additions & 60 deletions cpp/src/fil/common.cuh
@@ -130,16 +130,16 @@ struct shmem_size_params {
/// are the input columns are prefetched into shared
/// memory before inferring the row in question
bool cols_in_shmem = true;
// are there categorical inner nodes? doesnt' currently affect shared memory size,
// are there categorical inner nodes? doesn't currently affect shared memory size,
// but participates in template dispatch and may affect it later
bool cats_present;
bool cats_present = false;
/// log2_threads_per_tree determines how many threads work on a single tree
/// at once inside a block (sharing trees means splitting input rows)
int log2_threads_per_tree = 0;
/// n_items is how many input samples (items) any thread processes. If 0 is given,
/// choose the reasonable most (<=4) that fit into shared memory. See init_n_items()
/// choose the reasonable most (<= MAX_N_ITEMS) that fit into shared memory. See init_n_items()
int n_items = 0;
// block_dim_x is the CUDA block size
// block_dim_x is the CUDA block size. Set by dispatch_on_leaf_algo(...)
int block_dim_x = 0;
/// shm_sz is the associated shared memory footprint
int shm_sz = INT_MAX;
@@ -177,105 +177,95 @@ struct predict_params : shmem_size_params {
int num_blocks;
};

template <bool COLS_IN_SHMEM = false,
bool CATS_SUPPORTED = false,
int LEAF_ALGO = 0,
int N_ITEMS = 1>
struct KernelTemplateParameters {
static const bool cols_in_shmem = COLS_IN_SHMEM;
static const bool cats_supported = CATS_SUPPORTED;
static const leaf_algo_t leaf_algo = static_cast<leaf_algo_t>(LEAF_ALGO);
static const int n_items = N_ITEMS;
template <bool COLS_IN_SHMEM_ = false,
bool CATS_SUPPORTED_ = false,
int LEAF_ALGO_ = 0,
int N_ITEMS_ = 1>
struct KernelTemplateParams {
static const bool COLS_IN_SHMEM = COLS_IN_SHMEM_;
static const bool CATS_SUPPORTED = CATS_SUPPORTED_;
static const leaf_algo_t LEAF_ALGO = static_cast<leaf_algo_t>(LEAF_ALGO_);
static const int N_ITEMS = N_ITEMS_;

template <bool _cats_supported>
using replace_cats_supported =
KernelTemplateParameters<cols_in_shmem, _cats_supported, leaf_algo, n_items>;
using inc_leaf_algo =
KernelTemplateParameters<cols_in_shmem, cats_supported, leaf_algo + 1, n_items>;
using ReplaceCatsSupported =
KernelTemplateParams<COLS_IN_SHMEM, _cats_supported, LEAF_ALGO, N_ITEMS>;
using NextLeafAlgo = KernelTemplateParams<COLS_IN_SHMEM, CATS_SUPPORTED, LEAF_ALGO + 1, N_ITEMS>;
template <int _leaf_algo>
using replace_leaf_algo =
KernelTemplateParameters<cols_in_shmem, cats_supported, _leaf_algo, n_items>;
using inc_n_items =
KernelTemplateParameters<cols_in_shmem, cats_supported, leaf_algo, n_items + 1>;
using ReplaceLeafAlgo = KernelTemplateParams<COLS_IN_SHMEM, CATS_SUPPORTED, _leaf_algo, N_ITEMS>;
using IncNItems = KernelTemplateParams<COLS_IN_SHMEM, CATS_SUPPORTED, LEAF_ALGO, N_ITEMS + 1>;
};

namespace dispatch {

template <class KernelParams, class Func>
void dispatch_on_n_items(Func func, predict_params& params)
auto dispatch_on_n_items(Func func, predict_params params) -> decltype(func.run(params))
{
if (params.n_items == KernelParams::n_items) {
func.template run<KernelParams>(params);
} else if constexpr (KernelParams::n_items < 4) {
dispatch_on_n_items<class KernelParams::inc_n_items>(func, params);
if (params.n_items == KernelParams::N_ITEMS) {
return func.template run<KernelParams>(params);
} else if constexpr (KernelParams::N_ITEMS < MAX_N_ITEMS) {
return dispatch_on_n_items<class KernelParams::IncNItems>(func, params);
} else {
ASSERT(false, "internal error: n_items > 4 or < 1");
ASSERT(false, "internal error: n_items > %d or < 1", MAX_N_ITEMS);
}
return func.run(params); // appeasing the compiler
Comment (Contributor):
Is there a better way of appeasing the compiler?

Reply (levsnv, Contributor Author, Oct 15, 2021):
I could remove the elses, since the branches return, and flip the if constexpr condition. But I thought you prefer else after return?
Alternatively, return dispatch_on_n_items<KernelParams>(func, params) might work.
Lastly, perhaps template <class KernelParams, class Func, class T = decltype(declval<Func>().run(params))> might resolve both this and the signature decltype question?

Reply (canonizer, Contributor, Oct 19, 2021):
Do we need T to be in the template parameters? I guess yes if we want to use it in the return type, and no if we only need it in the function body.

In any case, even with T in the template parameters, it looks better if return T() or return T{} works. Perhaps you could add a documentation comment as to why you're doing this.
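
(Editorial sketch, not part of the diff: one way to combine the options discussed above — drop the else after the early return, and let the unreachable ASSERT branch value-initialize the result instead of calling func.run() purely to appease the compiler. It reuses the names and ASSERT macro from the surrounding code; whether this reads better than the merged version is the style question debated here.)

template <class KernelParams, class Func>
auto dispatch_on_n_items(Func func, predict_params params) -> decltype(func.run(params))
{
  if (params.n_items == KernelParams::N_ITEMS) { return func.template run<KernelParams>(params); }
  if constexpr (KernelParams::N_ITEMS < MAX_N_ITEMS) {
    return dispatch_on_n_items<typename KernelParams::IncNItems>(func, params);
  } else {
    ASSERT(false, "internal error: n_items > %d or < 1", MAX_N_ITEMS);
    // unreachable; value-initializing the deduced result type also works when it is void
    return decltype(func.run(params))();
  }
}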

}

template <class KernelParams, class Func>
void dispatch_on_leaf_algo(Func func, predict_params& params)
auto dispatch_on_leaf_algo(Func func, predict_params params) -> decltype(func.run(params))
{
if (params.leaf_algo == KernelParams::leaf_algo) {
if constexpr (KernelParams::leaf_algo == GROVE_PER_CLASS) {
if (params.leaf_algo == KernelParams::LEAF_ALGO) {
if constexpr (KernelParams::LEAF_ALGO == GROVE_PER_CLASS) {
if (params.num_classes <= FIL_TPB) {
params.block_dim_x = FIL_TPB - FIL_TPB % params.num_classes;
using Next = typename KernelParams::replace_leaf_algo<GROVE_PER_CLASS_FEW_CLASSES>;
dispatch_on_n_items<Next>(func, params);
using Next = typename KernelParams::ReplaceLeafAlgo<GROVE_PER_CLASS_FEW_CLASSES>;
return dispatch_on_n_items<Next>(func, params);
} else {
params.block_dim_x = FIL_TPB;
using Next = typename KernelParams::replace_leaf_algo<GROVE_PER_CLASS_MANY_CLASSES>;
dispatch_on_n_items<Next>(func, params);
using Next = typename KernelParams::ReplaceLeafAlgo<GROVE_PER_CLASS_MANY_CLASSES>;
return dispatch_on_n_items<Next>(func, params);
}
} else {
params.block_dim_x = FIL_TPB;
dispatch_on_n_items<KernelParams>(func, params);
return dispatch_on_n_items<KernelParams>(func, params);
}
} else if constexpr (KernelParams::leaf_algo + 1 < static_cast<int>(LEAF_ALGO_INVALID)) {
dispatch_on_leaf_algo<class KernelParams::inc_leaf_algo>(func, params);
} else if constexpr (KernelParams::NextLeafAlgo::LEAF_ALGO < LEAF_ALGO_INVALID) {
return dispatch_on_leaf_algo<class KernelParams::NextLeafAlgo>(func, params);
} else {
ASSERT(false, "internal error: dispatch: invalid leaf_algo %d", params.leaf_algo);
}
return func.run(params); // appeasing the compiler
}

template <class KernelParams, class Func>
void dispatch_on_cats_supported(Func func, predict_params& params)
auto dispatch_on_cats_supported(Func func, predict_params params) -> decltype(func.run(params))
{
if (params.cats_present)
dispatch_on_leaf_algo<typename KernelParams::replace_cats_supported<true>>(func, params);
else
dispatch_on_leaf_algo<typename KernelParams::replace_cats_supported<false>>(func, params);
return params.cats_present
? dispatch_on_leaf_algo<typename KernelParams::ReplaceCatsSupported<true>>(func, params)
: dispatch_on_leaf_algo<typename KernelParams::ReplaceCatsSupported<false>>(func,
params);
}

template <class Func>
void dispatch_on_cols_in_shmem(Func func, predict_params& params)
auto dispatch_on_cols_in_shmem(Func func, predict_params params) -> decltype(func.run(params))
{
if (params.cols_in_shmem)
dispatch_on_cats_supported<KernelTemplateParameters<true>>(func, params);
else
dispatch_on_cats_supported<KernelTemplateParameters<false>>(func, params);
return params.cols_in_shmem
? dispatch_on_cats_supported<KernelTemplateParams<true>>(func, params)
: dispatch_on_cats_supported<KernelTemplateParams<false>>(func, params);
}

} // namespace dispatch

template <class Func>
void dispatch_on_fil_template_params(Func func, predict_params& params)
auto dispatch_on_fil_template_params(Func func, predict_params params) -> decltype(func.run(params))
{
dispatch::dispatch_on_cols_in_shmem(func, params);
return dispatch::dispatch_on_cols_in_shmem(func, params);
}

/* For an example of Func, see this:
*
* We need to instantiate all get_smem_footprint instantiations in infer.cu.
* The only guarantee is by instantiating
* dispatch_on_FIL_template<compute_smem_footprint... in infer.cu. This
* requires a declaration of this struct with the declaration of the `run` template
* (i.e. all but one line) visible from infer.cu, as well as this full
* definition visible from fil.cu. We'll just define it in common.cuh.
*/
// For an example of Func, see this:
struct compute_smem_footprint {
template <class KernelParams>
void run(predict_params& ssp);
template <class KernelParams = KernelTemplateParams<>>
int run(predict_params ssp);
};

// infer() calls the inference kernel with the parameters on the stream
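
(Editorial aside, not part of the diff: the dispatch chain above is an instance of the usual "runtime value to compile-time constant" pattern — compare the runtime field against the current template constant and recurse with the next candidate on a mismatch, so only the matching instantiation of Func::run is ever called. A self-contained reduction to a single parameter, with all names hypothetical:)

#include <cassert>

constexpr int MAX_N_ITEMS_DEMO = 4;

// Try N_ITEMS = 1..MAX_N_ITEMS_DEMO until the runtime value matches, then call
// run<N_ITEMS>() so the value is available as a template (compile-time) argument.
template <int N_ITEMS = 1, class Func>
auto dispatch_n_items_demo(Func func, int n_items)
{
  if (n_items == N_ITEMS) {
    return func.template run<N_ITEMS>();
  } else if constexpr (N_ITEMS < MAX_N_ITEMS_DEMO) {
    return dispatch_n_items_demo<N_ITEMS + 1>(func, n_items);
  } else {
    assert(!"n_items out of range");
    return func.template run<N_ITEMS>();  // unreachable; keeps every path returning a value
  }
}

struct footprint_demo {
  template <int N_ITEMS>
  int run() const { return 64 * N_ITEMS; }  // pretend per-item shared-memory bytes
};

// Example: dispatch_n_items_demo(footprint_demo{}, 3) calls run<3>() and yields 192.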
25 changes: 12 additions & 13 deletions cpp/src/fil/fil.cu
@@ -98,9 +98,9 @@ __global__ void transform_k(float* preds,
}

// needed to avoid expanding the dispatch template into unresolved
// compute_smem_footprint::run<KernelParams>() calls. In infer.cu, we don't export those symbols,
// compute_smem_footprint::run() calls. In infer.cu, we don't export those symbols,
// but rather one symbol for the whole template specialization, as below.
extern template void dispatch_on_fil_template_params(compute_smem_footprint, predict_params&);
extern template int dispatch_on_fil_template_params(compute_smem_footprint, predict_params);
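
(Editorial aside, not part of the diff: the extern template declaration above is the standard explicit-instantiation split. A minimal hypothetical reduction, collapsed into one listing with comments marking which translation unit each piece corresponds to; all *_demo names are made up:)

// common.cuh analogue: full definition of the dispatch template, visible everywhere
template <class Func>
int dispatch_tu_demo(Func func, int params) { return func.run(params); }

struct footprint_tu_demo {
  int run(int params);  // declared here; the body lives only in the infer.cu analogue
};

// fil.cu analogue: use the dispatcher, but suppress implicit instantiation so the
// symbol is taken from the explicit instantiation below (in real code, from infer.cu)
extern template int dispatch_tu_demo(footprint_tu_demo, int);
int estimate_demo(int params) { return dispatch_tu_demo(footprint_tu_demo{}, params); }

// infer.cu analogue: define run() and emit the single explicit instantiation
int footprint_tu_demo::run(int params) { return 2 * params; }
template int dispatch_tu_demo(footprint_tu_demo, int);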

struct forest {
forest(const raft::handle_t& h) : vector_leaf_(0, h.get_stream()), cat_sets_(h.get_stream()) {}
@@ -130,16 +130,15 @@ struct forest {
shmem_size_params& ssp_ = predict_proba ? proba_ssp_ : class_ssp_;
ssp_.predict_proba = predict_proba;
shmem_size_params ssp = ssp_;
// if n_items was not provided, try from 1 to 4. Otherwise, use as-is.
// if n_items was not provided, try from 1 to MAX_N_ITEMS. Otherwise, use as-is.
int min_n_items = ssp.n_items == 0 ? 1 : ssp.n_items;
int max_n_items =
ssp.n_items == 0 ? (algo_ == algo_t::BATCH_TREE_REORG ? 4 : 1) : ssp.n_items;
ssp.n_items == 0 ? (algo_ == algo_t::BATCH_TREE_REORG ? MAX_N_ITEMS : 1) : ssp.n_items;
for (bool cols_in_shmem : {false, true}) {
ssp.cols_in_shmem = cols_in_shmem;
for (ssp.n_items = min_n_items; ssp.n_items <= max_n_items; ++ssp.n_items) {
predict_params pp = ssp;
dispatch_on_fil_template_params(compute_smem_footprint(), pp);
if (pp.shm_sz < max_shm) ssp_ = pp;
ssp.shm_sz = dispatch_on_fil_template_params(compute_smem_footprint(), ssp);
if (ssp.shm_sz < max_shm) ssp_ = ssp;
}
}
ASSERT(max_shm >= ssp_.shm_sz,
@@ -163,11 +162,6 @@ struct forest {
const std::vector<float>& vector_leaf,
const forest_params_t* params)
{
int device = h.get_device();
cudaStream_t stream = h.get_stream();
// categorical features
cat_sets_ = cat_sets_device_owner(cat_sets, stream);

depth_ = params->depth;
num_trees_ = params->num_trees;
algo_ = params->algo;
@@ -179,9 +173,11 @@ struct forest {
proba_ssp_.leaf_algo = params->leaf_algo;
proba_ssp_.num_cols = params->num_cols;
proba_ssp_.num_classes = params->num_classes;
proba_ssp_.cats_present = cat_sets_.accessor().cats_present();
proba_ssp_.cats_present = cat_sets.cats_present();
class_ssp_ = proba_ssp_;

int device = h.get_device();
cudaStream_t stream = h.get_stream();
init_n_items(device); // n_items takes priority over blocks_per_sm
init_fixed_block_count(device, params->blocks_per_sm);

@@ -195,6 +191,9 @@ struct forest {
cudaMemcpyHostToDevice,
stream));
}

// categorical features
cat_sets_ = cat_sets_device_owner(cat_sets, stream);
}

virtual void infer(predict_params params, cudaStream_t stream) = 0;
27 changes: 12 additions & 15 deletions cpp/src/fil/infer.cu
@@ -36,7 +36,7 @@
#endif // __CUDA_ARCH__
#endif // CUDA_PRAGMA_UNROLL

#define INLINE_CONFIG __noinline__
#define INLINE_CONFIG __forceinline__

namespace ML {
namespace fil {
@@ -786,7 +786,7 @@ __device__ INLINE_CONFIG void load_data(float* sdata,
template <int NITEMS,
leaf_algo_t leaf_algo,
bool cols_in_shmem,
bool cats_supported,
bool CATS_SUPPORTED,
class storage_type>
__global__ void infer_k(storage_type forest, predict_params params)
{
@@ -823,7 +823,7 @@ __global__ void infer_k(storage_type forest, predict_params params)
typedef typename leaf_output_t<leaf_algo>::T pred_t;
vec<NITEMS, pred_t> prediction;
if (tree < forest.num_trees() && thread_num_rows != 0) {
prediction = infer_one_tree<NITEMS, cats_supported, pred_t>(
prediction = infer_one_tree<NITEMS, CATS_SUPPORTED, pred_t>(
forest[tree],
cols_in_shmem ? sdata + thread_row0 * sdata_stride : block_input + thread_row0 * num_cols,
cols_in_shmem ? sdata_stride : num_cols,
@@ -855,32 +855,29 @@ size_t shmem_size_params::get_smem_footprint()
}

template <class KernelParams>
void compute_smem_footprint::run(predict_params& ssp)
int compute_smem_footprint::run(predict_params ssp)
{
// need GROVE_PER_CLASS_*_CLASSES
if constexpr (KernelParams::leaf_algo != GROVE_PER_CLASS) {
ssp.shm_sz = ssp.template get_smem_footprint<KernelParams::n_items, KernelParams::leaf_algo>();
}
return ssp.template get_smem_footprint<KernelParams::N_ITEMS, KernelParams::LEAF_ALGO>();
}

// make sure to instantiate all possible get_smem_footprint instantiations
template void dispatch_on_fil_template_params(compute_smem_footprint, predict_params&);
template int dispatch_on_fil_template_params(compute_smem_footprint, predict_params);

template <typename storage_type>
struct infer_k_storage_template {
storage_type forest;
cudaStream_t stream;

template <class KernelParams>
void run(predict_params& params)
template <class KernelParams = KernelTemplateParams<>>
void run(predict_params params)
{
params.num_blocks = params.num_blocks != 0
? params.num_blocks
: raft::ceildiv(int(params.num_rows), params.n_items);
infer_k<KernelParams::n_items,
KernelParams::leaf_algo,
KernelParams::cols_in_shmem,
KernelParams::cats_supported>
infer_k<KernelParams::N_ITEMS,
KernelParams::LEAF_ALGO,
KernelParams::COLS_IN_SHMEM,
KernelParams::CATS_SUPPORTED>
<<<params.num_blocks, params.block_dim_x, params.shm_sz, stream>>>(forest, params);
CUDA_CHECK(cudaPeekAtLastError());
}
2 changes: 1 addition & 1 deletion cpp/src/fil/internal.cuh
@@ -301,7 +301,7 @@ struct forest_params_t {
// at once inside a block (sharing trees means splitting input rows)
int threads_per_tree;
// n_items is how many input samples (items) any thread processes. If 0 is given,
// choose most (up to 4) that fit into shared memory.
// choose most (up to MAX_N_ITEMS) that fit into shared memory.
int n_items;
};
