From 53e0683155a4b0707fc36c104168ada5c12da2e4 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs
Date: Fri, 16 Apr 2021 23:29:06 -0700
Subject: [PATCH 01/30] try 1

---
 cpp/src/fil/common.cuh  |  4 ++--
 cpp/src/fil/fil.cu      |  5 +----
 cpp/src/fil/infer.cu    | 42 ++++++++++++++++++++++++++++++++++++-----
 cpp/test/sg/fil_test.cu |  2 ++
 4 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh
index 0beca695fc..3a487f050a 100644
--- a/cpp/src/fil/common.cuh
+++ b/cpp/src/fil/common.cuh
@@ -131,9 +131,9 @@ struct shmem_size_params {
   }
   void compute_smem_footprint();
   template
-  size_t get_smem_footprint();
+  int get_smem_footprint();
   template
-  size_t get_smem_footprint();
+  int get_smem_footprint();
 };
 
 // predict_params are parameters for prediction
diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu
index 4db300d6fb..639509a59d 100644
--- a/cpp/src/fil/fil.cu
+++ b/cpp/src/fil/fil.cu
@@ -72,13 +72,10 @@ __global__ void transform_k(float* preds, size_t n, output_t output,
 
 struct forest {
   void init_n_items(int device) {
-    int max_shm_std = 48 * 1024;  // 48 KiB
     /// the most shared memory a kernel can request on the GPU in question
     int max_shm = 0;
     CUDA_CHECK(cudaDeviceGetAttribute(
       &max_shm, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
-    // TODO(canonizer): use >48KiB shared memory if available
-    max_shm = std::min(max_shm, max_shm_std);
 
     // searching for the most items per block while respecting the shared
     // memory limits creates a full linear programming problem.
@@ -93,7 +90,7 @@ struct forest {
          ssp.n_items <= (algo_ == algo_t::BATCH_TREE_REORG ? 4 : 1);
          ++ssp.n_items) {
       ssp.compute_smem_footprint();
-      if (ssp.shm_sz < max_shm) ssp_ = ssp;
+      if (ssp.shm_sz <= max_shm) ssp_ = ssp;
     }
   }
   ASSERT(max_shm >= ssp_.shm_sz,
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index edf8337a1e..019b4f6c29 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -576,21 +576,53 @@ __global__ void infer_k(storage_type forest, predict_params params) {
   }
 }
 
+void set_carveout(void* kernel, int footprint, int max_shm) {
+  CUDA_CHECK(
+    cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout,
+                         // footprint in % of max_shm, rounding up
+                         (100 * footprint + max_shm - 1) / max_shm));
+  CUDA_CHECK(cudaFuncSetAttribute(
+    kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, footprint));
+}
+
+template
+void set_carveouts(int footprint) {
+  int device = 0;
+  CUDA_CHECK(cudaGetDevice(&device));
+  int max_shm = 0;
+  CUDA_CHECK(cudaDeviceGetAttribute(
+    &max_shm, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
+  if (footprint > max_shm) return;
+
+  set_carveout((void*)infer_k, footprint, max_shm);
+  set_carveout((void*)infer_k, footprint, max_shm);
+  set_carveout((void*)infer_k, footprint, max_shm);
+}
+
 template
-size_t shmem_size_params::get_smem_footprint() {
-  size_t finalize_footprint =
+int shmem_size_params::get_smem_footprint() {
+  int finalize_footprint =
     tree_aggregator_t::smem_finalize_footprint(
       cols_shmem_size(), num_classes, predict_proba);
-  size_t accumulate_footprint =
+  int accumulate_footprint =
     tree_aggregator_t::smem_accumulate_footprint(
       num_classes) +
     cols_shmem_size();
-  return std::max(accumulate_footprint, finalize_footprint);
+  int footprint = std::max(accumulate_footprint, finalize_footprint);
+  int max_shm_std = 48 * 1024;  // 48 KiB available on any architecture
+  if (footprint > max_shm_std) {
+    // for no cols_in_shmem, it is a matter of supporting this config at all
+    set_carveouts(footprint);
+    // for cols_in_shmem, it will accelerate performance
+    set_carveouts(footprint);
+    // This much may not suffice, in which case set_carveouts will do nothing.
+  }
+  return footprint;
 }
 
 template
-size_t shmem_size_params::get_smem_footprint() {
+int shmem_size_params::get_smem_footprint() {
   switch (leaf_algo) {
     case FLOAT_UNARY_BINARY:
       return get_smem_footprint();
diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu
index 1c32e3224e..215deacd77 100644
--- a/cpp/test/sg/fil_test.cu
+++ b/cpp/test/sg/fil_test.cu
@@ -749,6 +749,8 @@ std::vector predict_dense_inputs = {
   FIL_TEST_PARAMS(num_rows = 103, num_cols = 100'000, depth = 5, num_trees = 1,
                   algo = BATCH_TREE_REORG, leaf_algo = CATEGORICAL_LEAF,
                   num_classes = 3),
+  // use shared memory opt-in carveout if available, or infer out of L1 cache
+  FIL_TEST_PARAMS(num_cols = ((48 + 1) * 1024) / sizeof(float), algo = NAIVE),
 };
 
 TEST_P(PredictDenseFilTest, Predict) { compare(); }

From 31b885f7c3244456a778ff91ab0b2c2ca1c837ba Mon Sep 17 00:00:00 2001
From: Levs Dolgovs
Date: Tue, 25 May 2021 16:15:42 -0700
Subject: [PATCH 02/30] draft of set-and-launch

---
 cpp/src/fil/fil.cu   |  9 +++++++++
 cpp/src/fil/infer.cu | 49 ++++++++++++-------------------------------------
 2 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu
index 639509a59d..47c2701e8c 100644
--- a/cpp/src/fil/fil.cu
+++ b/cpp/src/fil/fil.cu
@@ -112,6 +112,11 @@ struct forest {
     fixed_block_count_ = blocks_per_sm * sm_count;
   }
 
+  void init_max_shm(int device) {
+    CUDA_CHECK(cudaDeviceGetAttribute(
+      &max_shm_, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
+  }
+
   void init_common(const raft::handle_t& h, const forest_params_t* params) {
     depth_ = params->depth;
     num_trees_ = params->num_trees;
@@ -127,6 +132,7 @@ struct forest {
     int device = h.get_device();
     init_n_items(device);  // n_items takes priority over blocks_per_sm
     init_fixed_block_count(device, params->blocks_per_sm);
+    init_max_shm(device);
   }
 
   virtual void infer(predict_params params, cudaStream_t stream) = 0;
@@ -250,6 +256,8 @@ struct forest {
     }
   }
 
+  int max_shm() { return max_shm_; }
+
   virtual void free(const raft::handle_t& h) = 0;
 
   virtual ~forest() {}
@@ -261,6 +269,7 @@ struct forest {
   float global_bias_ = 0;
   shmem_size_params class_ssp_, proba_ssp_;
   int fixed_block_count_ = 0;
+  int max_shm_ = 0;
 };
 
 struct dense_forest : forest {
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 019b4f6c29..f00665bb08 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -585,20 +585,6 @@ void set_carveout(void* kernel, int footprint, int max_shm) {
     kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, footprint));
 }
 
-template
-void set_carveouts(int footprint) {
-  int device = 0;
-  CUDA_CHECK(cudaGetDevice(&device));
-  int max_shm = 0;
-  CUDA_CHECK(cudaDeviceGetAttribute(
-    &max_shm, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
-  if (footprint > max_shm) return;
-
-  set_carveout((void*)infer_k, footprint, max_shm);
-  set_carveout((void*)infer_k, footprint, max_shm);
-  set_carveout((void*)infer_k, footprint, max_shm);
-}
-
 template
 int shmem_size_params::get_smem_footprint() {
   int finalize_footprint =
@@ -608,17 +594,7 @@ int shmem_size_params::get_smem_footprint() {
     tree_aggregator_t::smem_accumulate_footprint(
       num_classes) +
     cols_shmem_size();
-
-  int footprint = std::max(accumulate_footprint, finalize_footprint);
-  int max_shm_std = 48 * 1024;  // 48 KiB available on any architecture
-  if (footprint > max_shm_std) {
-    // for no cols_in_shmem, it is a matter of supporting this config at all
-    set_carveouts(footprint);
-    // for cols_in_shmem, it will accelerate performance
-    set_carveouts(footprint);
-    // This much may not suffice, in which case set_carveouts will do nothing.
-  }
-  return footprint;
+  return std::max(accumulate_footprint, finalize_footprint);
 }
 
 template
@@ -659,30 +635,29 @@ void shmem_size_params::compute_smem_footprint() {
 template
 void infer_k_nitems_launcher(storage_type forest, predict_params params,
                              cudaStream_t stream, int block_dim_x) {
+  void (*kernel)(storage_type, predict_params);
   switch (params.n_items) {
     case 1:
-      infer_k<1, leaf_algo, cols_in_shmem>
-        <<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
-                                                                    params);
+      kernel = infer_k<1, leaf_algo, cols_in_shmem>;
       break;
    case 2:
-      infer_k<2, leaf_algo, cols_in_shmem>
-        <<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
-                                                                    params);
+      kernel = infer_k<2, leaf_algo, cols_in_shmem>;
      break;
    case 3:
-      infer_k<3, leaf_algo, cols_in_shmem>
-        <<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
-                                                                    params);
+      kernel = infer_k<3, leaf_algo, cols_in_shmem>;
      break;
    case 4:
-      infer_k<4, leaf_algo, cols_in_shmem>
-        <<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
-                                                                    params);
+      kernel = infer_k<4, leaf_algo, cols_in_shmem>;
      break;
    default:
      ASSERT(false, "internal error: nitems > 4");
  }
+  // Two forests might be using the same handle, so
+  // large batch will run fastest if we set just before launching.
+  // This will not cause a race condition between setting and launching.
+  set_carveout((void*)kernel, params.shm_sz, forest.max_shm());
+  kernel<<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
+                                                                    params);
   CUDA_CHECK(cudaPeekAtLastError());
 }

From 26480b07b6f6e468d7fce6c92f9e4da189df5927 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs
Date: Tue, 25 May 2021 19:03:40 -0700
Subject: [PATCH 03/30] set carveout and occupancy-affecting preferred cache config before every inference

---
 cpp/src/fil/common.cuh |  2 ++
 cpp/src/fil/fil.cu     | 20 +++++++-------------
 cpp/src/fil/infer.cu   | 49 +++++++++++++++++++++++-------------------------
 3 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh
index 3a487f050a..61ebb048e2 100644
--- a/cpp/src/fil/common.cuh
+++ b/cpp/src/fil/common.cuh
@@ -123,6 +123,8 @@ struct shmem_size_params {
   bool cols_in_shmem = true;
   /// n_items is the most items per thread that fit into shared memory
   int n_items = 0;
+  /// max_shm is the maximum opt-in shared memory on the device
+  int max_shm = 0;
   /// shm_sz is the associated shared memory footprint
   int shm_sz = INT_MAX;
 
diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu
index aa626eb3eb..a48fa34953 100644
--- a/cpp/src/fil/fil.cu
+++ b/cpp/src/fil/fil.cu
@@ -76,11 +76,6 @@ __global__ void transform_k(float* preds, size_t n, output_t output,
 
 struct forest {
   void init_n_items(int device) {
-    /// the most shared memory a kernel can request on the GPU in question
-    int max_shm = 0;
-    CUDA_CHECK(cudaDeviceGetAttribute(
-      &max_shm, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
-
     // searching for the most items per block while respecting the shared
     // memory limits creates a full linear programming problem.
     // solving it in a single equation looks less tractable than this
@@ -94,10 +89,10 @@ struct forest {
          ssp.n_items <= (algo_ == algo_t::BATCH_TREE_REORG ? 4 : 1);
          ++ssp.n_items) {
       ssp.compute_smem_footprint();
-      if (ssp.shm_sz <= max_shm) ssp_ = ssp;
+      if (ssp.shm_sz <= ssp.max_shm) ssp_ = ssp;
     }
   }
-  ASSERT(max_shm >= ssp_.shm_sz,
+  ASSERT(ssp_.max_shm >= ssp_.shm_sz,
          "FIL out of shared memory. Perhaps the maximum number of \n"
         "supported classes is exceeded? 5'000 would still be safe.");
   }
 
@@ -116,12 +111,13 @@ struct forest {
     fixed_block_count_ = blocks_per_sm * sm_count;
   }
 
-  void init_max_shm(int device) {
-    CUDA_CHECK(cudaDeviceGetAttribute(
-      &max_shm_, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
-  }
+  void init_max_shm(int device) {}
 
   void init_common(const raft::handle_t& h, const forest_params_t* params) {
+    int device = h.get_device();
+    CUDA_CHECK(cudaDeviceGetAttribute(
+      &proba_ssp_.max_shm, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
+
     depth_ = params->depth;
     num_trees_ = params->num_trees;
     algo_ = params->algo;
@@ -133,10 +129,8 @@ struct forest {
     proba_ssp_.num_classes = params->num_classes;
     class_ssp_ = proba_ssp_;
 
-    int device = h.get_device();
     init_n_items(device);  // n_items takes priority over blocks_per_sm
     init_fixed_block_count(device, params->blocks_per_sm);
-    init_max_shm(device);
   }
 
   virtual void infer(predict_params params, cudaStream_t stream) = 0;
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 2c401a865e..d4015fe00c 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -16,6 +16,7 @@
 #include
 #include
+#include <mutex>
 #include
 #include
@@ -24,6 +25,8 @@
 namespace ML {
 namespace fil {
 
+std::mutex shmem_carveout_mutex;
+
 // vec wraps float[N] for cub::BlockReduce
 template
 struct vec;
@@ -575,11 +578,13 @@ __global__ void infer_k(storage_type forest, predict_params params) {
   }
 }
 
 void set_carveout(void* kernel, int footprint, int max_shm) {
-  CUDA_CHECK(
+  // ensure optimal occupancy in case default allows less blocks/SM
+  CUDA_CHECK_NO_THROW(
     cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout,
                          // footprint in % of max_shm, rounding up
                          (100 * footprint + max_shm - 1) / max_shm));
-  CUDA_CHECK(cudaFuncSetAttribute(
+  // even if the footprint < 48'000, ensure that we reset after previous forest
+  CUDA_CHECK_NO_THROW(cudaFuncSetAttribute(
     kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, footprint));
 }
@@ -633,30 +638,30 @@ void shmem_size_params::compute_smem_footprint() {
 template
 void infer_k_nitems_launcher(storage_type forest, predict_params params,
                              cudaStream_t stream, int block_dim_x) {
-  void (*kernel)(storage_type, predict_params);
-  switch (params.n_items) {
-    case 1:
-      kernel = infer_k<1, leaf_algo, cols_in_shmem>;
-      break;
-    case 2:
-      kernel = infer_k<2, leaf_algo, cols_in_shmem>;
-      break;
-    case 3:
-      kernel = infer_k<3, leaf_algo, cols_in_shmem>;
-      break;
-    case 4:
-      kernel = infer_k<4, leaf_algo, cols_in_shmem>;
-      break;
-    default:
-      ASSERT(false, "internal error: nitems > 4");
-  }
+  void (*kernels[])(storage_type, predict_params) = {
+    nullptr,
+    infer_k<1, leaf_algo, cols_in_shmem, storage_type>,
+    infer_k<2, leaf_algo, cols_in_shmem, storage_type>,
+    infer_k<3, leaf_algo, cols_in_shmem, storage_type>,
+    infer_k<4, leaf_algo, cols_in_shmem, storage_type>,
+  };
+  ASSERT(params.n_items <= 4, "internal error: nitems > 4");
+  void (*kernel)(storage_type, predict_params) = kernels[params.n_items];
   // Two forests might be using the same handle, so
   // large batch will run fastest if we set just before launching.
-  // This will not cause a race condition between setting and launching.
+  // This will not cause a race condition between setting and launching despite
+  // CPU-GPU asynchronicity.
+  shmem_carveout_mutex.lock();
-  set_carveout((void*)kernel, params.shm_sz, forest.max_shm());
+  set_carveout((void*)kernel, params.shm_sz, params.max_shm);
   kernel<<<params.num_blocks, block_dim_x, params.shm_sz, stream>>>(forest,
                                                                     params);
-  CUDA_CHECK(cudaPeekAtLastError());
+  CUDA_CHECK_NO_THROW(cudaPeekAtLastError());
+  shmem_carveout_mutex.unlock();  // a CUDA error should not hang other threads
+  if (cudaPeekAtLastError() != cudaSuccess) {
+    // a wrong thread might throw, it's OK
+    throw raft::cuda_error(
+      "CUDA error in ML::fil::predict() (see stdout for details)");
+  }
 }

From a037f169da84d08cf8475430f726177d1d24d427 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs
Date: Tue, 25 May 2021 19:13:01 -0700
Subject: [PATCH 04/30] other review comments

---
 cpp/src/fil/common.cuh  |  6 +++---
 cpp/src/fil/infer.cu    | 12 ++++++------
 cpp/test/sg/fil_test.cu |  3 ++-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh
index 61ebb048e2..53ebe7be30 100644
--- a/cpp/src/fil/common.cuh
+++ b/cpp/src/fil/common.cuh
@@ -126,16 +126,16 @@ struct shmem_size_params {
   /// max_shm is the maximum opt-in shared memory on the device
   int max_shm = 0;
   /// shm_sz is the associated shared memory footprint
-  int shm_sz = INT_MAX;
+  size_t shm_sz = INT_MAX;
 
   __host__ __device__ size_t cols_shmem_size() {
     return cols_in_shmem ? sizeof(float) * num_cols * n_items : 0;
   }
   void compute_smem_footprint();
   template
-  int get_smem_footprint();
+  size_t get_smem_footprint();
   template
-  int get_smem_footprint();
+  size_t get_smem_footprint();
 };
 
 // predict_params are parameters for prediction
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index d4015fe00c..5ef32f19b5 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -577,23 +577,23 @@ __global__ void infer_k(storage_type forest, predict_params params) {
   }
 }
 
-void set_carveout(void* kernel, int footprint, int max_shm) {
+void set_carveout(void* kernel, size_t footprint, int max_shm) {
   // ensure optimal occupancy in case default allows less blocks/SM
   CUDA_CHECK_NO_THROW(
     cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout,
                          // footprint in % of max_shm, rounding up
                          (100 * footprint + max_shm - 1) / max_shm));
-  // even if the footprint < 48'000, ensure that we reset after previous forest
+  // even if the footprint < 48 * 1024, ensure that we reset after previous forest
   CUDA_CHECK_NO_THROW(cudaFuncSetAttribute(
     kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, footprint));
 }
 
 template
-int shmem_size_params::get_smem_footprint() {
-  int finalize_footprint =
+size_t shmem_size_params::get_smem_footprint() {
+  size_t finalize_footprint =
     tree_aggregator_t::smem_finalize_footprint(
       cols_shmem_size(), num_classes, predict_proba);
-  int accumulate_footprint =
+  size_t accumulate_footprint =
     tree_aggregator_t::smem_accumulate_footprint(
       num_classes) +
     cols_shmem_size();
@@ -601,7 +601,7 @@ int shmem_size_params::get_smem_footprint() {
 }
 
 template
-int shmem_size_params::get_smem_footprint() {
+size_t shmem_size_params::get_smem_footprint() {
   switch (leaf_algo) {
     case FLOAT_UNARY_BINARY:
       return get_smem_footprint();
diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu
index 8d8b1a41c9..63c2eaeb32 100644
--- a/cpp/test/sg/fil_test.cu
+++ b/cpp/test/sg/fil_test.cu
@@ -763,7 +763,8 @@ std::vector predict_dense_inputs = {
                   algo = BATCH_TREE_REORG, leaf_algo = CATEGORICAL_LEAF,
                   num_classes = 3),
   // use shared memory opt-in carveout if available, or infer out of L1 cache
-  FIL_TEST_PARAMS(num_cols = ((48 + 1) * 1024) / sizeof(float), algo = NAIVE),
+  FIL_TEST_PARAMS(num_rows = 103, num_cols = ((48 + 1) * 1024) / sizeof(float),
+                  algo = NAIVE),
 };
 
 TEST_P(PredictDenseFilTest, Predict) { compare(); }

From 2a1d622105aa094f9614bba1c7a585a4e890f1b7 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs
Date: Fri, 11 Jun 2021 23:30:59 -0700
Subject: [PATCH 05/30] DRY: rewrote in terms of dispatch_on_FIL_template_params(predict_params, ...)

---
 cpp/src/fil/common.cuh   |  89 ++++++++++++++++++++++++++-
 cpp/src/fil/fil.cu       |  36 ++++++++++-
 cpp/src/fil/infer.cu     | 127 +++++----------------------------------
 cpp/src/fil/internal.cuh |   6 ++
 4 files changed, 140 insertions(+), 118 deletions(-)

diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh
index 53ebe7be30..65e66055ba 100644
--- a/cpp/src/fil/common.cuh
+++ b/cpp/src/fil/common.cuh
@@ -125,15 +125,14 @@ struct shmem_size_params {
   int n_items = 0;
   /// max_shm is the maximum opt-in shared memory on the device
   int max_shm = 0;
+  // blockdim_x is the CUDA block size
+  int blockdim_x = 0;
   /// shm_sz is the associated shared memory footprint
   size_t shm_sz = INT_MAX;
 
   __host__ __device__ size_t cols_shmem_size() {
     return cols_in_shmem ? sizeof(float) * num_cols * n_items : 0;
   }
-  void compute_smem_footprint();
-  template
-  size_t get_smem_footprint();
   template
   size_t get_smem_footprint();
 };
 
 // predict_params are parameters for prediction
@@ -158,6 +157,90 @@ struct predict_params : shmem_size_params {
   int num_blocks;
 };
 
+namespace dispatch {
+
+template <