From 86bc6b5ef96506318a87ce88e7ce97921000335d Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Tue, 8 Feb 2022 19:59:40 -0800 Subject: [PATCH 01/45] templatized node, forest and storage types --- cpp/include/cuml/fil/fil.h | 9 ++- cpp/src/fil/common.cuh | 46 ++++++------ cpp/src/fil/fil.cu | 68 ++++++++--------- cpp/src/fil/infer.cu | 8 +- cpp/src/fil/internal.cuh | 130 ++++++++++++++++++--------------- cpp/src/fil/treelite_import.cu | 4 +- 6 files changed, 136 insertions(+), 129 deletions(-) diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index 4cecb8aef4..dfa66ad1a8 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -55,15 +55,16 @@ enum algo_t { enum storage_type_t { /** decide automatically; currently always builds dense forests */ AUTO, - /** import the forest as dense */ + /** import the forest as dense (8 or 16-bytes nodes, depending on model precision */ DENSE, /** import the forest as sparse (currently always with 16-byte nodes) */ SPARSE, /** (experimental) import the forest as sparse with 8-byte nodes; can fail if 8-byte nodes are not enough to store the forest, e.g. there are too many - nodes in a tree or too many features; note that the number of bits used to - store the child or feature index can change in the future; this can affect - whether a particular forest can be imported as SPARSE8 */ + nodes in a tree or too many features or the thresholds are double precision; + note that the number of bits used to store the child or feature index can + change in the future; this can affect whether a particular forest can be + imported as SPARSE8 */ SPARSE8, }; static const char* storage_type_repr[] = {"AUTO", "DENSE", "SPARSE", "SPARSE8"}; diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 3174b63c26..56a98b5ac0 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -44,23 +44,27 @@ struct storage_base { bool cats_present() const { return sets_.cats_present(); } }; -/** dense_tree represents a dense tree */ - -struct dense_tree : tree_base { - __host__ __device__ dense_tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) +/** represents a dense tree */ +template +struct tree> : tree_base { + __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { } - __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; } - dense_node* nodes_ = nullptr; + __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; } + dense_node* nodes_ = nullptr; int node_pitch_ = 0; }; +using std::enable_if; + /** dense_storage stores the forest as a collection of dense nodes */ -struct dense_storage : storage_base { - __host__ __device__ dense_storage(categorical_sets cat_sets, +template +struct storage> : storage_base { + using node_t = dense_node; + __host__ __device__ storage(categorical_sets cat_sets, float* vector_leaf, - dense_node* nodes, + node_t* nodes, int num_trees, int tree_stride, int node_pitch) @@ -72,20 +76,20 @@ struct dense_storage : storage_base { { } __host__ __device__ int num_trees() const { return num_trees_; } - __host__ __device__ dense_tree operator[](int i) const + __host__ __device__ tree operator[](int i) const { - return dense_tree(sets_, nodes_ + i * tree_stride_, node_pitch_); + return tree(sets_, nodes_ + i * tree_stride_, node_pitch_); } - dense_node* nodes_ = nullptr; + node_t* nodes_ = nullptr; int num_trees_ = 0; int 
tree_stride_ = 0; int node_pitch_ = 0; }; -/** sparse_tree is a sparse tree */ +/** sparse tree */ template -struct sparse_tree : tree_base { - __host__ __device__ sparse_tree(categorical_sets cat_sets, node_t* nodes) +struct tree : tree_base { + __host__ __device__ tree(categorical_sets cat_sets, node_t* nodes) : tree_base{cat_sets}, nodes_(nodes) { } @@ -95,24 +99,24 @@ struct sparse_tree : tree_base { /** sparse_storage stores the forest as a collection of sparse nodes */ template -struct sparse_storage : storage_base { +struct storage : storage_base { int* trees_ = nullptr; node_t* nodes_ = nullptr; int num_trees_ = 0; - __host__ __device__ sparse_storage( + __host__ __device__ storage( categorical_sets cat_sets, float* vector_leaf, int* trees, node_t* nodes, int num_trees) : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees) { } __host__ __device__ int num_trees() const { return num_trees_; } - __host__ __device__ sparse_tree operator[](int i) const + __host__ __device__ tree operator[](int i) const { - return sparse_tree(sets_, &nodes_[trees_[i]]); + return tree(sets_, &nodes_[trees_[i]]); } }; -typedef sparse_storage sparse_storage16; -typedef sparse_storage sparse_storage8; +typedef storage> sparse_storage16; +typedef storage sparse_storage8; /// all model parameters mostly required to compute shared memory footprint, /// also the footprint itself diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index f00e310a6b..148fe845b2 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -17,7 +17,7 @@ /** @file fil.cu fil.cu implements the forest data types (dense and sparse), including their creation and prediction (the main inference kernel is defined in infer.cu). */ -#include "common.cuh" // for predict_params, sparse_storage, dense_storage +#include "common.cuh" // for predict_params, storage, storage #include "internal.cuh" // for cat_sets_device_owner, categorical_sets, output_t, #include // for algo_t, @@ -352,10 +352,12 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { } }; -struct dense_forest : forest { +template +struct dense_forest> : forest { + using node_t = dense_node; dense_forest(const raft::handle_t& h) : forest(h), nodes_(0, h.get_stream()) {} - void transform_trees(const dense_node* nodes) + void transform_trees(const node_t* nodes) { /* Populate node information: For each tree, the nodes are still stored in the breadth-first, @@ -385,9 +387,9 @@ struct dense_forest : forest { /// sparse_forest::init() void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, - const dense_node* nodes, + const node_t* nodes, const forest_params_t* params) { init_common(h, cat_sets, vector_leaf, params); @@ -403,12 +405,12 @@ struct dense_forest : forest { } RAFT_CUDA_TRY(cudaMemcpyAsync(nodes_.data(), h_nodes_.data(), - num_nodes * sizeof(dense_node), + num_nodes * sizeof(node_t), cudaMemcpyHostToDevice, h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem(max_shm_), + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem(max_shm_), static_cast(class_ssp_)); // copy must be finished before freeing the host data RAFT_CUDA_TRY(cudaStreamSynchronize(h.get_stream())); @@ -418,7 +420,7 @@ struct dense_forest : forest { virtual void infer(predict_params params, cudaStream_t stream) override { - dense_storage forest(cat_sets_.accessor(), 
+ storage forest(cat_sets_.accessor(), vector_leaf_.data(), nodes_.data(), num_trees_, @@ -433,8 +435,8 @@ struct dense_forest : forest { forest::free(h); } - rmm::device_uvector nodes_; - thrust::host_vector h_nodes_; + rmm::device_uvector nodes_; + thrust::host_vector h_nodes_; }; template @@ -446,7 +448,7 @@ struct sparse_forest : forest { void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const node_t* nodes, const forest_params_t* params) @@ -467,13 +469,13 @@ struct sparse_forest : forest { nodes_.data(), nodes, sizeof(node_t) * num_nodes_, cudaMemcpyHostToDevice, h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), static_cast(class_ssp_)); } virtual void infer(predict_params params, cudaStream_t stream) override { - sparse_storage forest( + storage forest( cat_sets_.accessor(), vector_leaf_.data(), trees_.data(), nodes_.data(), num_trees_); fil::infer(forest, params, stream); } @@ -572,7 +574,7 @@ template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params) @@ -584,30 +586,20 @@ void init(const raft::handle_t& h, *pf = f; } -// explicit instantiations for init_sparse() -template void init(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node16* nodes, - const forest_params_t* params); - -template void init(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node8* nodes, - const forest_params_t* params); - -template void init(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const dense_node* nodes, - const forest_params_t* params); +struct instantiate_forest_init { + template + void operator()(fil_node_t) { + init(raft::handle_t{}, + (forest_t*)nullptr, + categorical_sets{}, + std::vector{}, + (int*)nullptr, + (fil_node_t*)nullptr, + (forest_params_t*)nullptr); + } +}; + +template void instantiate_for_all_node_types(instantiate_forest_init); void free(const raft::handle_t& h, forest_t f) { diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index c51d9b3dc4..3e4d5eba56 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __forceinline__ +#define INLINE_CONFIG __noinline__ namespace ML { namespace fil { @@ -893,13 +893,13 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } -template void infer(dense_storage forest, +template void inferstorage>(storage> forest, predict_params params, cudaStream_t stream); -template void infer(sparse_storage16 forest, +template void infer>>(storage> forest, predict_params params, cudaStream_t stream); -template void infer(sparse_storage8 forest, +template void infer>(storage forest, predict_params params, cudaStream_t stream); diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 
253936be96..71e36e0ced 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -28,6 +28,7 @@ #include #include #include +#include namespace raft { class handle_t; @@ -89,19 +90,22 @@ enum output_t { }; /** val_t is the payload within a FIL leaf */ +template union val_t { - /** threshold value for parent node or output value (e.g. class - probability or regression summand) for leaf node */ - float f = NAN; + /** floating-point threshold value for parent node or output value + (e.g. class probability or regression summand) for leaf node */ + F f = NAN; /** class label, leaf vector index or categorical node set offset */ int idx; }; /** base_node contains common implementation details for dense and sparse nodes */ +template struct base_node { + using F = F_; // floating-point type /** val, for parent nodes, is a threshold or category list offset. For leaf nodes, it is the tree prediction (see see leaf_output_t::T) */ - val_t val; + val_t val; /** bits encode various information about the node, with the exact nature of this information depending on the node type; it includes e.g. whether the node is a leaf or inner node, and for inner nodes, additional information, @@ -115,16 +119,24 @@ struct base_node { static const int IS_CATEGORICAL_MASK = 1 << IS_CATEGORICAL_OFFSET; static const int FID_MASK = (1 << IS_CATEGORICAL_OFFSET) - 1; template - __host__ __device__ o_t output() const; + __host__ __device__ o_t output() const { + if constexpr(std::is_same()) { + return val.idx; + } else if constexpr(std::is_same()) { + return val.f; + } else { + return val; + } + } __host__ __device__ int set() const { return val.idx; } - __host__ __device__ float thresh() const { return val.f; } - __host__ __device__ val_t split() const { return val; } + __host__ __device__ F thresh() const { return val.f; } + __host__ __device__ val_t split() const { return val; } __host__ __device__ int fid() const { return bits & FID_MASK; } __host__ __device__ bool def_left() const { return bits & DEF_LEFT_MASK; } __host__ __device__ bool is_leaf() const { return bits & IS_LEAF_MASK; } __host__ __device__ bool is_categorical() const { return bits & IS_CATEGORICAL_MASK; } __host__ __device__ base_node() : val{}, bits(0) {} - base_node(val_t output, val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical) + base_node(val_t output, val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical) { RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into base_node"); bits = (fid & FID_MASK) | (def_left ? DEF_LEFT_MASK : 0) | (is_leaf ? 
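/* Worked example of the packing on this line (illustration only; the flag
   masks are the constants defined just above in base_node): an inner
   numerical node testing feature 5 with def_left set stores
   bits = 5 | DEF_LEFT_MASK; fid() then recovers 5 via bits & FID_MASK,
   and is_leaf() / is_categorical() each read back a single flag bit. */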
IS_LEAF_MASK : 0) | @@ -136,28 +148,13 @@ struct base_node { } }; -template <> -__host__ __device__ __forceinline__ float base_node::output() const -{ - return val.f; -} -template <> -__host__ __device__ __forceinline__ int base_node::output() const -{ - return val.idx; -} -template <> -__host__ __device__ __forceinline__ val_t base_node::output() const -{ - return val; -} - /** dense_node is a single node of a dense forest */ -struct alignas(8) dense_node : base_node { +template +struct alignas(8) dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite - dense_node(val_t output, - val_t split, + dense_node(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, @@ -171,20 +168,19 @@ struct alignas(8) dense_node : base_node { }; /** sparse_node16 is a 16-byte node in a sparse forest */ -struct alignas(16) sparse_node16 : base_node { +template +struct alignas(16) sparse_node16 : base_node { int left_idx; - int dummy; // make alignment explicit and reserve for future use __host__ __device__ sparse_node16() : left_idx(0), dummy(0) {} - sparse_node16(val_t output, - val_t split, + sparse_node16(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical, int left_index) : base_node(output, split, fid, def_left, is_leaf, is_categorical), - left_idx(left_index), - dummy(0) + left_idx(left_index) { } __host__ __device__ int left_index() const { return left_idx; } @@ -193,7 +189,7 @@ struct alignas(16) sparse_node16 : base_node { }; /** sparse_node8 is a node of reduced size (8 bytes) in a sparse forest */ -struct alignas(8) sparse_node8 : base_node { +struct alignas(8) sparse_node8 : base_node { static const int LEFT_NUM_BITS = 16; static const int FID_NUM_BITS = IS_CATEGORICAL_OFFSET - LEFT_NUM_BITS; static const int LEFT_OFFSET = FID_NUM_BITS; @@ -202,8 +198,8 @@ struct alignas(8) sparse_node8 : base_node { __host__ __device__ int fid() const { return bits & FID_MASK; } __host__ __device__ int left_index() const { return (bits & LEFT_MASK) >> LEFT_OFFSET; } sparse_node8() = default; - sparse_node8(val_t output, - val_t split, + sparse_node8(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, @@ -221,29 +217,40 @@ struct alignas(8) sparse_node8 : base_node { __host__ __device__ int left(int curr) const { return left_index(); } }; -struct dense_forest; +/// pass a functor with a templated operator()(fil_node_t), i.e. accepting one node as parameter +template +void instantiate_for_all_node_types(Stuff stuff) { + stuff(dense_node{}); + stuff(dense_node{}); + stuff(sparse_node16{}); + stuff(sparse_node16{}); + stuff(sparse_node8{}); +} + template -struct sparse_forest; +struct storage; -struct dense_storage; template -struct sparse_storage; +struct dense_forest; +template +struct sparse_forest; template struct node_traits { - using storage = sparse_storage; + using F = typename node_t::F; + using storage = ML::fil::storage; using forest = sparse_forest; static const bool IS_DENSE = false; static const storage_type_t storage_type_enum = - std::is_same() ? SPARSE : SPARSE8; + std::is_same, node_t>() ? 
SPARSE : SPARSE8; template static void check(const treelite::ModelImpl& model); }; -template <> -struct node_traits { - using storage = dense_storage; - using forest = dense_forest; +template +struct node_traits> { + using storage = storage>; + using forest = dense_forest>; static const bool IS_DENSE = true; static const storage_type_t storage_type_enum = DENSE; template @@ -288,27 +295,30 @@ enum leaf_algo_t { MAX_LEAF_ALGO = 5 }; -template +template +struct tree; + +template struct leaf_output_t { }; -template <> -struct leaf_output_t { - typedef float T; +template +struct leaf_output_t> { + typedef typename node_t::F T; }; -template <> -struct leaf_output_t { +template +struct leaf_output_t> { typedef int T; }; -template <> -struct leaf_output_t { - typedef float T; +template +struct leaf_output_t> { + typedef typename node_t::F T; }; -template <> -struct leaf_output_t { - typedef float T; +template +struct leaf_output_t> { + typedef typename node_t::F T; }; -template <> -struct leaf_output_t { +template +struct leaf_output_t> { typedef int T; }; diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 8a3d2121cd..22216a348b 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -692,8 +692,8 @@ void from_treelite(const raft::handle_t& handle, } switch (storage_type) { - case storage_type_t::DENSE: convert(handle, pforest, model, *tl_params); break; - case storage_type_t::SPARSE: convert(handle, pforest, model, *tl_params); break; + case storage_type_t::DENSE: convert>(handle, pforest, model, *tl_params); break; + case storage_type_t::SPARSE: convert>(handle, pforest, model, *tl_params); break; case storage_type_t::SPARSE8: convert(handle, pforest, model, *tl_params); break; default: ASSERT(false, "tl_params->sparse must be one of AUTO, DENSE or SPARSE"); } From abfb605f218119a41e457e65a254b704949018eb Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Thu, 10 Feb 2022 17:55:54 -0800 Subject: [PATCH 02/45] compiles? --- cpp/src/fil/common.cuh | 61 +++++++++++++++++++--------------- cpp/src/fil/fil.cu | 51 +++++++++++++++------------- cpp/src/fil/infer.cu | 20 +++++------ cpp/src/fil/internal.cuh | 38 +++++++++++---------- cpp/src/fil/treelite_import.cu | 13 +++++--- 5 files changed, 100 insertions(+), 83 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 56a98b5ac0..d5ec24b2af 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -38,9 +38,10 @@ __host__ __device__ __forceinline__ int forest_num_nodes(int num_trees, int dept return num_trees * tree_num_nodes(depth); } +template struct storage_base { categorical_sets sets_; - float* vector_leaf_; + F* vector_leaf_; bool cats_present() const { return sets_.cats_present(); } }; @@ -51,24 +52,25 @@ struct tree> : tree_base { : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { } - __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; } + __host__ __device__ const dense_node& operator[](int i) const + { + return nodes_[i * node_pitch_]; + } dense_node* nodes_ = nullptr; - int node_pitch_ = 0; + int node_pitch_ = 0; }; -using std::enable_if; - -/** dense_storage stores the forest as a collection of dense nodes */ +/** partial specialization of storage. 
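(Selected when the node type argument is dense_node<F>; sparse node types fall through to the primary template further below.)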
Stores the forest on GPU as a collection of dense nodes */ template -struct storage> : storage_base { +struct storage> : storage_base { using node_t = dense_node; __host__ __device__ storage(categorical_sets cat_sets, - float* vector_leaf, - node_t* nodes, - int num_trees, - int tree_stride, - int node_pitch) - : storage_base{cat_sets, vector_leaf}, + F* vector_leaf, + node_t* nodes, + int num_trees, + int tree_stride, + int node_pitch) + : storage_base{cat_sets, vector_leaf}, nodes_(nodes), num_trees_(num_trees), tree_stride_(tree_stride), @@ -78,12 +80,13 @@ struct storage> : storage_base { __host__ __device__ int num_trees() const { return num_trees_; } __host__ __device__ tree operator[](int i) const { - return tree(sets_, nodes_ + i * tree_stride_, node_pitch_); + // sets_ is a dependent name (in template sense) + return tree(this->sets_, nodes_ + i * tree_stride_, node_pitch_); } - node_t* nodes_ = nullptr; - int num_trees_ = 0; - int tree_stride_ = 0; - int node_pitch_ = 0; + node_t* nodes_ = nullptr; + int num_trees_ = 0; + int tree_stride_ = 0; + int node_pitch_ = 0; }; /** sparse tree */ @@ -97,21 +100,23 @@ struct tree : tree_base { node_t* nodes_ = nullptr; }; -/** sparse_storage stores the forest as a collection of sparse nodes */ +/** storage stores the forest on GPU as a collection of sparse nodes */ template -struct storage : storage_base { +struct storage : storage_base { + using F = typename node_t::F; int* trees_ = nullptr; node_t* nodes_ = nullptr; int num_trees_ = 0; - __host__ __device__ storage( - categorical_sets cat_sets, float* vector_leaf, int* trees, node_t* nodes, int num_trees) - : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees) + __host__ __device__ + storage(categorical_sets cat_sets, F* vector_leaf, int* trees, node_t* nodes, int num_trees) + : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees) { } __host__ __device__ int num_trees() const { return num_trees_; } __host__ __device__ tree operator[](int i) const { - return tree(sets_, &nodes_[trees_[i]]); + // sets_ is a dependent name (in template sense) + return tree(this->sets_, &nodes_[trees_[i]]); } }; @@ -147,6 +152,8 @@ struct shmem_size_params { int block_dim_x = 0; /// shm_sz is the associated shared memory footprint int shm_sz = INT_MAX; + /// sizeof_fp_vars is the size in bytes of all floating-point variables during inference + std::size_t sizeof_fp_vars = 4; __host__ __device__ int sdata_stride() { @@ -154,7 +161,7 @@ struct shmem_size_params { } __host__ __device__ int cols_shmem_size() { - return cols_in_shmem ? sizeof(float) * sdata_stride() * n_items << log2_threads_per_tree : 0; + return cols_in_shmem ? sizeof_fp_vars * sdata_stride() * n_items << log2_threads_per_tree : 0; } template size_t get_smem_footprint(); @@ -169,8 +176,8 @@ struct predict_params : shmem_size_params { int num_outputs; // Data parameters. - float* preds; - const float* data; + void* preds; + const void* data; // number of data rows (instances) to predict on int64_t num_rows; diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 148fe845b2..594d90ad0d 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -87,7 +87,7 @@ struct forest { we would have otherwise silently overflowed the index calculation due to short division. It would have failed cpp tests, but we might forget about this source of bugs, if not for the failing assert. 
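As a concrete illustration of the limit being guarded: with an unsigned 16-bit index and 8-byte double-precision values, the assert below tolerates up to 8 * 65535 = 524,280 bytes of dynamic shared memory before the index arithmetic could overflow.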
*/ - ASSERT(max_shm_ < int(sizeof(float)) * std::numeric_limits::max(), + ASSERT(max_shm_ < int(proba_ssp_.sizeof_fp_vars) * std::numeric_limits::max(), "internal error: please use a larger type inside" " infer_k for column count"); } @@ -128,9 +128,10 @@ struct forest { fixed_block_count_ = blocks_per_sm * sm_count; } + template void init_common(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const forest_params_t* params) { depth_ = params->depth; @@ -145,6 +146,7 @@ struct forest { proba_ssp_.num_cols = params->num_cols; proba_ssp_.num_classes = params->num_classes; proba_ssp_.cats_present = cat_sets.cats_present(); + proba_ssp_.sizeof_fp_vars = sizeof(F); class_ssp_ = proba_ssp_; int device = h.get_device(); @@ -155,11 +157,11 @@ struct forest { // vector leaf if (!vector_leaf.empty()) { - vector_leaf_.resize(vector_leaf.size(), stream); + vector_leaf_.resize(vector_leaf.size() * sizeof(F), stream); RAFT_CUDA_TRY(cudaMemcpyAsync(vector_leaf_.data(), vector_leaf.data(), - vector_leaf.size() * sizeof(float), + vector_leaf.size() * sizeof(F), cudaMemcpyHostToDevice, stream)); } @@ -328,7 +330,7 @@ struct forest { int fixed_block_count_ = 0; int max_shm_ = 0; // Optionally used - rmm::device_uvector vector_leaf_; + rmm::device_uvector vector_leaf_; cat_sets_device_owner cat_sets_; }; @@ -410,7 +412,7 @@ struct dense_forest> : forest { h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem(max_shm_), + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), static_cast(class_ssp_)); // copy must be finished before freeing the host data RAFT_CUDA_TRY(cudaStreamSynchronize(h.get_stream())); @@ -420,12 +422,12 @@ struct dense_forest> : forest { virtual void infer(predict_params params, cudaStream_t stream) override { - storage forest(cat_sets_.accessor(), - vector_leaf_.data(), - nodes_.data(), - num_trees_, - algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, - algo_ == algo_t::NAIVE ? 1 : num_trees_); + storage forest(cat_sets_.accessor(), + reinterpret_cast(vector_leaf_.data()), + nodes_.data(), + num_trees_, + algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, + algo_ == algo_t::NAIVE ? 
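/* Layout note for the two ternaries in this constructor call: NAIVE keeps
   each tree contiguous (tree_stride = nodes-per-tree, node_pitch = 1),
   while the tree-reorg algorithms interleave the trees so that node i of
   every tree is adjacent (tree_stride = 1, node_pitch = num_trees_). */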
1 : num_trees_); fil::infer(forest, params, stream); } @@ -475,8 +477,11 @@ struct sparse_forest : forest { virtual void infer(predict_params params, cudaStream_t stream) override { - storage forest( - cat_sets_.accessor(), vector_leaf_.data(), trees_.data(), nodes_.data(), num_trees_); + storage forest(cat_sets_.accessor(), + reinterpret_cast(vector_leaf_.data()), + trees_.data(), + nodes_.data(), + num_trees_); fil::infer(forest, params, stream); } @@ -570,11 +575,11 @@ void check_params(const forest_params_t* params, bool dense) /** initializes a forest of any type * When fil_node_t == dense_node, const int* trees is ignored */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params) @@ -588,14 +593,12 @@ void init(const raft::handle_t& h, struct instantiate_forest_init { template - void operator()(fil_node_t) { - init(raft::handle_t{}, - (forest_t*)nullptr, - categorical_sets{}, - std::vector{}, - (int*)nullptr, - (fil_node_t*)nullptr, - (forest_params_t*)nullptr); + void operator()(fil_node_t) + { + if constexpr (std::is_same()) + init({}, {}, {}, std::vector(), {}, {}, {}); + else + init({}, {}, {}, std::vector(), {}, {}, {}); } }; diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 3e4d5eba56..88cf63375e 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -800,7 +800,7 @@ __global__ void infer_k(storage_type forest, predict_params params) block_row0 += rows_per_block * gridDim.x) { int block_num_rows = max(0, (int)min((int64_t)rows_per_block, (int64_t)params.num_rows - block_row0)); - const float* block_input = params.data + block_row0 * num_cols; + const float* block_input = reinterpret_cast(params.data) + block_row0 * num_cols; if constexpr (cols_in_shmem) load_data(sdata, block_input, params, rows_per_block, block_num_rows); @@ -820,7 +820,7 @@ __global__ void infer_k(storage_type forest, predict_params params) and is made exact below. Same with thread_num_rows > 0 */ - typedef typename leaf_output_t::T pred_t; + typedef typename leaf_output_t::T pred_t; vec prediction; if (tree < forest.num_trees() && thread_num_rows != 0) { prediction = infer_one_tree( @@ -833,7 +833,7 @@ __global__ void infer_k(storage_type forest, predict_params params) // Dummy threads can be marked as having 0 rows acc.accumulate(prediction, tree, tree < forest.num_trees() ? 
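/* Aside on the instantiate_forest_init functor above (a sketch of the
   idiom, not a new API): instantiating
   instantiate_for_all_node_types(instantiate_forest_init{}) makes the
   compiler emit init<fil_node_t>() for every supported node type in one
   place, replacing the previous hand-written list of explicit
   instantiations of init(). */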
thread_num_rows : 0); }
-  acc.finalize(params.preds + params.num_outputs * block_row0,
+  acc.finalize(reinterpret_cast<float*>(params.preds) + params.num_outputs * block_row0,
               block_num_rows,
               params.num_outputs,
               params.transform,
@@ -893,15 +893,15 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream)
   dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params);
 }
-template void infer<storage<dense_node<float>>>(storage<dense_node<float>> forest,
-                                                predict_params params,
-                                                cudaStream_t stream);
+template void infer<storage<dense_node<float>>>(storage<dense_node<float>> forest,
+                                                 predict_params params,
+                                                 cudaStream_t stream);
 template void infer<storage<sparse_node16<float>>>(storage<sparse_node16<float>> forest,
-                                                   predict_params params,
-                                                   cudaStream_t stream);
+                                                    predict_params params,
+                                                    cudaStream_t stream);
 template void infer<storage<sparse_node8>>(storage<sparse_node8> forest,
-                                           predict_params params,
-                                           cudaStream_t stream);
+                                            predict_params params,
+                                            cudaStream_t stream);
 } // namespace fil
 } // namespace ML
diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh
index 71e36e0ced..b75947046d 100644
--- a/cpp/src/fil/internal.cuh
+++ b/cpp/src/fil/internal.cuh
@@ -27,8 +27,8 @@
 #include
 #include
 #include
-#include
 #include
+#include
 namespace raft {
 class handle_t;
@@ -90,7 +90,7 @@ enum output_t {
 };
 /** val_t is the payload within a FIL leaf */
-template
+template
 union val_t {
   /** floating-point threshold value for parent node or output value
       (e.g. class probability or regression summand) for leaf node */
@@ -100,9 +100,9 @@ union val_t {
 };
 /** base_node contains common implementation details for dense and sparse nodes */
-template
+template
 struct base_node {
-  using F = F_; // floating-point type
+  using F = F_;  // floating-point type
   /** val, for parent nodes, is a threshold or category list offset. For leaf
       nodes, it is the tree prediction (see see leaf_output_t::T) */
   val_t<F> val;
@@ -119,10 +119,11 @@ struct base_node {
   static const int IS_CATEGORICAL_MASK = 1 << IS_CATEGORICAL_OFFSET;
   static const int FID_MASK = (1 << IS_CATEGORICAL_OFFSET) - 1;
   template <typename o_t>
-  __host__ __device__ o_t output() const {
-    if constexpr(std::is_same<o_t, int>()) {
+  __host__ __device__ o_t output() const
+  {
+    if constexpr (std::is_same<o_t, int>()) {
       return val.idx;
-    } else if constexpr(std::is_same<o_t, F>()) {
+    } else if constexpr (std::is_same<o_t, F>()) {
       return val.f;
     } else {
       return val;
     }
   }
   __host__ __device__ int set() const { return val.idx; }
   __host__ __device__ base_node() : val{}, bits(0) {}
-  base_node(val_t<F> output, val_t<F> split, int fid, bool def_left, bool is_leaf, bool is_categorical)
+  base_node(
+    val_t<F> output, val_t<F> split, int fid, bool def_left, bool is_leaf, bool is_categorical)
   {
     RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into base_node");
     bits = (fid & FID_MASK) | (def_left ? DEF_LEFT_MASK : 0) | (is_leaf ? 
IS_LEAF_MASK : 0) | @@ -149,7 +151,7 @@ struct base_node { }; /** dense_node is a single node of a dense forest */ -template +template struct alignas(8) dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite @@ -160,7 +162,7 @@ struct alignas(8) dense_node : base_node { bool is_leaf, bool is_categorical, int left_index = -1) - : base_node(output, split, fid, def_left, is_leaf, is_categorical) + : base_node(output, split, fid, def_left, is_leaf, is_categorical) { } /** index of the left child, where curr is the index of the current node */ @@ -168,10 +170,10 @@ struct alignas(8) dense_node : base_node { }; /** sparse_node16 is a 16-byte node in a sparse forest */ -template +template struct alignas(16) sparse_node16 : base_node { int left_idx; - __host__ __device__ sparse_node16() : left_idx(0), dummy(0) {} + __host__ __device__ sparse_node16() : left_idx(0) {} sparse_node16(val_t output, val_t split, int fid, @@ -179,8 +181,7 @@ struct alignas(16) sparse_node16 : base_node { bool is_leaf, bool is_categorical, int left_index) - : base_node(output, split, fid, def_left, is_leaf, is_categorical), - left_idx(left_index) + : base_node(output, split, fid, def_left, is_leaf, is_categorical), left_idx(left_index) { } __host__ __device__ int left_index() const { return left_idx; } @@ -205,7 +206,7 @@ struct alignas(8) sparse_node8 : base_node { bool is_leaf, bool is_categorical, int left_index) - : base_node(output, split, fid, def_left, is_leaf, is_categorical) + : base_node(output, split, fid, def_left, is_leaf, is_categorical) { RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into sparse_node8"); @@ -218,8 +219,9 @@ struct alignas(8) sparse_node8 : base_node { }; /// pass a functor with a templated operator()(fil_node_t), i.e. 
accepting one node as parameter -template -void instantiate_for_all_node_types(Stuff stuff) { +template +void instantiate_for_all_node_types(Stuff stuff) +{ stuff(dense_node{}); stuff(dense_node{}); stuff(sparse_node16{}); @@ -237,7 +239,7 @@ struct sparse_forest; template struct node_traits { - using F = typename node_t::F; + using F = typename node_t::F; using storage = ML::fil::storage; using forest = sparse_forest; static const bool IS_DENSE = false; diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 22216a348b..ede325a09d 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -324,7 +324,7 @@ conversion_state tl2fil_inner_node(int fil_left_child, std::size_t* bit_pool_offset) { int tl_left = tree.LeftChild(tl_node_id), tl_right = tree.RightChild(tl_node_id); - val_t split = {.f = NAN}; // yes there's a default initializer already + val_t split = {.f = NAN}; // yes there's a default initializer already int feature_id = tree.SplitIndex(tl_node_id); bool is_categorical = tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical && tree.MatchingCategories(tl_node_id).size() > 0; @@ -352,7 +352,8 @@ conversion_state tl2fil_inner_node(int fil_left_child, ASSERT(false, "only numerical and categorical split nodes are supported"); } bool default_left = tree.DefaultLeft(tl_node_id) ^ swap_child_nodes; - fil_node_t node(val_t{}, split, feature_id, default_left, false, is_categorical, fil_left_child); + fil_node_t node( + val_t{}, split, feature_id, default_left, false, is_categorical, fil_left_child); return conversion_state{node, swap_child_nodes}; } @@ -692,8 +693,12 @@ void from_treelite(const raft::handle_t& handle, } switch (storage_type) { - case storage_type_t::DENSE: convert>(handle, pforest, model, *tl_params); break; - case storage_type_t::SPARSE: convert>(handle, pforest, model, *tl_params); break; + case storage_type_t::DENSE: + convert>(handle, pforest, model, *tl_params); + break; + case storage_type_t::SPARSE: + convert>(handle, pforest, model, *tl_params); + break; case storage_type_t::SPARSE8: convert(handle, pforest, model, *tl_params); break; default: ASSERT(false, "tl_params->sparse must be one of AUTO, DENSE or SPARSE"); } From 96e2ae6ce85f798417ad90dfca58f36b19068d40 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Thu, 10 Feb 2022 18:54:56 -0800 Subject: [PATCH 03/45] simplify leaf_output_t --- cpp/src/fil/infer.cu | 2 +- cpp/src/fil/internal.cuh | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 88cf63375e..98afb656f2 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -820,7 +820,7 @@ __global__ void infer_k(storage_type forest, predict_params params) and is made exact below. 
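For example (illustration only): with NITEMS = 4 and block_num_rows = 10, the thread group that starts at row 8 initially claims 4 rows and is then clamped to the 2 rows that actually exist.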
Same with thread_num_rows > 0 */ - typedef typename leaf_output_t::T pred_t; + typedef typename leaf_output_t::T pred_t; vec prediction; if (tree < forest.num_trees() && thread_num_rows != 0) { prediction = infer_one_tree( diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index b75947046d..0295cde8d1 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -300,27 +300,27 @@ enum leaf_algo_t { template struct tree; -template +template struct leaf_output_t { }; -template -struct leaf_output_t> { - typedef typename node_t::F T; +template +struct leaf_output_t { + typedef F T; }; -template -struct leaf_output_t> { +template +struct leaf_output_t { typedef int T; }; -template -struct leaf_output_t> { - typedef typename node_t::F T; +template +struct leaf_output_t { + typedef F T; }; -template -struct leaf_output_t> { - typedef typename node_t::F T; +template +struct leaf_output_t { + typedef F T; }; -template -struct leaf_output_t> { +template +struct leaf_output_t { typedef int T; }; From 35ad5d9fcd5e42c8b4ae8de00913084dca7b3834 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Tue, 15 Feb 2022 16:43:24 -0800 Subject: [PATCH 04/45] draft --- cpp/src/fil/common.cuh | 8 +- cpp/src/fil/fil.cu | 45 +++++--- cpp/src/fil/infer.cu | 183 +++++++++++++++++---------------- cpp/src/fil/internal.cuh | 8 +- cpp/src/fil/treelite_import.cu | 11 +- 5 files changed, 143 insertions(+), 112 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index d5ec24b2af..18d935213a 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -46,8 +46,9 @@ struct storage_base { }; /** represents a dense tree */ -template -struct tree> : tree_base { +template +struct tree> : tree_base { + using F = F_; __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { @@ -92,6 +93,7 @@ struct storage> : storage_base { /** sparse tree */ template struct tree : tree_base { + using F = typename node_t::F; __host__ __device__ tree(categorical_sets cat_sets, node_t* nodes) : tree_base{cat_sets}, nodes_(nodes) { @@ -163,7 +165,7 @@ struct shmem_size_params { { return cols_in_shmem ? sizeof_fp_vars * sdata_stride() * n_items << log2_threads_per_tree : 0; } - template + template size_t get_smem_footprint(); }; diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index e534d16733..19c0aa64b1 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -34,6 +34,7 @@ creation and prediction (the main inference kernel is defined in infer.cu). */ namespace ML { namespace fil { +__host__ __device__ double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); } __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } /** performs additional transformations on the array of forest predictions @@ -41,19 +42,20 @@ __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } averaging (multiplying by inv_num_trees), adding global_bias (always done), sigmoid and applying threshold. 
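For example, with output == output_t(AVG | SIGMOID), a raw margin m becomes sigmoid(m * inv_num_trees + global_bias);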
in case of complement_proba, fills in the complement probability */ -__global__ void transform_k(float* preds, +template +__global__ void transform_k(F* preds, size_t n, output_t output, - float inv_num_trees, - float threshold, - float global_bias, + F inv_num_trees, + F threshold, + F global_bias, bool complement_proba) { size_t i = threadIdx.x + size_t(blockIdx.x) * blockDim.x; if (i >= n) return; if (complement_proba && i % 2 != 0) return; - float result = preds[i]; + F result = preds[i]; if ((output & output_t::AVG) != 0) result *= inv_num_trees; result += global_bias; if ((output & output_t::SIGMOID) != 0) result = sigmoid(result); @@ -172,8 +174,9 @@ struct forest { virtual void infer(predict_params params, cudaStream_t stream) = 0; + template void predict( - const raft::handle_t& h, float* preds, const float* data, size_t num_rows, bool predict_proba) + const raft::handle_t& h, F* preds, const F* data, size_t num_rows, bool predict_proba) { // Initialize prediction parameters. predict_params params(predict_proba ? proba_ssp_ : class_ssp_); @@ -252,7 +255,7 @@ struct forest { // Simulating treelite order, which cancels out bias. // If non-proba prediction used, it still will not matter // for the same reason softmax will not. - float global_bias = (ot & output_t::SOFTMAX) != 0 ? 0.0f : global_bias_; + F global_bias = (ot & output_t::SOFTMAX) != 0 ? F(0.0) : global_bias_; bool complement_proba = false, do_transform; if (predict_proba) { @@ -269,17 +272,17 @@ struct forest { // for GROVE_PER_CLASS, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; - do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != 0.0f; + do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != F(0.0); break; case leaf_algo_t::CATEGORICAL_LEAF: params.num_outputs = params.num_classes; - do_transform = ot != output_t::RAW || global_bias_ != 0.0f; + do_transform = ot != output_t::RAW || global_bias_ != F(0.0); break; case leaf_algo_t::VECTOR_LEAF: // for VECTOR_LEAF, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; - do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != 0.0f; + do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != F(0.0); break; default: ASSERT(false, "internal error: predict: invalid leaf_algo %d", params.leaf_algo); } @@ -304,9 +307,9 @@ struct forest { preds, num_values_to_transform, ot, - num_trees_ > 0 ? (1.0f / num_trees_) : 1.0f, - threshold_, - global_bias, + num_trees_ > 0 ? 
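/* the expression below implements output_t::AVG: transform_k receives
   inv_num_trees and multiplies by it, turning the accumulated sum of
   per-tree outputs into their mean */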
(F(1.0) / num_trees_) : F(1.0), + F(threshold_), + F(global_bias), complement_proba); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -324,8 +327,8 @@ struct forest { int depth_ = 0; algo_t algo_ = algo_t::NAIVE; output_t output_ = output_t::RAW; - float threshold_ = 0.5; - float global_bias_ = 0; + double threshold_ = 0.5; + double global_bias_ = 0; shmem_size_params class_ssp_, proba_ssp_; int fixed_block_count_ = 0; int max_shm_ = 0; @@ -610,6 +613,18 @@ void free(const raft::handle_t& h, forest_t f) delete f; } +/// part of C API - overload instead of template +void predict(const raft::handle_t& h, + forest_t f, + double* preds, + const double* data, + size_t num_rows, + bool predict_proba) +{ + f->predict(h, preds, data, num_rows, predict_proba); +} + +/// part of C API - overload instead of template void predict(const raft::handle_t& h, forest_t f, float* preds, diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 98afb656f2..383c0641c6 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -41,7 +41,7 @@ namespace ML { namespace fil { -// vec wraps float[N] for cub::BlockReduce +// vec wraps float[N], int[N] or double[N] for cub::BlockReduce template struct vec; @@ -96,33 +96,34 @@ struct vec { } }; -struct best_margin_label : cub::KeyValuePair { - __host__ __device__ best_margin_label(cub::KeyValuePair pair) - : cub::KeyValuePair(pair) +template +struct best_margin_label : cub::KeyValuePair { + __host__ __device__ best_margin_label(cub::KeyValuePair pair) + : cub::KeyValuePair(pair) { } - __host__ __device__ best_margin_label(int c = 0, float f = -INFINITY) - : cub::KeyValuePair({c, f}) + __host__ __device__ best_margin_label(int c = 0, F f = -INFINITY) + : cub::KeyValuePair({c, f}) { } }; -template -__device__ __forceinline__ vec to_vec(int c, vec margin) +template +__device__ __forceinline__ vec> to_vec(int c, vec margin) { - vec ret; + vec> ret; CUDA_PRAGMA_UNROLL for (int i = 0; i < NITEMS; ++i) - ret[i] = best_margin_label(c, margin[i]); + ret[i] = best_margin_label(c, margin[i]); return ret; } struct ArgMax { - template - __host__ __device__ __forceinline__ vec operator()( - vec a, vec b) const + template + __host__ __device__ __forceinline__ vec> operator()( + vec> a, vec> b) const { - vec c; + vec> c; CUDA_PRAGMA_UNROLL for (int i = 0; i < NITEMS; i++) c[i] = cub::ArgMax()(a[i], b[i]); @@ -155,7 +156,7 @@ __device__ __forceinline__ vec tree_leaf_output(tree_type t template __device__ __forceinline__ vec infer_one_tree(tree_type tree, - const float* input, + const typename tree_type::F* input, int cols, int n_rows) { @@ -186,7 +187,7 @@ __device__ __forceinline__ vec infer_one_tree(tree_type tre template __device__ __forceinline__ vec<1, output_type> infer_one_tree(tree_type tree, - const float* input, + const typename tree_type::F* input, int cols, int rows) { @@ -219,19 +220,19 @@ host code. See https://rapids.ai/start.html as well as cmake defaults. */ // values below are defaults as of this change. 
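// A minimal sketch (illustration only, not part of the patch; the name
// example_scratch_bytes is hypothetical) of what the two helpers below
// measure: the cub::BlockReduce TempStorage needed when each thread
// contributes one vec<NITEMS, F>. Doubling sizeof(F) roughly doubles this
// footprint, which is why F joins NITEMS as a template parameter.
//
//   template <int NITEMS, typename F>
//   size_t example_scratch_bytes()
//   {
//     using reduce_t = cub::
//       BlockReduce<vec<NITEMS, F>, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>;
//     return sizeof(typename reduce_t::TempStorage);  // compare F = float vs F = double
//   }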
-template +template size_t block_reduce_footprint_host() { return sizeof( typename cub:: - BlockReduce, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>:: + BlockReduce, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>:: TempStorage); } -template +template size_t block_reduce_best_class_footprint_host() { - return sizeof(typename cub::BlockReduce, + return sizeof(typename cub::BlockReduce>, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, @@ -249,9 +250,10 @@ __device__ __forceinline__ T block_reduce(T value, BinaryOp op, void* storage) } template // = FLOAT_UNARY_BINARY struct tree_aggregator_t { - vec acc; + vec acc; void* tmp_storage; /** shared memory footprint of the accumulator during @@ -263,7 +265,7 @@ struct tree_aggregator_t { int log2_threads_per_tree, bool predict_proba) { - return log2_threads_per_tree != 0 ? FIL_TPB * NITEMS * sizeof(float) + return log2_threads_per_tree != 0 ? FIL_TPB * NITEMS * sizeof(F) : block_reduce_footprint_host(); } @@ -278,19 +280,19 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) + F* vector_leaf) : tmp_storage(finalize_workspace) { } - __device__ __forceinline__ void accumulate(vec single_tree_prediction, + __device__ __forceinline__ void accumulate(vec single_tree_prediction, int tree, int thread_num_rows) { acc += single_tree_prediction; } - __device__ INLINE_CONFIG void finalize(float* block_out, + __device__ INLINE_CONFIG void finalize(F* block_out, int block_num_rows, int output_stride, output_t transform, @@ -303,7 +305,7 @@ struct tree_aggregator_t { if (log2_threads_per_tree == 0) { acc = block_reduce(acc, vectorized(cub::Sum()), tmp_storage); } else { - auto per_thread = (vec*)tmp_storage; + auto per_thread = (vec*)tmp_storage; per_thread[threadIdx.x] = acc; __syncthreads(); // We have two pertinent cases for splitting FIL_TPB == 256 values: @@ -349,13 +351,13 @@ __device__ __forceinline__ auto allreduce_shmem(Iterator begin, // *begin and *end shall be struct vec // tmp_storage may overlap shared memory addressed by [begin, end) -template +template __device__ __forceinline__ void write_best_class( - Iterator begin, Iterator end, void* tmp_storage, float* out, int num_rows) + Iterator begin, Iterator end, void* tmp_storage, F* out, int num_rows) { // reduce per-class candidate margins to one best class candidate // per thread (for each of the NITEMS rows) - auto best = vecNITEMS, best_margin_label>(); + auto best = vecNITEMS, best_margin_label>(); for (int c = threadIdx.x; c < end - begin; c += blockDim.x) best = vectorized(cub::ArgMax())(best, to_vec(c, begin[c])); // [begin, end) may overlap tmp_storage @@ -370,6 +372,7 @@ __device__ __forceinline__ void write_best_class( } /// needed for softmax +__device__ double shifted_exp(double margin, double max) { return exp(margin - max); } __device__ float shifted_exp(float margin, float max) { return expf(margin - max); } // *begin and *end shall be struct vec @@ -391,13 +394,13 @@ __device__ __forceinline__ void block_softmax(Iterator begin, Iterator end, void // *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory addressed by [begin, end) -template +template __device__ __forceinline__ void normalize_softmax_and_write(Iterator begin, Iterator end, output_t transform, int trees_per_class, void* tmp_storage, - float* out, + F* out, int num_rows) { if ((transform & output_t::AVG) != 0) { @@ -416,13 +419,13 @@ __device__ 
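/* Numerical-stability note on block_softmax above: shifted_exp subtracts
   the block-wide maximum before exponentiating, i.e. it computes
   softmax(x)_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j),
   which cannot overflow for large margins yet yields the same result. */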
__forceinline__ void normalize_softmax_and_write(Iterator begin, // *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory addressed by [begin, end) // in case num_outputs > 1 -template +template __device__ __forceinline__ void class_margins_to_global_memory(Iterator begin, Iterator end, output_t transform, int trees_per_class, void* tmp_storage, - float* out, + F* out, int num_rows, int num_outputs) { @@ -435,11 +438,11 @@ __device__ __forceinline__ void class_margins_to_global_memory(Iterator begin, } } -template -struct tree_aggregator_t { - vec acc; +template +struct tree_aggregator_t { + vec acc; int num_classes; - vec* per_thread; + vec* per_thread; void* tmp_storage; static size_t smem_finalize_footprint(size_t data_row_size, @@ -447,7 +450,7 @@ struct tree_aggregator_t { int log2_threads_per_tree, bool predict_proba) { - size_t phase1 = (FIL_TPB - FIL_TPB % num_classes) * sizeof(vec); + size_t phase1 = (FIL_TPB - FIL_TPB % num_classes) * sizeof(vec); size_t phase2 = predict_proba ? block_reduce_footprint_host() : block_reduce_best_class_footprint_host(); return predict_proba ? phase1 + phase2 : std::max(phase1, phase2); @@ -458,21 +461,21 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) + F* vector_leaf) : num_classes(params.num_classes), - per_thread((vec*)finalize_workspace), + per_thread((vec*)finalize_workspace), tmp_storage(params.predict_proba ? per_thread + num_classes : finalize_workspace) { } - __device__ __forceinline__ void accumulate(vec single_tree_prediction, + __device__ __forceinline__ void accumulate(vec single_tree_prediction, int tree, int thread_num_rows) { acc += single_tree_prediction; } - __device__ INLINE_CONFIG void finalize(float* out, + __device__ INLINE_CONFIG void finalize(F* out, int num_rows, int num_outputs, output_t transform, @@ -498,11 +501,11 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { - vec acc; +template +struct tree_aggregator_t { + vec acc; /// at first, per class margin, then, possibly, different softmax partials - vec* per_class_margin; + vec* per_class_margin; void* tmp_storage; int num_classes; @@ -519,23 +522,23 @@ struct tree_aggregator_t { static __host__ __device__ size_t smem_accumulate_footprint(int num_classes) { - return num_classes * sizeof(vec); + return num_classes * sizeof(vec); } __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) - : per_class_margin((vec*)accumulate_workspace), + F* vector_leaf) + : per_class_margin((vec*)accumulate_workspace), tmp_storage(params.predict_proba ? 
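/* aliasing note for the ternary on the next line: when predict_proba is
   set, the softmax pass still needs per_class_margin after the reduction,
   so tmp_storage is placed just past it; otherwise the two regions may
   share shared memory, matching smem_finalize_footprint above */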
per_class_margin + num_classes : finalize_workspace), num_classes(params.num_classes) { for (int c = threadIdx.x; c < num_classes; c += blockDim.x) - per_class_margin[c] = vec(0); + per_class_margin[c] = vec(0); // __syncthreads() is called in infer_k } - __device__ __forceinline__ void accumulate(vec single_tree_prediction, + __device__ __forceinline__ void accumulate(vec single_tree_prediction, int tree, int thread_num_rows) { @@ -544,7 +547,7 @@ struct tree_aggregator_t { __syncthreads(); } - __device__ INLINE_CONFIG void finalize(float* out, + __device__ INLINE_CONFIG void finalize(F* out, int num_rows, int num_outputs, output_t transform, @@ -562,17 +565,17 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { +template +struct tree_aggregator_t { // per_class_margin is a row-major matrix // of size num_threads_per_class * num_classes // used to acccumulate class values - vec* per_class_margin; + vec* per_class_margin; vec* vector_leaf_indices; int* thread_num_rows; int num_classes; int num_threads_per_class; - float* vector_leaf; + F* vector_leaf; void* tmp_storage; static size_t smem_finalize_footprint(size_t data_row_size, @@ -587,14 +590,14 @@ struct tree_aggregator_t { } static size_t smem_accumulate_footprint(int num_classes) { - return sizeof(vec) * num_classes * max(1, FIL_TPB / num_classes) + + return sizeof(vec) * num_classes * max(1, FIL_TPB / num_classes) + sizeof(vec) * FIL_TPB + sizeof(int) * FIL_TPB; } __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) + F* vector_leaf) : num_classes(params.num_classes), num_threads_per_class(max(1, blockDim.x / params.num_classes)), vector_leaf(vector_leaf), @@ -602,15 +605,15 @@ struct tree_aggregator_t { { // Assign workspace char* ptr = (char*)accumulate_workspace; - per_class_margin = (vec*)ptr; - ptr += sizeof(vec) * num_classes * num_threads_per_class; + per_class_margin = (vec*)ptr; + ptr += sizeof(vec) * num_classes * num_threads_per_class; vector_leaf_indices = (vec*)ptr; ptr += sizeof(vec) * blockDim.x; thread_num_rows = (int*)ptr; // Initialise shared memory for (int i = threadIdx.x; i < num_classes * num_threads_per_class; i += blockDim.x) { - per_class_margin[i] = vec(); + per_class_margin[i] = vec(); } vector_leaf_indices[threadIdx.x] = vec(); thread_num_rows[threadIdx.x] = 0; @@ -637,13 +640,13 @@ struct tree_aggregator_t { // we have num_classes threads for each j for (int j = i / num_classes; j < blockDim.x; j += num_threads_per_class) { for (int item = 0; item < thread_num_rows[j]; ++item) { - float pred = vector_leaf[vector_leaf_indices[j][item] * num_classes + c]; + F pred = vector_leaf[vector_leaf_indices[j][item] * num_classes + c]; per_class_margin[i][item] += pred; } } } } - __device__ INLINE_CONFIG void finalize(float* out, + __device__ INLINE_CONFIG void finalize(F* out, int num_rows, int num_outputs, output_t transform, @@ -668,8 +671,8 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { +template +struct tree_aggregator_t { // could switch to uint16_t to save shared memory // provided raft::myAtomicAdd(short*) simulated with appropriate shifts int* votes; @@ -691,7 +694,7 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) + F* vector_leaf) : num_classes(params.num_classes), votes((int*)accumulate_workspace) { for (int c = threadIdx.x; c < num_classes; c += 
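/* votes layout (as consumed by finalize_class_label below): one integer
   counter per (class, row-in-block) pair; each tree increments the counter
   of its predicted class label, and the class with the most votes wins */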
FIL_TPB * NITEMS) @@ -712,7 +715,7 @@ struct tree_aggregator_t { } // class probabilities or regression. for regression, num_classes // is just the number of outputs for each data instance - __device__ __forceinline__ void finalize_multiple_outputs(float* out, int num_rows) + __device__ __forceinline__ void finalize_multiple_outputs(F* out, int num_rows) { __syncthreads(); for (int c = threadIdx.x; c < num_classes; c += blockDim.x) { @@ -723,7 +726,7 @@ struct tree_aggregator_t { } // using this when predicting a single class label, as opposed to sparse class vector // or class probabilities or regression - __device__ __forceinline__ void finalize_class_label(float* out, int num_rows) + __device__ __forceinline__ void finalize_class_label(F* out, int num_rows) { __syncthreads(); // make sure all votes[] are final int item = threadIdx.x; @@ -740,7 +743,7 @@ struct tree_aggregator_t { out[row] = best_class; } } - __device__ INLINE_CONFIG void finalize(float* out, + __device__ INLINE_CONFIG void finalize(F* out, int num_rows, int num_outputs, output_t transform, @@ -756,8 +759,9 @@ struct tree_aggregator_t { } }; -__device__ INLINE_CONFIG void load_data(float* sdata, - const float* block_input, +template +__device__ INLINE_CONFIG void load_data(F* sdata, + const F* block_input, predict_params params, int rows_per_block, int block_num_rows) @@ -784,6 +788,7 @@ __device__ INLINE_CONFIG void load_data(float* sdata, } template (params.data) + block_row0 * num_cols; + const F* block_input = reinterpret_cast(params.data) + block_row0 * num_cols; if constexpr (cols_in_shmem) load_data(sdata, block_input, params, rows_per_block, block_num_rows); - tree_aggregator_t acc( + tree_aggregator_t acc( params, (char*)sdata + params.cols_shmem_size(), sdata, forest.vector_leaf_); __syncthreads(); // for both row cache init and acc init @@ -820,7 +825,7 @@ __global__ void infer_k(storage_type forest, predict_params params) and is made exact below. Same with thread_num_rows > 0 */ - typedef typename leaf_output_t::T pred_t; + typedef typename leaf_output_t::T pred_t; vec prediction; if (tree < forest.num_trees() && thread_num_rows != 0) { prediction = infer_one_tree( @@ -833,7 +838,7 @@ __global__ void infer_k(storage_type forest, predict_params params) // Dummy threads can be marked as having 0 rows acc.accumulate(prediction, tree, tree < forest.num_trees() ? 
thread_num_rows : 0); } - acc.finalize(reinterpret_cast(params.preds) + params.num_outputs * block_row0, + acc.finalize(reinterpret_cast(params.preds) + params.num_outputs * block_row0, block_num_rows, params.num_outputs, params.transform, @@ -843,13 +848,13 @@ __global__ void infer_k(storage_type forest, predict_params params) } } -template +template size_t shmem_size_params::get_smem_footprint() { - size_t finalize_footprint = tree_aggregator_t::smem_finalize_footprint( + size_t finalize_footprint = tree_aggregator_t::smem_finalize_footprint( cols_shmem_size(), num_classes, log2_threads_per_tree, predict_proba); size_t accumulate_footprint = - tree_aggregator_t::smem_accumulate_footprint(num_classes) + + tree_aggregator_t::smem_accumulate_footprint(num_classes) + cols_shmem_size(); return std::max(accumulate_footprint, finalize_footprint); } @@ -893,15 +898,17 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } -template void infer>>(storage> forest, - predict_params params, - cudaStream_t stream); -template void infer>>(storage> forest, - predict_params params, - cudaStream_t stream); -template void infer>(storage forest, - predict_params params, - cudaStream_t stream); +struct instantiate_infer { + template + void operator()(fil_node_t) + { + infer>(storage forest, + predict_params params, + cudaStream_t stream); + } +}; + +template void instantiate_for_all_node_types(instantiate_infer); } // namespace fil } // namespace ML diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 0295cde8d1..1d1c3a516a 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -394,7 +394,7 @@ struct categorical_sets { // set count is due to tree_idx + node_within_tree_idx are both ints, hence uint32_t result template - __host__ __device__ __forceinline__ int category_matches(node_t node, float category) const + __host__ __device__ __forceinline__ int category_matches(node_t node, typename node_t::F category) const { // standard boolean packing. This layout has better ILP // node.set() is global across feature IDs and is an offset (as opposed @@ -410,7 +410,7 @@ struct categorical_sets { FIL will reject a model where an integer within [0, fid_num_cats] cannot be represented precisely as a 32-bit float. 
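For instance, 2^24 + 1 = 16,777,217 is the smallest positive integer that float32 cannot represent exactly, so category counts must stay below that scale.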
*/ - return category < fid_num_cats[node.fid()] && category >= 0.0f && + return static_cast<float>(category) < fid_num_cats[node.fid()] && category >= 0.0f && fetch_bit(bits + node.set(), static_cast<uint32_t>(static_cast<int>(category))); } static int sizeof_mask_from_num_cats(int num_cats) @@ -561,11 +561,11 @@ struct cat_sets_device_owner { * @param params pointer to parameters used to initialize the forest * @param vector_leaf optional vector leaves */ -template <typename fil_node_t> +template <typename fil_node_t, typename F> void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector<float>& vector_leaf, + const std::vector<F>& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params);
diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 56536802f5..12e03fd102 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -655,6 +655,11 @@ void convert(const raft::handle_t& handle, tl2fil.init_forest(handle, pforest); } +template <typename real_t> constexpr bool type_supported() { + // not using std::is_floating_point because we did not instantiate fp16-based nodes/trees/forests + return std::is_same<real_t, float>() || std::is_same<real_t, double>(); +} + template <typename threshold_t, typename leaf_t> void from_treelite(const raft::handle_t& handle, forest_t* pforest, @@ -662,9 +667,9 @@ void from_treelite(const raft::handle_t& handle, const treelite_params_t* tl_params) { // Invariants on threshold and leaf types - static_assert(std::is_same<threshold_t, float>::value || std::is_same<threshold_t, double>::value, + static_assert(type_supported<threshold_t>(), "Model must contain float32 or float64 thresholds for splits"); - ASSERT((std::is_same<leaf_t, float>::value || std::is_same<leaf_t, double>::value), + ASSERT(type_supported<leaf_t>(), "Models with integer leaf output are not yet supported"); // Display appropriate warnings when float64 values are being casted into // float32, as FIL only supports inferencing with float32 for the time being @@ -674,6 +679,8 @@ void from_treelite(const raft::handle_t& handle, "doesn't support inferencing models with float64 values. " "This may lead to predictions with reduced accuracy."); } + // same as std::common_type: float+double=double, float+int64_t=float + typedef decltype(threshold_t{} + leaf_t{}) F; storage_type_t storage_type = tl_params->storage_type; // build dense trees by default
From f6efb88f38a2f56c03f3c9af8154d629b3cf67df Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Tue, 15 Feb 2022 21:46:41 -0800 Subject: [PATCH 05/45] fixed extra/missing instantiations --- cpp/src/fil/common.cuh | 10 +++++---- cpp/src/fil/fil.cu | 26 ++++++++++++---------- cpp/src/fil/internal.cuh | 4 ++-- cpp/test/sg/fil_child_index_test.cu | 19 +++++++++------- cpp/test/sg/fil_test.cu | 34 ++++++++++++++--------------- 5 files changed, 51 insertions(+), 42 deletions(-)
diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index d5ec24b2af..6c89e1e01f 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -61,8 +61,9 @@ struct tree<dense_node<F>> : tree_base { }; /** partial specialization of storage.
Stores the forest on GPU as a collection of dense nodes */ -template -struct storage> : storage_base { +template +struct storage> : storage_base { + using F = F_; using node_t = dense_node; __host__ __device__ storage(categorical_sets cat_sets, F* vector_leaf, @@ -101,8 +102,9 @@ struct tree : tree_base { }; /** storage stores the forest on GPU as a collection of sparse nodes */ -template -struct storage : storage_base { +template +struct storage : storage_base { + using node_t = node_t_; using F = typename node_t::F; int* trees_ = nullptr; node_t* nodes_ = nullptr; diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index e534d16733..152198fb62 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -342,15 +342,17 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - auto kernel = infer_k; - // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because - // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick. - RAFT_CUDA_TRY( - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm)); + if constexpr (std::is_same()) { + auto kernel = infer_k; + // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because + // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick. + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm)); + } } }; @@ -428,7 +430,8 @@ struct dense_forest> : forest { num_trees_, algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, algo_ == algo_t::NAIVE ? 1 : num_trees_); - fil::infer(forest, params, stream); + if constexpr (std::is_same()) // to remove in next PR + fil::infer(forest, params, stream); } virtual void free(const raft::handle_t& h) override @@ -482,7 +485,8 @@ struct sparse_forest : forest { trees_.data(), nodes_.data(), num_trees_); - fil::infer(forest, params, stream); + if constexpr (std::is_same()) // to remove in next PR + fil::infer(forest, params, stream); } void free(const raft::handle_t& h) override diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 0295cde8d1..b382ade1a3 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -561,11 +561,11 @@ struct cat_sets_device_owner { * @param params pointer to parameters used to initialize the forest * @param vector_leaf optional vector leaves */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params); diff --git a/cpp/test/sg/fil_child_index_test.cu b/cpp/test/sg/fil_child_index_test.cu index cdc87603be..8b1af7d1b2 100644 --- a/cpp/test/sg/fil_child_index_test.cu +++ b/cpp/test/sg/fil_child_index_test.cu @@ -40,24 +40,27 @@ struct proto_inner_node { int set = 0; // which bit set represents the matching category list float thresh = 0.0f; // threshold, see base_node::thresh int left = 1; // left child idx, see sparse_node*::left_index() - val_t split() + val_t split() { - val_t split; + val_t split; if (is_categorical) split.idx = set; else split.f = thresh; return split; } - operator sparse_node16() + operator sparse_node16() { - return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); + return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); } operator sparse_node8() { return 
sparse_node8({}, split(), fid, def_left, false, is_categorical, left); } - operator dense_node() { return dense_node({}, split(), fid, def_left, false, is_categorical); } + operator dense_node() + { + return dense_node({}, split(), fid, def_left, false, is_categorical); + } }; std::ostream& operator<<(std::ostream& os, const proto_inner_node& node) @@ -138,7 +141,7 @@ class ChildIndexTest : public testing::TestWithParam { { ChildIndexTestParams param = GetParam(); tree_base tree{param.cso.accessor()}; - if (!std::is_same::value) { + if (!std::is_same>::value) { // test that the logic uses node.left instead of parent_node_idx param.node.left = param.parent_node_idx * 2 + 1; param.parent_node_idx = INT_MIN; @@ -153,8 +156,8 @@ class ChildIndexTest : public testing::TestWithParam { } }; -typedef ChildIndexTest ChildIndexTestDense; -typedef ChildIndexTest ChildIndexTestSparse16; +typedef ChildIndexTest> ChildIndexTestDense; +typedef ChildIndexTest> ChildIndexTestSparse16; typedef ChildIndexTest ChildIndexTestSparse8; /* for dense nodes, left (false) == parent * 2 + 1, right (true) == parent * 2 + 2 diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 9627e79e6a..74ab0c4d00 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -378,12 +378,12 @@ class BaseFilTest : public testing::TestWithParam { // initialize nodes nodes.resize(num_nodes); for (size_t i = 0; i < num_nodes; ++i) { - fil::val_t w; + fil::val_t w; switch (ps.leaf_algo) { case fil::leaf_algo_t::CATEGORICAL_LEAF: w.idx = weights_h[i]; break; case fil::leaf_algo_t::FLOAT_UNARY_BINARY: case fil::leaf_algo_t::GROVE_PER_CLASS: - // not relying on fil::val_t internals + // not relying on fil::val_t internals // merely that we copied floats into weights_h earlier std::memcpy(&w.f, &weights_h[i], sizeof w.f); break; @@ -392,13 +392,13 @@ class BaseFilTest : public testing::TestWithParam { } // make sure nodes are categorical only when their feature ID is categorical bool is_categorical = is_categoricals_h[i] == 1.0f; - val_t split; + val_t split; if (is_categorical) split.idx = node_cat_set[i]; else split.f = thresholds_h[i]; nodes[i] = - fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical); + fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical); } // clean up @@ -591,13 +591,13 @@ class BaseFilTest : public testing::TestWithParam { stream)); } - fil::val_t infer_one_tree(fil::dense_node* root, float* data, const tree_base& tree) + fil::val_t infer_one_tree(fil::dense_node* root, float* data, const tree_base& tree) { int curr = 0; - fil::val_t output{.f = 0.0f}; + fil::val_t output{.f = 0.0f}; for (;;) { - const fil::dense_node& node = root[curr]; - if (node.is_leaf()) return node.template output(); + const fil::dense_node& node = root[curr]; + if (node.is_leaf()) return node.template output>(); float val = data[node.fid()]; curr = tree.child_index(node, curr, val); } @@ -625,7 +625,7 @@ class BaseFilTest : public testing::TestWithParam { std::vector want_proba_h; // forest data - std::vector nodes; + std::vector> nodes; std::vector vector_leaf; cat_sets_owner cat_sets_h; rmm::device_uvector fids_d = rmm::device_uvector(0, cudaStream_t()); @@ -635,16 +635,16 @@ class BaseFilTest : public testing::TestWithParam { template class BasePredictFilTest : public BaseFilTest { protected: - void dense2sparse_node(const fil::dense_node* dense_root, + void dense2sparse_node(const fil::dense_node* dense_root, int i_dense, int i_sparse_root, int i_sparse) { - 
const fil::dense_node& node = dense_root[i_dense]; + const fil::dense_node& node = dense_root[i_dense]; if (node.is_leaf()) { // leaf sparse node - sparse_nodes[i_sparse] = - fil_node_t(node.output(), {}, node.fid(), node.def_left(), node.is_leaf(), false, 0); + sparse_nodes[i_sparse] = fil_node_t( + node.output>(), {}, node.fid(), node.def_left(), node.is_leaf(), false, 0); return; } // inner sparse node @@ -663,7 +663,7 @@ class BasePredictFilTest : public BaseFilTest { dense2sparse_node(dense_root, 2 * i_dense + 2, i_sparse_root, left_index + 1); } - void dense2sparse_tree(const fil::dense_node* dense_root) + void dense2sparse_tree(const fil::dense_node* dense_root) { int i_sparse_root = sparse_nodes.size(); sparse_nodes.push_back(fil_node_t()); @@ -719,8 +719,8 @@ class BasePredictFilTest : public BaseFilTest { std::vector trees; }; -typedef BasePredictFilTest PredictDenseFilTest; -typedef BasePredictFilTest PredictSparse16FilTest; +typedef BasePredictFilTest> PredictDenseFilTest; +typedef BasePredictFilTest> PredictSparse16FilTest; typedef BasePredictFilTest PredictSparse8FilTest; class TreeliteFilTest : public BaseFilTest { @@ -732,7 +732,7 @@ class TreeliteFilTest : public BaseFilTest { { int key = (*pkey)++; builder->CreateNode(key); - const fil::dense_node& dense_node = nodes[node]; + const fil::dense_node& dense_node = nodes[node]; std::vector left_categories; if (dense_node.is_leaf()) { switch (ps.leaf_algo) { From 1bebcca53609cfab7421730e7f22047c65ce47d0 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Wed, 16 Feb 2022 11:14:53 -0800 Subject: [PATCH 06/45] style --- cpp/src/fil/fil.cu | 14 +++++++------- cpp/src/fil/infer.cu | 24 +++++++++--------------- cpp/src/fil/internal.cuh | 3 ++- cpp/src/fil/treelite_import.cu | 7 ++++--- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index ff50bd37ca..1a738b70a7 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -42,7 +42,7 @@ __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } averaging (multiplying by inv_num_trees), adding global_bias (always done), sigmoid and applying threshold. in case of complement_proba, fills in the complement probability */ -template +template __global__ void transform_k(F* preds, size_t n, output_t output, @@ -174,7 +174,7 @@ struct forest { virtual void infer(predict_params params, cudaStream_t stream) = 0; - template + template void predict( const raft::handle_t& h, F* preds, const F* data, size_t num_rows, bool predict_proba) { @@ -255,7 +255,7 @@ struct forest { // Simulating treelite order, which cancels out bias. // If non-proba prediction used, it still will not matter // for the same reason softmax will not. - F global_bias = (ot & output_t::SOFTMAX) != 0 ? F(0.0) : global_bias_; + F global_bias = (ot & output_t::SOFTMAX) != 0 ? 
F(0.0) : global_bias_; bool complement_proba = false, do_transform; if (predict_proba) { @@ -323,10 +323,10 @@ struct forest { virtual ~forest() {} - int num_trees_ = 0; - int depth_ = 0; - algo_t algo_ = algo_t::NAIVE; - output_t output_ = output_t::RAW; + int num_trees_ = 0; + int depth_ = 0; + algo_t algo_ = algo_t::NAIVE; + output_t output_ = output_t::RAW; double threshold_ = 0.5; double global_bias_ = 0; shmem_size_params class_ssp_, proba_ssp_;
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 1d9aaccb80..8930834fbb 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -96,7 +96,7 @@ struct vec { } }; -template<typename F> +template <typename F> struct best_margin_label : cub::KeyValuePair<int, F> { __host__ __device__ best_margin_label(cub::KeyValuePair<int, F> pair) : cub::KeyValuePair<int, F>(pair) @@ -155,10 +155,8 @@ __device__ __forceinline__ vec<NITEMS, output_type> tree_leaf_output(tree_type t } template <int NITEMS, typename output_type, typename tree_type> -__device__ __forceinline__ vec<NITEMS, output_type> infer_one_tree(tree_type tree, - const typename tree_type::F* input, - int cols, - int n_rows) +__device__ __forceinline__ vec<NITEMS, output_type> infer_one_tree( + tree_type tree, const typename tree_type::F* input, int cols, int n_rows) { // find the leaf nodes for each row int curr[NITEMS]; @@ -759,12 +757,9 @@ struct tree_aggregator_t<F, NITEMS, GROVE_PER_CLASS_MANY_CLASSES> { } }; -template <typename F> -__device__ INLINE_CONFIG void load_data(F* sdata, - const F* block_input, - predict_params params, - int rows_per_block, - int block_num_rows) +template <typename F> +__device__ INLINE_CONFIG void load_data( + F* sdata, const F* block_input, predict_params params, int rows_per_block, int block_num_rows) { int num_cols = params.num_cols; int sdata_stride = params.sdata_stride(); @@ -796,7 +791,7 @@ template <typename fil_node_t> void operator()(fil_node_t) { - infer<storage<fil_node_t>>(storage<fil_node_t>{}, - predict_params{}, - cudaStream_t{}); + infer<storage<fil_node_t>>(storage<fil_node_t>{}, predict_params{}, cudaStream_t{}); } };
diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 1d1c3a516a..16c450899b 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -394,7 +394,8 @@ struct categorical_sets { // set count is due to tree_idx + node_within_tree_idx are both ints, hence uint32_t result template <typename node_t> - __host__ __device__ __forceinline__ int category_matches(node_t node, typename node_t::F category) const + __host__ __device__ __forceinline__ int category_matches(node_t node, + typename node_t::F category) const { // standard boolean packing. This layout has better ILP // node.set() is global across feature IDs and is an offset (as opposed
diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 12e03fd102..ae941452e6 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -655,7 +655,9 @@ void convert(const raft::handle_t& handle, tl2fil.init_forest(handle, pforest); } -template <typename real_t> constexpr bool type_supported() { +template <typename real_t> +constexpr bool type_supported() +{ // not using std::is_floating_point because we did not instantiate fp16-based nodes/trees/forests return std::is_same<real_t, float>() || std::is_same<real_t, double>(); } @@ -669,8 +671,7 @@ void from_treelite(const raft::handle_t& handle, // Invariants on threshold and leaf types static_assert(type_supported<threshold_t>(), "Model must contain float32 or float64 thresholds for splits"); - ASSERT(type_supported<leaf_t>(), - "Models with integer leaf output are not yet supported"); + ASSERT(type_supported<leaf_t>(), "Models with integer leaf output are not yet supported"); // Display appropriate warnings when float64 values are being casted into // float32, as FIL only supports inferencing with float32 for the time being if (std::is_same<threshold_t, double>::value || std::is_same<leaf_t, double>::value) {
From aab158b231fdc221addaf80bdf4e07ad80c8a7f2 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Wed, 16 Feb 2022 11:16:34 -0800 Subject: [PATCH 07/45] style --- cpp/test/sg/fil_child_index_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/test/sg/fil_child_index_test.cu b/cpp/test/sg/fil_child_index_test.cu index 8b1af7d1b2..75eca6cba2 100644 --- a/cpp/test/sg/fil_child_index_test.cu +++ b/cpp/test/sg/fil_child_index_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
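Aside on the type deduction used in from_treelite() in patch 04 above: the `typedef decltype(threshold_t{} + leaf_t{}) F;` line relies on the usual arithmetic conversions, which is why its comment likens it to std::common_type. Below is a minimal, self-contained sketch (illustrative only, not part of any patch in this series; the alias name inference_real_t is made up for the example) showing that any double operand widens the result to double, while a float threshold paired with an integer leaf type stays float:

#include <cstdint>
#include <type_traits>

// inference_real_t<T, L> is the type of T{} + L{} under the usual
// arithmetic conversions, mirroring the typedef in from_treelite()
template <typename threshold_t, typename leaf_t>
using inference_real_t = decltype(threshold_t{} + leaf_t{});

static_assert(std::is_same_v<inference_real_t<float, float>, float>);
static_assert(std::is_same_v<inference_real_t<float, double>, double>);
static_assert(std::is_same_v<inference_real_t<double, double>, double>);
// an integer leaf type does not widen a float threshold to double
static_assert(std::is_same_v<inference_real_t<float, std::int64_t>, float>);

int main() { return 0; }
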
From 1236c52bf9356cde5d23941261dea4daed7f869e Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 00:28:51 -0800 Subject: [PATCH 08/45] removed ML::fil::init templatization, added KeyValuePair templatization --- cpp/CMakeLists.txt | 2 -- cpp/src/fil/fil.cu | 20 ++++---------------- cpp/src/fil/infer.cu | 13 +++++++------ cpp/src/fil/internal.cuh | 19 ++++--------------- 4 files changed, 15 insertions(+), 39 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4cc7a593c3..273c0da76e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -267,8 +267,6 @@ if(BUILD_CUML_CPP_LIBRARY) src/explainer/kernel_shap.cu src/explainer/permutation_shap.cu src/explainer/tree_shap.cu - src/fil/fil.cu - src/fil/infer.cu src/glm/glm.cu src/genetic/genetic.cu src/genetic/program.cu diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 152198fb62..f4f13998b2 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -579,11 +579,11 @@ void check_params(const forest_params_t* params, bool dense) /** initializes a forest of any type * When fil_node_t == dense_node, const int* trees is ignored */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params) @@ -591,23 +591,11 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - f->init(h, cat_sets, vector_leaf, trees, nodes, params); + if constexpr (std::is_same()) // to remove in next PR + f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } -struct instantiate_forest_init { - template - void operator()(fil_node_t) - { - if constexpr (std::is_same()) - init({}, {}, {}, std::vector(), {}, {}, {}); - else - init({}, {}, {}, std::vector(), {}, {}, {}); - } -}; - -template void instantiate_for_all_node_types(instantiate_forest_init); - void free(const raft::handle_t& h, forest_t f) { f->free(h); diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 52d435b1e1..454734b135 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -41,7 +41,7 @@ namespace ML { namespace fil { -// vec wraps float[N] for cub::BlockReduce +// vec wraps float[N], int[N] or double[N] for cub::BlockReduce template struct vec; @@ -96,13 +96,14 @@ struct vec { } }; -struct best_margin_label : cub::KeyValuePair { - __host__ __device__ best_margin_label(cub::KeyValuePair pair) - : cub::KeyValuePair(pair) +template +struct best_margin_label : cub::KeyValuePair { + __host__ __device__ best_margin_label(cub::KeyValuePair pair) + : cub::KeyValuePair(pair) { } - __host__ __device__ best_margin_label(int c = 0, float f = -INFINITY) - : cub::KeyValuePair({c, f}) + __host__ __device__ best_margin_label(int c = 0, F f = -INFINITY) + : cub::KeyValuePair({c, f}) { } }; diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index b382ade1a3..48b87b81d8 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -101,7 +101,7 @@ union val_t { /** base_node contains common implementation details for dense and sparse nodes */ template -struct base_node { +struct alignas(std::is_same() ? 8 : 16) base_node { using F = F_; // floating-point type /** val, for parent nodes, is a threshold or category list offset. 
For leaf nodes, it is the tree prediction (see see leaf_output_t::T) */ @@ -152,7 +152,7 @@ struct base_node { /** dense_node is a single node of a dense forest */ template -struct alignas(8) dense_node : base_node { +struct dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite dense_node(val_t output, @@ -218,17 +218,6 @@ struct alignas(8) sparse_node8 : base_node { __host__ __device__ int left(int curr) const { return left_index(); } }; -/// pass a functor with a templated operator()(fil_node_t), i.e. accepting one node as parameter -template -void instantiate_for_all_node_types(Stuff stuff) -{ - stuff(dense_node{}); - stuff(dense_node{}); - stuff(sparse_node16{}); - stuff(sparse_node16{}); - stuff(sparse_node8{}); -} - template struct storage; @@ -561,11 +550,11 @@ struct cat_sets_device_owner { * @param params pointer to parameters used to initialize the forest * @param vector_leaf optional vector leaves */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params); From cc946cdaa2b6b5096603d976593a5adf68c5d708 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 00:29:37 -0800 Subject: [PATCH 09/45] style --- cpp/src/fil/infer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 454734b135..a05029390a 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __noinline__ +#define INLINE_CONFIG __forceinline__ namespace ML { namespace fil { From 91ecb205a532e19d50a01bfb4682e3e58c883a02 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 00:30:59 -0800 Subject: [PATCH 10/45] add the old instantiations back --- cpp/src/fil/fil.cu | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index f4f13998b2..1c991a5960 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -596,6 +596,31 @@ void init(const raft::handle_t& h, *pf = f; } +// explicit instantiations for init_sparse() +template void init(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node16* nodes, + const forest_params_t* params); + +template void init(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node8* nodes, + const forest_params_t* params); + +template void init(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); + void free(const raft::handle_t& h, forest_t f) { f->free(h); From 67f900e2aa76fbf93ccfa45807929c501aebf2eb Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 15:14:19 -0800 Subject: [PATCH 11/45] fixed --- cpp/src/fil/fil.cu | 8 ++++---- cpp/src/fil/infer.cu | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 1c991a5960..81b848f1ed 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -597,12 +597,12 @@ void init(const raft::handle_t& h, } // explicit instantiations for init_sparse() 
-template void init(const raft::handle_t& h, +template void init>(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, const std::vector& vector_leaf, const int* trees, - const sparse_node16* nodes, + const sparse_node16* nodes, const forest_params_t* params); template void init(const raft::handle_t& h, @@ -613,12 +613,12 @@ template void init(const raft::handle_t& h, const sparse_node8* nodes, const forest_params_t* params); -template void init(const raft::handle_t& h, +template void init>(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, const std::vector& vector_leaf, const int* trees, - const dense_node* nodes, + const dense_node* nodes, const forest_params_t* params); void free(const raft::handle_t& h, forest_t f) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index a05029390a..8b34ea0447 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -109,21 +109,21 @@ struct best_margin_label : cub::KeyValuePair { }; template -__device__ __forceinline__ vec to_vec(int c, vec margin) +__device__ __forceinline__ vec> to_vec(int c, vec margin) { - vec ret; + vec> ret; CUDA_PRAGMA_UNROLL for (int i = 0; i < NITEMS; ++i) - ret[i] = best_margin_label(c, margin[i]); + ret[i] = best_margin_label(c, margin[i]); return ret; } struct ArgMax { template - __host__ __device__ __forceinline__ vec operator()( - vec a, vec b) const + __host__ __device__ __forceinline__ vec> operator()( + vec> a, vec> b) const { - vec c; + vec> c; CUDA_PRAGMA_UNROLL for (int i = 0; i < NITEMS; i++) c[i] = cub::ArgMax()(a[i], b[i]); @@ -232,7 +232,7 @@ size_t block_reduce_footprint_host() template size_t block_reduce_best_class_footprint_host() { - return sizeof(typename cub::BlockReduce, + return sizeof(typename cub::BlockReduce>, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, @@ -356,7 +356,7 @@ __device__ __forceinline__ void write_best_class( { // reduce per-class candidate margins to one best class candidate // per thread (for each of the NITEMS rows) - auto best = vecNITEMS, best_margin_label>(); + auto best = vecNITEMS, best_margin_label>(); for (int c = threadIdx.x; c < end - begin; c += blockDim.x) best = vectorized(cub::ArgMax())(best, to_vec(c, begin[c])); // [begin, end) may overlap tmp_storage From c3c4deb0d2c774197ea9e72ed11a1cebcc6e379a Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 15:31:52 -0800 Subject: [PATCH 12/45] style --- cpp/src/fil/fil.cu | 24 ++++++++++++------------ cpp/src/fil/infer.cu | 3 ++- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 81b848f1ed..90e2aa18c3 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -598,12 +598,12 @@ void init(const raft::handle_t& h, // explicit instantiations for init_sparse() template void init>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node16* nodes, - const forest_params_t* params); + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node16* nodes, + const forest_params_t* params); template void init(const raft::handle_t& h, forest_t* pf, @@ -614,12 +614,12 @@ template void init(const raft::handle_t& h, const forest_params_t* params); template void init>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const dense_node* nodes, - const forest_params_t* params); 
+ forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); void free(const raft::handle_t& h, forest_t f) { diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 8b34ea0447..cb8fdaeaa0 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -109,7 +109,8 @@ struct best_margin_label : cub::KeyValuePair { }; template -__device__ __forceinline__ vec> to_vec(int c, vec margin) +__device__ __forceinline__ vec> to_vec(int c, + vec margin) { vec> ret; CUDA_PRAGMA_UNROLL From 68e7f652332dd7bafb05aef0753e24603b1b94f3 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Wed, 9 Mar 2022 19:40:37 +0100 Subject: [PATCH 13/45] base_node::output() now compiles. --- cpp/src/fil/internal.cuh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 48b87b81d8..1b1d0c5c5f 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -121,13 +121,17 @@ struct alignas(std::is_same() ? 8 : 16) base_node { template __host__ __device__ o_t output() const { - if constexpr (std::is_same()) { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v>, "invalid o_t type parameter in node.output()"); + if constexpr (std::is_same_v) { return val.idx; - } else if constexpr (std::is_same()) { + } else if constexpr (std::is_same_v) { return val.f; - } else { + } else if constexpr (std::is_same_v>) { return val; } + // control flow should not reach here + return o_t(); } __host__ __device__ int set() const { return val.idx; } __host__ __device__ F thresh() const { return val.f; } From b31069d5b94b451b0afc76f894c7a78fb2961e64 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 01:45:59 +0100 Subject: [PATCH 14/45] Fixed style. --- cpp/src/fil/internal.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 1b1d0c5c5f..9bc5f8a333 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -121,8 +121,9 @@ struct alignas(std::is_same() ? 8 : 16) base_node { template __host__ __device__ o_t output() const { - static_assert(std::is_same_v || std::is_same_v || - std::is_same_v>, "invalid o_t type parameter in node.output()"); + static_assert( + std::is_same_v || std::is_same_v || std::is_same_v>, + "invalid o_t type parameter in node.output()"); if constexpr (std::is_same_v) { return val.idx; } else if constexpr (std::is_same_v) { From c250653e1faefd4e5987b49799d8e952e06e2bab Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 02:35:23 +0100 Subject: [PATCH 15/45] F -> real_t. 
--- cpp/src/fil/common.cuh | 39 +++++++++-------- cpp/src/fil/fil.cu | 32 +++++++------- cpp/src/fil/infer.cu | 12 +++--- cpp/src/fil/internal.cuh | 90 +++++++++++++++++++++------------------- 4 files changed, 90 insertions(+), 83 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 6c89e1e01f..2696d9dc28 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -38,40 +38,40 @@ __host__ __device__ __forceinline__ int forest_num_nodes(int num_trees, int dept return num_trees * tree_num_nodes(depth); } -template +template struct storage_base { categorical_sets sets_; - F* vector_leaf_; + real_t* vector_leaf_; bool cats_present() const { return sets_.cats_present(); } }; /** represents a dense tree */ -template -struct tree> : tree_base { - __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) +template +struct tree> : tree_base { + __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { } - __host__ __device__ const dense_node& operator[](int i) const + __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; } - dense_node* nodes_ = nullptr; - int node_pitch_ = 0; + dense_node* nodes_ = nullptr; + int node_pitch_ = 0; }; /** partial specialization of storage. Stores the forest on GPU as a collection of dense nodes */ -template -struct storage> : storage_base { - using F = F_; - using node_t = dense_node; +template +struct storage> : storage_base { + using real_t = real_t_; + using node_t = dense_node; __host__ __device__ storage(categorical_sets cat_sets, - F* vector_leaf, + real_t* vector_leaf, node_t* nodes, int num_trees, int tree_stride, int node_pitch) - : storage_base{cat_sets, vector_leaf}, + : storage_base{cat_sets, vector_leaf}, nodes_(nodes), num_trees_(num_trees), tree_stride_(tree_stride), @@ -103,15 +103,18 @@ struct tree : tree_base { /** storage stores the forest on GPU as a collection of sparse nodes */ template -struct storage : storage_base { +struct storage : storage_base { using node_t = node_t_; - using F = typename node_t::F; + using real_t = typename node_t::real_t; int* trees_ = nullptr; node_t* nodes_ = nullptr; int num_trees_ = 0; __host__ __device__ - storage(categorical_sets cat_sets, F* vector_leaf, int* trees, node_t* nodes, int num_trees) - : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees) + storage(categorical_sets cat_sets, real_t* vector_leaf, int* trees, node_t* nodes, int num_trees) + : storage_base{cat_sets, vector_leaf}, + trees_(trees), + nodes_(nodes), + num_trees_(num_trees) { } __host__ __device__ int num_trees() const { return num_trees_; } diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 90e2aa18c3..2738afc9a6 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -128,10 +128,10 @@ struct forest { fixed_block_count_ = blocks_per_sm * sm_count; } - template + template void init_common(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const forest_params_t* params) { depth_ = params->depth; @@ -146,7 +146,7 @@ struct forest { proba_ssp_.num_cols = params->num_cols; proba_ssp_.num_classes = params->num_classes; proba_ssp_.cats_present = cat_sets.cats_present(); - proba_ssp_.sizeof_fp_vars = sizeof(F); + proba_ssp_.sizeof_fp_vars = sizeof(real_t); class_ssp_ = proba_ssp_; int device = h.get_device(); @@ -157,11 +157,11 
@@ struct forest { // vector leaf if (!vector_leaf.empty()) { - vector_leaf_.resize(vector_leaf.size() * sizeof(F), stream); + vector_leaf_.resize(vector_leaf.size() * sizeof(real_t), stream); RAFT_CUDA_TRY(cudaMemcpyAsync(vector_leaf_.data(), vector_leaf.data(), - vector_leaf.size() * sizeof(F), + vector_leaf.size() * sizeof(real_t), cudaMemcpyHostToDevice, stream)); } @@ -342,7 +342,7 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - if constexpr (std::is_same()) { + if constexpr (std::is_same()) { auto kernel = infer_k { } }; -template -struct dense_forest> : forest { - using node_t = dense_node; +template +struct dense_forest> : forest { + using node_t = dense_node; dense_forest(const raft::handle_t& h) : forest(h), nodes_(0, h.get_stream()) {} void transform_trees(const node_t* nodes) @@ -391,7 +391,7 @@ struct dense_forest> : forest { /// sparse_forest::init() void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const node_t* nodes, const forest_params_t* params) @@ -425,12 +425,12 @@ struct dense_forest> : forest { virtual void infer(predict_params params, cudaStream_t stream) override { storage forest(cat_sets_.accessor(), - reinterpret_cast(vector_leaf_.data()), + reinterpret_cast(vector_leaf_.data()), nodes_.data(), num_trees_, algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, algo_ == algo_t::NAIVE ? 1 : num_trees_); - if constexpr (std::is_same()) // to remove in next PR + if constexpr (std::is_same()) // to remove in next PR fil::infer(forest, params, stream); } @@ -453,7 +453,7 @@ struct sparse_forest : forest { void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const node_t* nodes, const forest_params_t* params) @@ -481,11 +481,11 @@ struct sparse_forest : forest { virtual void infer(predict_params params, cudaStream_t stream) override { storage forest(cat_sets_.accessor(), - reinterpret_cast(vector_leaf_.data()), + reinterpret_cast(vector_leaf_.data()), trees_.data(), nodes_.data(), num_trees_); - if constexpr (std::is_same()) // to remove in next PR + if constexpr (std::is_same()) // to remove in next PR fil::infer(forest, params, stream); } @@ -591,7 +591,7 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - if constexpr (std::is_same()) // to remove in next PR + if constexpr (std::is_same()) // to remove in next PR f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index cb8fdaeaa0..f03543ba11 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -96,14 +96,14 @@ struct vec { } }; -template -struct best_margin_label : cub::KeyValuePair { - __host__ __device__ best_margin_label(cub::KeyValuePair pair) - : cub::KeyValuePair(pair) +template +struct best_margin_label : cub::KeyValuePair { + __host__ __device__ best_margin_label(cub::KeyValuePair pair) + : cub::KeyValuePair(pair) { } - __host__ __device__ best_margin_label(int c = 0, F f = -INFINITY) - : cub::KeyValuePair({c, f}) + __host__ __device__ best_margin_label(int c = 0, real_t f = -INFINITY) + : cub::KeyValuePair({c, f}) { } }; diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 9bc5f8a333..8d240bc24a 100644 --- a/cpp/src/fil/internal.cuh 
+++ b/cpp/src/fil/internal.cuh @@ -90,22 +90,22 @@ enum output_t { }; /** val_t is the payload within a FIL leaf */ -template +template union val_t { /** floating-point threshold value for parent node or output value (e.g. class probability or regression summand) for leaf node */ - F f = NAN; + real_t f = NAN; /** class label, leaf vector index or categorical node set offset */ int idx; }; /** base_node contains common implementation details for dense and sparse nodes */ -template -struct alignas(std::is_same() ? 8 : 16) base_node { - using F = F_; // floating-point type +template +struct alignas(std::is_same() ? 8 : 16) base_node { + using real_t = real_t_; // floating-point type /** val, for parent nodes, is a threshold or category list offset. For leaf nodes, it is the tree prediction (see see leaf_output_t::T) */ - val_t val; + val_t val; /** bits encode various information about the node, with the exact nature of this information depending on the node type; it includes e.g. whether the node is a leaf or inner node, and for inner nodes, additional information, @@ -122,28 +122,32 @@ struct alignas(std::is_same() ? 8 : 16) base_node { __host__ __device__ o_t output() const { static_assert( - std::is_same_v || std::is_same_v || std::is_same_v>, + std::is_same_v || std::is_same_v || std::is_same_v>, "invalid o_t type parameter in node.output()"); if constexpr (std::is_same_v) { return val.idx; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { return val.f; - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { return val; } // control flow should not reach here return o_t(); } __host__ __device__ int set() const { return val.idx; } - __host__ __device__ F thresh() const { return val.f; } - __host__ __device__ val_t split() const { return val; } + __host__ __device__ real_t thresh() const { return val.f; } + __host__ __device__ val_t split() const { return val; } __host__ __device__ int fid() const { return bits & FID_MASK; } __host__ __device__ bool def_left() const { return bits & DEF_LEFT_MASK; } __host__ __device__ bool is_leaf() const { return bits & IS_LEAF_MASK; } __host__ __device__ bool is_categorical() const { return bits & IS_CATEGORICAL_MASK; } __host__ __device__ base_node() : val{}, bits(0) {} - base_node( - val_t output, val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical) + base_node(val_t output, + val_t split, + int fid, + bool def_left, + bool is_leaf, + bool is_categorical) { RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into base_node"); bits = (fid & FID_MASK) | (def_left ? DEF_LEFT_MASK : 0) | (is_leaf ? IS_LEAF_MASK : 0) | @@ -156,18 +160,18 @@ struct alignas(std::is_same() ? 
8 : 16) base_node { }; /** dense_node is a single node of a dense forest */ -template -struct dense_node : base_node { +template +struct dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite - dense_node(val_t output, - val_t split, + dense_node(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical, int left_index = -1) - : base_node(output, split, fid, def_left, is_leaf, is_categorical) + : base_node(output, split, fid, def_left, is_leaf, is_categorical) { } /** index of the left child, where curr is the index of the current node */ @@ -175,18 +179,18 @@ struct dense_node : base_node { }; /** sparse_node16 is a 16-byte node in a sparse forest */ -template -struct alignas(16) sparse_node16 : base_node { +template +struct alignas(16) sparse_node16 : base_node { int left_idx; __host__ __device__ sparse_node16() : left_idx(0) {} - sparse_node16(val_t output, - val_t split, + sparse_node16(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical, int left_index) - : base_node(output, split, fid, def_left, is_leaf, is_categorical), left_idx(left_index) + : base_node(output, split, fid, def_left, is_leaf, is_categorical), left_idx(left_index) { } __host__ __device__ int left_index() const { return left_idx; } @@ -233,20 +237,20 @@ struct sparse_forest; template struct node_traits { - using F = typename node_t::F; + using real_t = typename node_t::real_t; using storage = ML::fil::storage; using forest = sparse_forest; static const bool IS_DENSE = false; static const storage_type_t storage_type_enum = - std::is_same, node_t>() ? SPARSE : SPARSE8; + std::is_same, node_t>() ? SPARSE : SPARSE8; template static void check(const treelite::ModelImpl& model); }; -template -struct node_traits> { - using storage = storage>; - using forest = dense_forest>; +template +struct node_traits> { + using storage = storage>; + using forest = dense_forest>; static const bool IS_DENSE = true; static const storage_type_t storage_type_enum = DENSE; template @@ -294,27 +298,27 @@ enum leaf_algo_t { template struct tree; -template +template struct leaf_output_t { }; -template -struct leaf_output_t { - typedef F T; +template +struct leaf_output_t { + typedef real_t T; }; -template -struct leaf_output_t { +template +struct leaf_output_t { typedef int T; }; -template -struct leaf_output_t { - typedef F T; +template +struct leaf_output_t { + typedef real_t T; }; -template -struct leaf_output_t { - typedef F T; +template +struct leaf_output_t { + typedef real_t T; }; -template -struct leaf_output_t { +template +struct leaf_output_t { typedef int T; }; From 2f658b5ff09241ff0d13a6a06800dec7f6c4a374 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 20:27:34 +0100 Subject: [PATCH 16/45] Small fixes. 
--- cpp/src/fil/common.cuh | 8 ++++---- cpp/src/fil/fil.cu | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 2696d9dc28..99c0e7afe8 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -157,8 +157,8 @@ struct shmem_size_params { int block_dim_x = 0; /// shm_sz is the associated shared memory footprint int shm_sz = INT_MAX; - /// sizeof_fp_vars is the size in bytes of all floating-point variables during inference - std::size_t sizeof_fp_vars = 4; + /// sizeof_real is the size in bytes of all floating-point variables during inference + std::size_t sizeof_real = 4; __host__ __device__ int sdata_stride() { @@ -166,7 +166,7 @@ struct shmem_size_params { } __host__ __device__ int cols_shmem_size() { - return cols_in_shmem ? sizeof_fp_vars * sdata_stride() * n_items << log2_threads_per_tree : 0; + return cols_in_shmem ? sizeof_real * sdata_stride() * n_items << log2_threads_per_tree : 0; } template size_t get_smem_footprint(); @@ -180,7 +180,7 @@ struct predict_params : shmem_size_params { // number of outputs for the forest per each data row int num_outputs; - // Data parameters. + // Data parameters; preds and data are pointers to either float or double. void* preds; const void* data; // number of data rows (instances) to predict on diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 2738afc9a6..e5ef05eb33 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -30,6 +30,7 @@ creation and prediction (the main inference kernel is defined in infer.cu). */ #include // for expf #include // for size_t +#include // for uint8_t namespace ML { namespace fil { @@ -87,7 +88,7 @@ struct forest { we would have otherwise silently overflowed the index calculation due to short division. It would have failed cpp tests, but we might forget about this source of bugs, if not for the failing assert. */ - ASSERT(max_shm_ < int(proba_ssp_.sizeof_fp_vars) * std::numeric_limits::max(), + ASSERT(max_shm_ < int(proba_ssp_.sizeof_real) * std::numeric_limits::max(), "internal error: please use a larger type inside" " infer_k for column count"); } @@ -146,7 +147,7 @@ struct forest { proba_ssp_.num_cols = params->num_cols; proba_ssp_.num_classes = params->num_classes; proba_ssp_.cats_present = cat_sets.cats_present(); - proba_ssp_.sizeof_fp_vars = sizeof(real_t); + proba_ssp_.sizeof_real = sizeof(real_t); class_ssp_ = proba_ssp_; int device = h.get_device(); @@ -330,7 +331,7 @@ struct forest { int fixed_block_count_ = 0; int max_shm_ = 0; // Optionally used - rmm::device_uvector vector_leaf_; + rmm::device_uvector vector_leaf_; cat_sets_device_owner cat_sets_; }; From 6fb105b90922836547431fa2e2563f0c17ebf6cd Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 22:49:16 +0100 Subject: [PATCH 17/45] Updated alignment. 
--- cpp/src/fil/infer.cu | 2 +- cpp/src/fil/internal.cuh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index f03543ba11..25494ec10d 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __forceinline__ +#define INLINE_CONFIG __noinline__ namespace ML { namespace fil { diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 8d240bc24a..e90a5029bb 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -101,7 +101,7 @@ union val_t { /** base_node contains common implementation details for dense and sparse nodes */ template -struct alignas(std::is_same() ? 8 : 16) base_node { +struct alignas(2 * sizeof(real_t_)) base_node { using real_t = real_t_; // floating-point type /** val, for parent nodes, is a threshold or category list offset. For leaf nodes, it is the tree prediction (see see leaf_output_t::T) */ @@ -161,7 +161,7 @@ struct alignas(std::is_same() ? 8 : 16) base_node { /** dense_node is a single node of a dense forest */ template -struct dense_node : base_node { +struct alignas(2 * sizeof(real_t)) dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite dense_node(val_t output, From f1a10bee38f9d1caa95b1600845db9620912ea2c Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 23:03:01 +0100 Subject: [PATCH 18/45] static_assert(real_t == float) in a number of places. --- cpp/src/fil/fil.cu | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index e5ef05eb33..f1da7e79a4 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -343,17 +343,17 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - if constexpr (std::is_same()) { - auto kernel = infer_k; - // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because - // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick. - RAFT_CUDA_TRY( - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm)); - } + static_assert(std::is_same_v, + "real_t must be float; to be removed in the following pull requests"); + auto kernel = infer_k; + // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because + // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick. + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm)); } }; @@ -431,8 +431,9 @@ struct dense_forest> : forest { num_trees_, algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, algo_ == algo_t::NAIVE ? 
1 : num_trees_); - if constexpr (std::is_same()) // to remove in next PR - fil::infer(forest, params, stream); + static_assert(std::is_same_v, + "real_t must be float; to be removed in the following pull requests"); + fil::infer(forest, params, stream); } virtual void free(const raft::handle_t& h) override @@ -486,8 +487,9 @@ struct sparse_forest : forest { trees_.data(), nodes_.data(), num_trees_); - if constexpr (std::is_same()) // to remove in next PR - fil::infer(forest, params, stream); + static_assert(std::is_same_v, + "real_t must be float; to be removed in the following pull requests"); + fil::infer(forest, params, stream); } void free(const raft::handle_t& h) override @@ -592,8 +594,9 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - if constexpr (std::is_same()) // to remove in next PR - f->init(h, cat_sets, vector_leaf, trees, nodes, params); + static_assert(std::is_same_v, + "real_t must be float; to be removed in the following pull requests"); + f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } From ce3624eb6c2f0a8e311f468f63617ea79eda5530 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 23:10:18 +0100 Subject: [PATCH 19/45] noinline -> forceinline. --- cpp/src/fil/infer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 25494ec10d..f03543ba11 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __noinline__ +#define INLINE_CONFIG __forceinline__ namespace ML { namespace fil { From 63fadd1d67a9da2b7a52cd7f1e2263fc41ffc3d4 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Fri, 11 Mar 2022 23:21:21 +0100 Subject: [PATCH 20/45] Updated comment. --- cpp/src/fil/fil.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index f1da7e79a4..705cd96702 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -330,7 +330,8 @@ struct forest { shmem_size_params class_ssp_, proba_ssp_; int fixed_block_count_ = 0; int max_shm_ = 0; - // Optionally used + // vector_leaf_ is only used if {class,proba}_ssp_.leaf_algo is VECTOR_LEAF, + // otherwise it is empty rmm::device_uvector vector_leaf_; cat_sets_device_owner cat_sets_; }; From fda76692dca9008e3aee6f768253d3efc24fba76 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Sat, 12 Mar 2022 02:51:12 +0100 Subject: [PATCH 21/45] Fixed many compiler errors. --- cpp/src/fil/common.cuh | 4 +- cpp/src/fil/infer.cu | 199 ++++++++++++++++++++++------------------- 2 files changed, 111 insertions(+), 92 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index c03008186e..a9f1933458 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -94,7 +94,7 @@ struct storage> : storage_base { /** sparse tree */ template struct tree : tree_base { - using F = typename node_t::F; + using real_t = typename node_t::real_t; __host__ __device__ tree(categorical_sets cat_sets, node_t* nodes) : tree_base{cat_sets}, nodes_(nodes) { @@ -170,7 +170,7 @@ struct shmem_size_params { { return cols_in_shmem ? 
sizeof_real * sdata_stride() * n_items << log2_threads_per_tree : 0; } - template + template size_t get_smem_footprint(); }; diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 8ae937b444..4eaaede85b 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -157,7 +157,7 @@ __device__ __forceinline__ vec tree_leaf_output(tree_type t template __device__ __forceinline__ vec infer_one_tree( - tree_type tree, const typename tree_type::F* input, int cols, int n_rows) + tree_type tree, const typename tree_type::real_t* input, int cols, int n_rows) { // find the leaf nodes for each row int curr[NITEMS]; @@ -185,10 +185,8 @@ __device__ __forceinline__ vec infer_one_tree( } template -__device__ __forceinline__ vec<1, output_type> infer_one_tree(tree_type tree, - const typename tree_type::F* input, - int cols, - int rows) +__device__ __forceinline__ vec<1, output_type> infer_one_tree( + tree_type tree, const typename tree_type::real_t* input, int cols, int rows) { int curr = 0; for (;;) { @@ -219,12 +217,12 @@ host code. See https://rapids.ai/start.html as well as cmake defaults. */ // values below are defaults as of this change. -template +template size_t block_reduce_footprint_host() { return sizeof( typename cub:: - BlockReduce, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>:: + BlockReduce, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>:: TempStorage); } @@ -249,10 +247,10 @@ __device__ __forceinline__ T block_reduce(T value, BinaryOp op, void* storage) } template // = FLOAT_UNARY_BINARY struct tree_aggregator_t { - vec acc; + vec acc; void* tmp_storage; /** shared memory footprint of the accumulator during @@ -264,8 +262,8 @@ struct tree_aggregator_t { int log2_threads_per_tree, bool predict_proba) { - return log2_threads_per_tree != 0 ? FIL_TPB * NITEMS * sizeof(F) - : block_reduce_footprint_host(); + return log2_threads_per_tree != 0 ? 
FIL_TPB * NITEMS * sizeof(real_t) : block_reduce_footprint_host<real_t, NITEMS>(); } /** shared memory footprint of the accumulator during @@ -279,19 +277,19 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - F* vector_leaf) + real_t* vector_leaf) : tmp_storage(finalize_workspace) { } - __device__ __forceinline__ void accumulate(vec<NITEMS, F> single_tree_prediction, + __device__ __forceinline__ void accumulate(vec<NITEMS, real_t> single_tree_prediction, int tree, int thread_num_rows) { acc += single_tree_prediction; } - __device__ INLINE_CONFIG void finalize(F* block_out, + __device__ INLINE_CONFIG void finalize(real_t* block_out, int block_num_rows, int output_stride, output_t transform, @@ -304,7 +302,7 @@ struct tree_aggregator_t { if (log2_threads_per_tree == 0) { acc = block_reduce(acc, vectorized(cub::Sum()), tmp_storage); } else { - auto per_thread = (vec<NITEMS, F>*)tmp_storage; + auto per_thread = (vec<NITEMS, real_t>*)tmp_storage; per_thread[threadIdx.x] = acc; __syncthreads(); // We have two pertinent cases for splitting FIL_TPB == 256 values: @@ -352,7 +350,7 @@ __device__ __forceinline__ auto allreduce_shmem(Iterator begin, // tmp_storage may overlap shared memory addressed by [begin, end) template <int NITEMS, typename Iterator, typename real_t> __device__ __forceinline__ void write_best_class( - Iterator begin, Iterator end, void* tmp_storage, F* out, int num_rows) + Iterator begin, Iterator end, void* tmp_storage, real_t* out, int num_rows) { // reduce per-class candidate margins to one best class candidate // per thread (for each of the NITEMS rows) @@ -371,8 +369,25 @@ __device__ __forceinline__ void write_best_class( }
/// needed for softmax -__device__ double shifted_exp(double margin, double max) { return exp(margin - max); } -__device__ float shifted_exp(float margin, float max) { return expf(margin - max); } +template <typename real_t> +__device__ real_t shifted_exp(real_t margin, real_t max) +{ + // this is always false, since specializations are used for float and double + static_assert(std::is_same_v<real_t, float> || std::is_same_v<real_t, double>, + "only specializations of shifted_exp can be used"); +} + +template <> +__device__ double shifted_exp(double margin, double max) +{ + return exp(margin - max); +} + +template <> +__device__ float shifted_exp(float margin, float max) +{ + return expf(margin - max); +}
// *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory addressed by [begin, end) template <typename Iterator> __device__ __forceinline__ void block_softmax(Iterator begin, Iterator end, void* tmp_storage) { // subtract max before exponentiating for numerical stability - typedef typename std::iterator_traits<Iterator>::value_type value_type; - value_type max = allreduce_shmem(begin, end, vectorized(cub::Max()), tmp_storage); + using real_t = typename std::iterator_traits<Iterator>::value_type; + real_t max = allreduce_shmem(begin, end, vectorized(cub::Max()), tmp_storage); for (Iterator it = begin + threadIdx.x; it < end; it += blockDim.x) - *it = vectorized(shifted_exp)(*it, max); + *it = vectorized(shifted_exp<real_t>)(*it, max); // sum of exponents - value_type soe = allreduce_shmem(begin, end, vectorized(cub::Sum()), tmp_storage); + real_t soe = allreduce_shmem(begin, end, vectorized(cub::Sum()), tmp_storage); // softmax phase 2: normalization for (Iterator it = begin + threadIdx.x; it < end; it += blockDim.x) *it /= soe; } // *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory
addressed by [begin, end) -template +template __device__ __forceinline__ void normalize_softmax_and_write(Iterator begin, Iterator end, output_t transform, int trees_per_class, void* tmp_storage, - F* out, + real_t* out, int num_rows) { if ((transform & output_t::AVG) != 0) { @@ -418,13 +433,13 @@ __device__ __forceinline__ void normalize_softmax_and_write(Iterator begin, // *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory addressed by [begin, end) // in case num_outputs > 1 -template +template __device__ __forceinline__ void class_margins_to_global_memory(Iterator begin, Iterator end, output_t transform, int trees_per_class, void* tmp_storage, - F* out, + real_t* out, int num_rows, int num_outputs) { @@ -437,11 +452,11 @@ __device__ __forceinline__ void class_margins_to_global_memory(Iterator begin, } } -template -struct tree_aggregator_t { - vec acc; +template +struct tree_aggregator_t { + vec acc; int num_classes; - vec* per_thread; + vec* per_thread; void* tmp_storage; static size_t smem_finalize_footprint(size_t data_row_size, @@ -449,9 +464,9 @@ struct tree_aggregator_t { int log2_threads_per_tree, bool predict_proba) { - size_t phase1 = (FIL_TPB - FIL_TPB % num_classes) * sizeof(vec); - size_t phase2 = predict_proba ? block_reduce_footprint_host() - : block_reduce_best_class_footprint_host(); + size_t phase1 = (FIL_TPB - FIL_TPB % num_classes) * sizeof(vec); + size_t phase2 = predict_proba ? block_reduce_footprint_host() + : block_reduce_best_class_footprint_host(); return predict_proba ? phase1 + phase2 : std::max(phase1, phase2); } @@ -460,21 +475,21 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - F* vector_leaf) + real_t* vector_leaf) : num_classes(params.num_classes), - per_thread((vec*)finalize_workspace), + per_thread((vec*)finalize_workspace), tmp_storage(params.predict_proba ? per_thread + num_classes : finalize_workspace) { } - __device__ __forceinline__ void accumulate(vec single_tree_prediction, + __device__ __forceinline__ void accumulate(vec single_tree_prediction, int tree, int thread_num_rows) { acc += single_tree_prediction; } - __device__ INLINE_CONFIG void finalize(F* out, + __device__ INLINE_CONFIG void finalize(real_t* out, int num_rows, int num_outputs, output_t transform, @@ -500,11 +515,11 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { - vec acc; +template +struct tree_aggregator_t { + vec acc; /// at first, per class margin, then, possibly, different softmax partials - vec* per_class_margin; + vec* per_class_margin; void* tmp_storage; int num_classes; @@ -514,30 +529,30 @@ struct tree_aggregator_t { bool predict_proba) { size_t phase1 = data_row_size + smem_accumulate_footprint(num_classes); - size_t phase2 = predict_proba ? block_reduce_footprint_host() - : block_reduce_best_class_footprint_host(); + size_t phase2 = predict_proba ? block_reduce_footprint_host() + : block_reduce_best_class_footprint_host(); return predict_proba ? 
phase1 + phase2 : std::max(phase1, phase2);
   }
   static __host__ __device__ size_t smem_accumulate_footprint(int num_classes)
   {
-    return num_classes * sizeof(vec);
+    return num_classes * sizeof(vec);
   }
   __device__ __forceinline__ tree_aggregator_t(predict_params params,
                                                void* accumulate_workspace,
                                                void* finalize_workspace,
-                                               F* vector_leaf)
-    : per_class_margin((vec*)accumulate_workspace),
+                                               real_t* vector_leaf)
+    : per_class_margin((vec*)accumulate_workspace),
       tmp_storage(params.predict_proba ? per_class_margin + num_classes : finalize_workspace),
       num_classes(params.num_classes)
   {
     for (int c = threadIdx.x; c < num_classes; c += blockDim.x)
-      per_class_margin[c] = vec(0);
+      per_class_margin[c] = vec(0);
     // __syncthreads() is called in infer_k
   }
-  __device__ __forceinline__ void accumulate(vec single_tree_prediction,
+  __device__ __forceinline__ void accumulate(vec single_tree_prediction,
                                              int tree,
                                              int thread_num_rows)
   {
@@ -546,7 +561,7 @@ struct tree_aggregator_t {
     __syncthreads();
   }
-  __device__ INLINE_CONFIG void finalize(F* out,
+  __device__ INLINE_CONFIG void finalize(real_t* out,
                                          int num_rows,
                                          int num_outputs,
                                          output_t transform,
@@ -564,17 +579,17 @@ struct tree_aggregator_t {
   }
 };

-template
-struct tree_aggregator_t {
+template
+struct tree_aggregator_t {
   // per_class_margin is a row-major matrix
   // of size num_threads_per_class * num_classes
   // used to accumulate class values
-  vec* per_class_margin;
+  vec* per_class_margin;
   vec* vector_leaf_indices;
   int* thread_num_rows;
   int num_classes;
   int num_threads_per_class;
-  F* vector_leaf;
+  real_t* vector_leaf;
   void* tmp_storage;

   static size_t smem_finalize_footprint(size_t data_row_size,
@@ -583,20 +598,20 @@ struct tree_aggregator_t {
                                         bool predict_proba)
   {
     size_t phase1 = data_row_size + smem_accumulate_footprint(num_classes);
-    size_t phase2 = predict_proba ? block_reduce_footprint_host()
-                                  : block_reduce_best_class_footprint_host();
+    size_t phase2 = predict_proba ? block_reduce_footprint_host()
+                                  : block_reduce_best_class_footprint_host();
     return predict_proba ?
phase1 + phase2 : std::max(phase1, phase2); } static size_t smem_accumulate_footprint(int num_classes) { - return sizeof(vec) * num_classes * max(1, FIL_TPB / num_classes) + + return sizeof(vec) * num_classes * max(1, FIL_TPB / num_classes) + sizeof(vec) * FIL_TPB + sizeof(int) * FIL_TPB; } __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - F* vector_leaf) + real_t* vector_leaf) : num_classes(params.num_classes), num_threads_per_class(max(1, blockDim.x / params.num_classes)), vector_leaf(vector_leaf), @@ -604,15 +619,15 @@ struct tree_aggregator_t { { // Assign workspace char* ptr = (char*)accumulate_workspace; - per_class_margin = (vec*)ptr; - ptr += sizeof(vec) * num_classes * num_threads_per_class; + per_class_margin = (vec*)ptr; + ptr += sizeof(vec) * num_classes * num_threads_per_class; vector_leaf_indices = (vec*)ptr; ptr += sizeof(vec) * blockDim.x; thread_num_rows = (int*)ptr; // Initialise shared memory for (int i = threadIdx.x; i < num_classes * num_threads_per_class; i += blockDim.x) { - per_class_margin[i] = vec(); + per_class_margin[i] = vec(); } vector_leaf_indices[threadIdx.x] = vec(); thread_num_rows[threadIdx.x] = 0; @@ -639,13 +654,13 @@ struct tree_aggregator_t { // we have num_classes threads for each j for (int j = i / num_classes; j < blockDim.x; j += num_threads_per_class) { for (int item = 0; item < thread_num_rows[j]; ++item) { - F pred = vector_leaf[vector_leaf_indices[j][item] * num_classes + c]; + real_t pred = vector_leaf[vector_leaf_indices[j][item] * num_classes + c]; per_class_margin[i][item] += pred; } } } } - __device__ INLINE_CONFIG void finalize(F* out, + __device__ INLINE_CONFIG void finalize(real_t* out, int num_rows, int num_outputs, output_t transform, @@ -670,8 +685,8 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { +template +struct tree_aggregator_t { // could switch to uint16_t to save shared memory // provided raft::myAtomicAdd(short*) simulated with appropriate shifts int* votes; @@ -693,7 +708,7 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - F* vector_leaf) + real_t* vector_leaf) : num_classes(params.num_classes), votes((int*)accumulate_workspace) { for (int c = threadIdx.x; c < num_classes; c += FIL_TPB * NITEMS) @@ -714,7 +729,7 @@ struct tree_aggregator_t { } // class probabilities or regression. 
for regression, num_classes // is just the number of outputs for each data instance - __device__ __forceinline__ void finalize_multiple_outputs(F* out, int num_rows) + __device__ __forceinline__ void finalize_multiple_outputs(real_t* out, int num_rows) { __syncthreads(); for (int c = threadIdx.x; c < num_classes; c += blockDim.x) { @@ -725,7 +740,7 @@ struct tree_aggregator_t { } // using this when predicting a single class label, as opposed to sparse class vector // or class probabilities or regression - __device__ __forceinline__ void finalize_class_label(F* out, int num_rows) + __device__ __forceinline__ void finalize_class_label(real_t* out, int num_rows) { __syncthreads(); // make sure all votes[] are final int item = threadIdx.x; @@ -742,7 +757,7 @@ struct tree_aggregator_t { out[row] = best_class; } } - __device__ INLINE_CONFIG void finalize(F* out, + __device__ INLINE_CONFIG void finalize(real_t* out, int num_rows, int num_outputs, output_t transform, @@ -758,9 +773,12 @@ struct tree_aggregator_t { } }; -template -__device__ INLINE_CONFIG void load_data( - F* sdata, const F* block_input, predict_params params, int rows_per_block, int block_num_rows) +template +__device__ INLINE_CONFIG void load_data(real_t* sdata, + const real_t* block_input, + predict_params params, + int rows_per_block, + int block_num_rows) { int num_cols = params.num_cols; int sdata_stride = params.sdata_stride(); @@ -784,7 +802,7 @@ __device__ INLINE_CONFIG void load_data( } template (params.data) + block_row0 * num_cols; + const real_t* block_input = + reinterpret_cast(params.data) + block_row0 * num_cols; if constexpr (cols_in_shmem) load_data(sdata, block_input, params, rows_per_block, block_num_rows); - tree_aggregator_t acc( + tree_aggregator_t acc( params, (char*)sdata + params.cols_shmem_size(), sdata, forest.vector_leaf_); __syncthreads(); // for both row cache init and acc init @@ -821,7 +840,7 @@ __global__ void infer_k(storage_type forest, predict_params params) and is made exact below. Same with thread_num_rows > 0 */ - typedef typename leaf_output_t::T pred_t; + typedef typename leaf_output_t::T pred_t; vec prediction; if (tree < forest.num_trees() && thread_num_rows != 0) { prediction = infer_one_tree( @@ -834,7 +853,7 @@ __global__ void infer_k(storage_type forest, predict_params params) // Dummy threads can be marked as having 0 rows acc.accumulate(prediction, tree, tree < forest.num_trees() ? 
thread_num_rows : 0); } - acc.finalize(reinterpret_cast(params.preds) + params.num_outputs * block_row0, + acc.finalize(reinterpret_cast(params.preds) + params.num_outputs * block_row0, block_num_rows, params.num_outputs, params.transform, @@ -844,13 +863,13 @@ __global__ void infer_k(storage_type forest, predict_params params) } } -template +template size_t shmem_size_params::get_smem_footprint() { - size_t finalize_footprint = tree_aggregator_t::smem_finalize_footprint( + size_t finalize_footprint = tree_aggregator_t::smem_finalize_footprint( cols_shmem_size(), num_classes, log2_threads_per_tree, predict_proba); size_t accumulate_footprint = - tree_aggregator_t::smem_accumulate_footprint(num_classes) + + tree_aggregator_t::smem_accumulate_footprint(num_classes) + cols_shmem_size(); return std::max(accumulate_footprint, finalize_footprint); } @@ -858,7 +877,7 @@ size_t shmem_size_params::get_smem_footprint() template int compute_smem_footprint::run(predict_params ssp) { - return ssp.template get_smem_footprint(); + return ssp.template get_smem_footprint(); } // make sure to instantiate all possible get_smem_footprint instantiations @@ -894,16 +913,16 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } -struct instantiate_infer { - template - void operator()(fil_node_t) - { - infer>( - storage forest, predict_params params, cudaStream_t stream); - } -}; +// struct instantiate_infer { +// template +// void operator()(fil_node_t) +// { +// infer>( +// storage forest, predict_params params, cudaStream_t stream); +// } +// }; -template void instantiate_for_all_node_types(instantiate_infer); +// template void instantiate_for_all_node_types(instantiate_infer); } // namespace fil } // namespace ML From 40f7a238919482239f37c457d0adaa339b979263 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 14 Mar 2022 19:23:41 +0100 Subject: [PATCH 22/45] Multiple changes. --- cpp/src/fil/common.cuh | 1 + cpp/src/fil/fil.cu | 32 +++++++++++++++++--------------- cpp/src/fil/infer.cu | 10 ++++++++++ cpp/src/fil/internal.cuh | 5 +++-- cpp/src/fil/treelite_import.cu | 6 +++--- 5 files changed, 34 insertions(+), 20 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index a9f1933458..e7d7b2f11e 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -127,6 +127,7 @@ struct storage : storage_base { } }; +typedef storage> dense_storage; typedef storage> sparse_storage16; typedef storage sparse_storage8; diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index e6ff17737d..239938a3de 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -43,20 +43,20 @@ __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } averaging (multiplying by inv_num_trees), adding global_bias (always done), sigmoid and applying threshold. 
in case of complement_proba, fills in the complement probability */ -template -__global__ void transform_k(F* preds, +template +__global__ void transform_k(real_t* preds, size_t n, output_t output, - F inv_num_trees, - F threshold, - F global_bias, + real_t inv_num_trees, + real_t threshold, + real_t global_bias, bool complement_proba) { size_t i = threadIdx.x + size_t(blockIdx.x) * blockDim.x; if (i >= n) return; if (complement_proba && i % 2 != 0) return; - F result = preds[i]; + real_t result = preds[i]; if ((output & output_t::AVG) != 0) result *= inv_num_trees; result += global_bias; if ((output & output_t::SIGMOID) != 0) result = sigmoid(result); @@ -175,9 +175,9 @@ struct forest { virtual void infer(predict_params params, cudaStream_t stream) = 0; - template + template void predict( - const raft::handle_t& h, F* preds, const F* data, size_t num_rows, bool predict_proba) + const raft::handle_t& h, real_t* preds, const real_t* data, size_t num_rows, bool predict_proba) { // Initialize prediction parameters. predict_params params(predict_proba ? proba_ssp_ : class_ssp_); @@ -256,7 +256,7 @@ struct forest { // Simulating treelite order, which cancels out bias. // If non-proba prediction used, it still will not matter // for the same reason softmax will not. - F global_bias = (ot & output_t::SOFTMAX) != 0 ? F(0.0) : global_bias_; + real_t global_bias = (ot & output_t::SOFTMAX) != 0 ? real_t(0.0) : global_bias_; bool complement_proba = false, do_transform; if (predict_proba) { @@ -273,17 +273,19 @@ struct forest { // for GROVE_PER_CLASS, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; - do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != F(0.0); + do_transform = + (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0.0); break; case leaf_algo_t::CATEGORICAL_LEAF: params.num_outputs = params.num_classes; - do_transform = ot != output_t::RAW || global_bias_ != F(0.0); + do_transform = ot != output_t::RAW || global_bias_ != real_t(0.0); break; case leaf_algo_t::VECTOR_LEAF: // for VECTOR_LEAF, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; - do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != F(0.0); + do_transform = + (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0.0); break; default: ASSERT(false, "internal error: predict: invalid leaf_algo %d", params.leaf_algo); } @@ -308,9 +310,9 @@ struct forest { preds, num_values_to_transform, ot, - num_trees_ > 0 ? (F(1.0) / num_trees_) : F(1.0), - F(threshold_), - F(global_bias), + num_trees_ > 0 ? 
(real_t(1.0) / num_trees_) : real_t(1.0), + real_t(threshold_), + real_t(global_bias), complement_proba); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 4eaaede85b..142e739cfb 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -913,6 +913,16 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } +template void infer(dense_storage forest, + predict_params params, + cudaStream_t stream); +template void infer(sparse_storage16 forest, + predict_params params, + cudaStream_t stream); +template void infer(sparse_storage8 forest, + predict_params params, + cudaStream_t stream); + // struct instantiate_infer { // template // void operator()(fil_node_t) diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index fbfd687248..801b8c6564 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -393,7 +393,7 @@ struct categorical_sets { // set count is due to tree_idx + node_within_tree_idx are both ints, hence uint32_t result template __host__ __device__ __forceinline__ int category_matches(node_t node, - typename node_t::F category) const + typename node_t::real_t category) const { // standard boolean packing. This layout has better ILP // node.set() is global across feature IDs and is an offset (as opposed @@ -409,7 +409,8 @@ struct categorical_sets { FIL will reject a model where an integer within [0, fid_num_cats] cannot be represented precisely as a 32-bit float. */ - return static_cast(category) < fid_num_cats[node.fid()] && category >= 0.0f && + return static_cast(category) < fid_num_cats[node.fid()] && + category >= 0.0f && fetch_bit(bits + node.set(), static_cast(static_cast(category))); } static int sizeof_mask_from_num_cats(int num_cats) diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index ae941452e6..7fc90eab3b 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -655,11 +655,11 @@ void convert(const raft::handle_t& handle, tl2fil.init_forest(handle, pforest); } -template +template constexpr bool type_supported() { // not using std::is_floating_point because we did not instantiate fp16-based nodes/trees/forests - return std::is_same() || std::is_same(); + return std::is_same() || std::is_same(); } template @@ -681,7 +681,7 @@ void from_treelite(const raft::handle_t& handle, "This may lead to predictions with reduced accuracy."); } // same as std::common_type: float+double=double, float+int64_t=float - typedef decltype(threshold_t{} + leaf_t{}) F; + typedef decltype(threshold_t{} + leaf_t{}) real_t; storage_type_t storage_type = tl_params->storage_type; // build dense trees by default From 12cc051913860aba9a182dd06813aa07912a9ae0 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 14 Mar 2022 20:13:23 +0100 Subject: [PATCH 23/45] Fixed compilation errors; now it compiles. 
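The fix replaces the shifted_exp function template, whose primary template had no return
statement and which had to be explicitly instantiated before it could be passed to
vectorized(), with a functor whose overloaded operator() lets overload resolution pick the
float or double version per element. Below is a minimal host-only sketch of why the functor
composes cleanly with vectorized(); the vec and Vectorized types here are simplified
stand-ins for the ones in infer.cu, not the actual FIL definitions.

    #include <cmath>
    #include <cstdio>

    template <int N, typename T>
    struct vec {
      T data[N];
    };

    template <typename BinaryOp>
    struct Vectorized {
      BinaryOp op;
      Vectorized(BinaryOp op_) : op(op_) {}
      // apply the scalar op lane by lane
      template <int N, typename T>
      vec<N, T> operator()(vec<N, T> a, vec<N, T> b) const
      {
        vec<N, T> c;
        for (int i = 0; i < N; ++i)
          c.data[i] = op(a.data[i], b.data[i]);
        return c;
      }
    };

    template <typename BinaryOp>
    Vectorized<BinaryOp> vectorized(BinaryOp op)
    {
      return Vectorized<BinaryOp>(op);
    }

    // a functor is an ordinary object carrying both overloads, so
    // vectorized(shifted_exp()) deduces BinaryOp without naming a type
    struct shifted_exp {
      double operator()(double margin, double max) const { return exp(margin - max); }
      float operator()(float margin, float max) const { return expf(margin - max); }
    };

    int main()
    {
      vec<2, float> margin{{0.0f, 1.0f}}, max{{1.0f, 1.0f}};
      vec<2, float> e = vectorized(shifted_exp())(margin, max);
      printf("%f %f\n", e.data[0], e.data[1]);  // exp(-1), exp(0)
    }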
---
 cpp/src/fil/infer.cu | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 142e739cfb..101a9ef826 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -48,7 +48,7 @@ struct vec;
 template
 struct Vectorized {
   BinaryOp op;
-  __device__ Vectorized(BinaryOp op_) : op(op_) {}
+  __host__ __device__ Vectorized(BinaryOp op_) : op(op_) {}
   template
   constexpr __host__ __device__ __forceinline__ vec operator()(vec a, vec b) const
@@ -63,7 +63,7 @@
 template
 constexpr __host__ __device__ Vectorized vectorized(BinaryOp op)
 {
-  return op;
+  return Vectorized(op);
 }

 template
@@ -369,25 +369,11 @@ __device__ __forceinline__ void write_best_class(
 }

 /// needed for softmax
-template
-__device__ real_t shifted_exp(real_t margin, real_t max)
-{
-  // this is always false, since specializations are used for float and double
-  static_assert(std::is_same_v || std::is_same_v,
-                "only specializations of shifted_exp can be used");
-}
-
-template <>
-__device__ double shifted_exp(double margin, double max)
-{
-  return exp(margin - max);
-}
+struct shifted_exp {
+  __device__ double operator()(double margin, double max) const { return exp(margin - max); }

-template <>
-__device__ float shifted_exp(float margin, float max)
-{
-  return expf(margin - max);
-}
+  __device__ float operator()(float margin, float max) const { return expf(margin - max); }
+};

 // *begin and *end shall be struct vec
 // tmp_storage may NOT overlap shared memory addressed by [begin, end)
@@ -395,12 +381,12 @@ template
 __device__ __forceinline__ void block_softmax(Iterator begin, Iterator end, void* tmp_storage)
 {
   // subtract max before exponentiating for numerical stability
-  using real_t = typename std::iterator_traits::value_type;
-  real_t max = allreduce_shmem(begin, end, vectorized(cub::Max()), tmp_storage);
+  using value_type = typename std::iterator_traits::value_type;
+  value_type max = allreduce_shmem(begin, end, vectorized(cub::Max()), tmp_storage);
   for (Iterator it = begin + threadIdx.x; it < end; it += blockDim.x)
-    *it = vectorized(shifted_exp)(*it, max);
+    *it = vectorized(shifted_exp())(*it, max);
   // sum of exponents
-  real_t soe = allreduce_shmem(begin, end, vectorized(cub::Sum()), tmp_storage);
+  value_type soe = allreduce_shmem(begin, end, vectorized(cub::Sum()), tmp_storage);
   // softmax phase 2: normalization
   for (Iterator it = begin + threadIdx.x; it < end; it += blockDim.x)
     *it /= soe;

From 2bc6b6efdb5b281cb33f1637e320ff2596cf9560 Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Wed, 16 Mar 2022 23:52:56 +0100
Subject: [PATCH 24/45] float -> void in predict().
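Taking void* in the public predict() keeps a single non-template entry point in the C-style
API while the precision-specific work happens behind the existing virtual call. A distilled
sketch of the type-erasure pattern follows; forest_base, typed_forest and predict_impl are
placeholder names for illustration, not the real cuML classes.

    #include <cstddef>

    // type-independent base: the public API only ever sees this
    struct forest_base {
      virtual void predict(void* preds, const void* data, std::size_t num_rows) = 0;
      virtual ~forest_base() {}
    };

    // precision-dependent part recovers the concrete type behind void*
    template <typename real_t>
    struct typed_forest : forest_base {
      void predict(void* preds, const void* data, std::size_t num_rows) override
      {
        predict_impl(static_cast<real_t*>(preds),
                     static_cast<const real_t*>(data),
                     num_rows);
      }
      void predict_impl(real_t* preds, const real_t* data, std::size_t num_rows)
      {
        // placeholder for the real inference path
        for (std::size_t i = 0; i < num_rows; ++i)
          preds[i] = data[i];
      }
    };

The void* boundary gives up compile-time type checking, so callers must pass buffers of the
same precision the forest was built with.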
--- cpp/include/cuml/fil/fil.h | 8 +++++--- cpp/src/fil/fil.cu | 21 +++++++-------------- cpp/src/fil/infer.cu | 2 +- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index dfa66ad1a8..98be3b5593 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -128,17 +128,19 @@ void free(const raft::handle_t& h, forest_t f); * @param h cuML handle used by this function * @param f forest used for predictions * @param preds array in GPU memory to store predictions into - size == predict_proba ? (2*num_rows) : num_rows + * size = predict_proba ? (2*num_rows) : num_rows + * type = the type used for the forest representation (float or double) * @param data array of size n * cols (cols is the number of columns * for the forest f) from which to predict + * type = the type used for the forest representation (float or double) * @param num_rows number of data rows * @param predict_proba for classifier models, this forces to output both class probabilities * instead of binary class prediction. format matches scikit-learn API */ void predict(const raft::handle_t& h, forest_t f, - float* preds, - const float* data, + void* preds, + const void* data, size_t num_rows, bool predict_proba = false); diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 239938a3de..3131f5227b 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -640,23 +640,16 @@ void free(const raft::handle_t& h, forest_t f) /// part of C API - overload instead of template void predict(const raft::handle_t& h, forest_t f, - double* preds, - const double* data, + void* preds, + const void* data, size_t num_rows, bool predict_proba) { - f->predict(h, preds, data, num_rows, predict_proba); -} - -/// part of C API - overload instead of template -void predict(const raft::handle_t& h, - forest_t f, - float* preds, - const float* data, - size_t num_rows, - bool predict_proba) -{ - f->predict(h, preds, data, num_rows, predict_proba); + f->predict(h, + reinterpret_cast(preds), + reinterpret_cast(data), + num_rows, + predict_proba); } } // namespace fil diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 101a9ef826..de64a2b262 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __forceinline__ +#define INLINE_CONFIG __noinline__ namespace ML { namespace fil { From 138e1dd38d588e86a06a12b06aae23274c2da73f Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 17 Mar 2022 00:24:34 +0100 Subject: [PATCH 25/45] Some templating. --- cpp/src/fil/fil.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 3131f5227b..8d6068f43c 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -61,11 +61,11 @@ __global__ void transform_k(real_t* preds, result += global_bias; if ((output & output_t::SIGMOID) != 0) result = sigmoid(result); // will not be done on CATEGORICAL_LEAF because the whole kernel will not run - if ((output & output_t::CLASS) != 0) { result = result > threshold ? 1.0f : 0.0f; } + if ((output & output_t::CLASS) != 0) { result = result > threshold ? 
real_t(1) : real_t(0); } // sklearn outputs numpy array in 'C' order, with the number of classes being last dimension // that is also the default order, so we should use the same one if (complement_proba) { - preds[i] = 1.0f - result; + preds[i] = real_t(1) - result; preds[i + 1] = result; } else preds[i] = result; From 532686fe0c49b802523035759108ab674cd984b5 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 17 Mar 2022 01:38:25 +0100 Subject: [PATCH 26/45] template_forest for type-dependent forest members. --- cpp/src/fil/fil.cu | 199 ++++++++++++++++++++++----------------- cpp/src/fil/internal.cuh | 4 +- 2 files changed, 117 insertions(+), 86 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 8d6068f43c..100049a082 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -76,8 +76,11 @@ __global__ void transform_k(real_t* preds, // but rather one symbol for the whole template specialization, as below. extern template int dispatch_on_fil_template_params(compute_smem_footprint, predict_params); +// forest is the base type for all forests and contains data and methods common +// to both dense and sparse forests and independent of the floating-point type +// used for model and data struct forest { - forest(const raft::handle_t& h) : vector_leaf_(0, h.get_stream()), cat_sets_(h.get_stream()) {} + forest(const raft::handle_t& h) : cat_sets_(h.get_stream()) {} void init_shmem_size(int device) { @@ -131,7 +134,34 @@ struct forest { fixed_block_count_ = blocks_per_sm * sm_count; } - template + virtual void predict(const raft::handle_t& h, + void* preds, + const void* data, + size_t num_rows, + bool predict_proba) = 0; + + virtual void free(const raft::handle_t& h) { cat_sets_.release(); } + + virtual ~forest() {} + + int num_trees_ = 0; + int depth_ = 0; + algo_t algo_ = algo_t::NAIVE; + output_t output_ = output_t::RAW; + double threshold_ = 0.5; + double global_bias_ = 0; + shmem_size_params class_ssp_, proba_ssp_; + int fixed_block_count_ = 0; + int max_shm_ = 0; + cat_sets_device_owner cat_sets_; +}; + +// template_forest contains data and methods common for both dense and sparse forests +// but dependent on the floating-point type used for model and data +template +struct template_forest : public forest { + template_forest(const raft::handle_t& h) : forest(h), vector_leaf_(0, h.get_stream()) {} + void init_common(const raft::handle_t& h, const categorical_sets& cat_sets, const std::vector& vector_leaf, @@ -175,8 +205,20 @@ struct forest { virtual void infer(predict_params params, cudaStream_t stream) = 0; - template - void predict( + virtual void predict(const raft::handle_t& h, + void* preds, + const void* data, + size_t num_rows, + bool predict_proba) override + { + template_predict(h, + reinterpret_cast(preds), + reinterpret_cast(data), + num_rows, + predict_proba); + } + + void template_predict( const raft::handle_t& h, real_t* preds, const real_t* data, size_t num_rows, bool predict_proba) { // Initialize prediction parameters. 
@@ -320,25 +362,13 @@ struct forest { virtual void free(const raft::handle_t& h) { - cat_sets_.release(); vector_leaf_.release(); + forest::free(h); } - virtual ~forest() {} - - int num_trees_ = 0; - int depth_ = 0; - algo_t algo_ = algo_t::NAIVE; - output_t output_ = output_t::RAW; - double threshold_ = 0.5; - double global_bias_ = 0; - shmem_size_params class_ssp_, proba_ssp_; - int fixed_block_count_ = 0; - int max_shm_ = 0; // vector_leaf_ is only used if {class,proba}_ssp_.leaf_algo is VECTOR_LEAF, // otherwise it is empty - rmm::device_uvector vector_leaf_; - cat_sets_device_owner cat_sets_; + rmm::device_uvector vector_leaf_; }; template @@ -364,9 +394,9 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { }; template -struct dense_forest> : forest { +struct dense_forest> : template_forest { using node_t = dense_node; - dense_forest(const raft::handle_t& h) : forest(h), nodes_(0, h.get_stream()) {} + dense_forest(const raft::handle_t& h) : template_forest(h), nodes_(0, h.get_stream()) {} void transform_trees(const node_t* nodes) { @@ -379,14 +409,14 @@ struct dense_forest> : forest { roots of all trees (node 2), and so on. */ int global_node = 0; - for (int tree = 0; tree < num_trees_; ++tree) { + for (int tree = 0; tree < this->num_trees_; ++tree) { int tree_node = 0; // the counters `level` and `branch` are not used for computing node // indices, they are only here to highlight the node ordering within // each tree - for (int level = 0; level <= depth_; ++level) { + for (int level = 0; level <= this->depth_; ++level) { for (int branch = 0; branch < 1 << level; ++branch) { - h_nodes_[tree_node * num_trees_ + tree] = nodes[global_node]; + h_nodes_[tree_node * this->num_trees_ + tree] = nodes[global_node]; ++tree_node; ++global_node; } @@ -403,13 +433,13 @@ struct dense_forest> : forest { const node_t* nodes, const forest_params_t* params) { - init_common(h, cat_sets, vector_leaf, params); - if (algo_ == algo_t::NAIVE) algo_ = algo_t::BATCH_TREE_REORG; + this->init_common(h, cat_sets, vector_leaf, params); + if (this->algo_ == algo_t::NAIVE) this->algo_ = algo_t::BATCH_TREE_REORG; - int num_nodes = forest_num_nodes(num_trees_, depth_); + int num_nodes = forest_num_nodes(this->num_trees_, this->depth_); nodes_.resize(num_nodes, h.get_stream()); h_nodes_.resize(num_nodes); - if (algo_ == algo_t::NAIVE) { + if (this->algo_ == algo_t::NAIVE) { std::copy(nodes, nodes + num_nodes, h_nodes_.begin()); } else { transform_trees(nodes); @@ -421,8 +451,8 @@ struct dense_forest> : forest { h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), - static_cast(class_ssp_)); + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(this->max_shm_), + static_cast(this->class_ssp_)); // copy must be finished before freeing the host data h.sync_stream(); h_nodes_.clear(); @@ -431,12 +461,12 @@ struct dense_forest> : forest { virtual void infer(predict_params params, cudaStream_t stream) override { - storage forest(cat_sets_.accessor(), - reinterpret_cast(vector_leaf_.data()), + storage forest(this->cat_sets_.accessor(), + reinterpret_cast(this->vector_leaf_.data()), nodes_.data(), - num_trees_, - algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, - algo_ == algo_t::NAIVE ? 1 : num_trees_); + this->num_trees_, + this->algo_ == algo_t::NAIVE ? tree_num_nodes(this->depth_) : 1, + this->algo_ == algo_t::NAIVE ? 
1 : this->num_trees_); static_assert(std::is_same_v, "real_t must be float; to be removed in the following pull requests"); fil::infer(forest, params, stream); @@ -445,7 +475,7 @@ struct dense_forest> : forest { virtual void free(const raft::handle_t& h) override { nodes_.release(); - forest::free(h); + template_forest::free(h); } rmm::device_uvector nodes_; @@ -453,28 +483,35 @@ struct dense_forest> : forest { }; template -struct sparse_forest : forest { +struct sparse_forest : template_forest { + using real_t = typename node_t::real_t; + sparse_forest(const raft::handle_t& h) - : forest(h), trees_(0, h.get_stream()), nodes_(0, h.get_stream()) + : template_forest(h), + trees_(0, h.get_stream()), + nodes_(0, h.get_stream()) { } void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const node_t* nodes, const forest_params_t* params) { - init_common(h, cat_sets, vector_leaf, params); - if (algo_ == algo_t::ALGO_AUTO) algo_ = algo_t::NAIVE; - depth_ = 0; // a placeholder value - num_nodes_ = params->num_nodes; + this->init_common(h, cat_sets, vector_leaf, params); + if (this->algo_ == algo_t::ALGO_AUTO) this->algo_ = algo_t::NAIVE; + this->depth_ = 0; // a placeholder value + num_nodes_ = params->num_nodes; // trees - trees_.resize(num_trees_, h.get_stream()); - RAFT_CUDA_TRY(cudaMemcpyAsync( - trees_.data(), trees, sizeof(int) * num_trees_, cudaMemcpyHostToDevice, h.get_stream())); + trees_.resize(this->num_trees_, h.get_stream()); + RAFT_CUDA_TRY(cudaMemcpyAsync(trees_.data(), + trees, + sizeof(int) * this->num_trees_, + cudaMemcpyHostToDevice, + h.get_stream())); // nodes nodes_.resize(num_nodes_, h.get_stream()); @@ -482,25 +519,25 @@ struct sparse_forest : forest { nodes_.data(), nodes, sizeof(node_t) * num_nodes_, cudaMemcpyHostToDevice, h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), - static_cast(class_ssp_)); + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(this->max_shm_), + static_cast(this->class_ssp_)); } virtual void infer(predict_params params, cudaStream_t stream) override { - storage forest(cat_sets_.accessor(), - reinterpret_cast(vector_leaf_.data()), + storage forest(this->cat_sets_.accessor(), + reinterpret_cast(this->vector_leaf_.data()), trees_.data(), nodes_.data(), - num_trees_); - static_assert(std::is_same_v, + this->num_trees_); + static_assert(std::is_same_v, "real_t must be float; to be removed in the following pull requests"); fil::infer(forest, params, stream); } void free(const raft::handle_t& h) override { - forest::free(h); + template_forest::free(h); trees_.release(); nodes_.release(); } @@ -588,11 +625,11 @@ void check_params(const forest_params_t* params, bool dense) /** initializes a forest of any type * When fil_node_t == dense_node, const int* trees is ignored */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params) @@ -600,36 +637,34 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - static_assert(std::is_same_v, + static_assert(std::is_same_v, "real_t must be float; to be removed in the following pull requests"); 
f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } -// explicit instantiations for init_sparse() -template void init>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node16* nodes, - const forest_params_t* params); - -template void init(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node8* nodes, - const forest_params_t* params); - -template void init>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const dense_node* nodes, - const forest_params_t* params); +// explicit instantiations for init() +template void init, float>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node16* nodes, + const forest_params_t* params); +template void init(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node8* nodes, + const forest_params_t* params); +template void init, float>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); void free(const raft::handle_t& h, forest_t f) { @@ -645,11 +680,7 @@ void predict(const raft::handle_t& h, size_t num_rows, bool predict_proba) { - f->predict(h, - reinterpret_cast(preds), - reinterpret_cast(data), - num_rows, - predict_proba); + f->predict(h, preds, data, num_rows, predict_proba); } } // namespace fil diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 801b8c6564..5223a141b3 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -561,11 +561,11 @@ struct cat_sets_device_owner { * @param params pointer to parameters used to initialize the forest * @param vector_leaf optional vector leaves */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params); From ce689f7c2e0d97ae1a0b5ad43a20201f654a32b4 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 17 Mar 2022 21:59:44 +0100 Subject: [PATCH 27/45] Instantiate forests with double. 
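Explicit instantiations compile the templated init()/infer() definitions once per supported
precision inside the library, so client translation units link against the float and double
symbols without seeing or recompiling the kernels. An illustrative sketch of the pattern with
placeholder names (infer_all is not a real cuML symbol):

    // header: declaration only, no template definition leaks out
    template <typename real_t>
    void infer_all(const real_t* data, real_t* preds, int n);

    // one .cu file: the definition plus one explicit instantiation per
    // supported precision; these are the only symbols the library exports
    template <typename real_t>
    void infer_all(const real_t* data, real_t* preds, int n)
    {
      for (int i = 0; i < n; ++i)
        preds[i] = data[i];  // placeholder body
    }

    template void infer_all<float>(const float*, float*, int);
    template void infer_all<double>(const double*, double*, int);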
--- cpp/src/fil/common.cuh | 8 +++++--- cpp/src/fil/fil.cu | 36 +++++++++++++++++++++--------------- cpp/src/fil/infer.cu | 29 ++++++++++++----------------- cpp/src/fil/internal.cuh | 4 ++-- 4 files changed, 40 insertions(+), 37 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index e7d7b2f11e..71cb1ad08d 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -127,9 +127,11 @@ struct storage : storage_base { } }; -typedef storage> dense_storage; -typedef storage> sparse_storage16; -typedef storage sparse_storage8; +using dense_storage_f32 = storage>; +using dense_storage_f64 = storage>; +using sparse_storage16_f32 = storage>; +using sparse_storage16_f64 = storage>; +using sparse_storage8 = storage; /// all model parameters mostly required to compute shared memory footprint, /// also the footprint itself diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 100049a082..49e98f9e8b 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -379,8 +379,6 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); auto kernel = infer_k> : template_forest { this->num_trees_, this->algo_ == algo_t::NAIVE ? tree_num_nodes(this->depth_) : 1, this->algo_ == algo_t::NAIVE ? 1 : this->num_trees_); - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); fil::infer(forest, params, stream); } @@ -530,8 +526,6 @@ struct sparse_forest : template_forest { trees_.data(), nodes_.data(), this->num_trees_); - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); fil::infer(forest, params, stream); } @@ -637,13 +631,25 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } // explicit instantiations for init() +template void init, float>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); +template void init, double>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); template void init, float>(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, @@ -651,6 +657,13 @@ template void init, float>(const raft::handle_t& h, const int* trees, const sparse_node16* nodes, const forest_params_t* params); +template void init, double>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node16* nodes, + const forest_params_t* params); template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, @@ -658,13 +671,6 @@ template void init(const raft::handle_t& h, const int* trees, const sparse_node8* nodes, const forest_params_t* params); -template void init, float>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const dense_node* nodes, - const forest_params_t* 
params); void free(const raft::handle_t& h, forest_t f) { diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index de64a2b262..233b1bec5d 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -899,26 +899,21 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } -template void infer(dense_storage forest, - predict_params params, - cudaStream_t stream); -template void infer(sparse_storage16 forest, - predict_params params, - cudaStream_t stream); +template void infer(dense_storage_f32 forest, + predict_params params, + cudaStream_t stream); +template void infer(dense_storage_f64 forest, + predict_params params, + cudaStream_t stream); +template void infer(sparse_storage16_f32 forest, + predict_params params, + cudaStream_t stream); +template void infer(sparse_storage16_f64 forest, + predict_params params, + cudaStream_t stream); template void infer(sparse_storage8 forest, predict_params params, cudaStream_t stream); -// struct instantiate_infer { -// template -// void operator()(fil_node_t) -// { -// infer>( -// storage forest, predict_params params, cudaStream_t stream); -// } -// }; - -// template void instantiate_for_all_node_types(instantiate_infer); - } // namespace fil } // namespace ML diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 5223a141b3..0f39270588 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -409,8 +409,8 @@ struct categorical_sets { FIL will reject a model where an integer within [0, fid_num_cats] cannot be represented precisely as a 32-bit float. */ - return static_cast(category) < fid_num_cats[node.fid()] && - category >= 0.0f && + using real_t = typename node_t::real_t; + return category < static_cast(fid_num_cats[node.fid()]) && category >= real_t(0) && fetch_bit(bits + node.set(), static_cast(static_cast(category))); } static int sizeof_mask_from_num_cats(int num_cats) From 98b997a0ea934db899b12b12881a375563c4801b Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 17 Mar 2022 23:27:07 +0100 Subject: [PATCH 28/45] Small changes. --- cpp/src/fil/fil.cu | 38 +++++++++++++++++++------------------- cpp/src/fil/internal.cuh | 4 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 49e98f9e8b..43ec9fcad4 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -144,12 +144,10 @@ struct forest { virtual ~forest() {} - int num_trees_ = 0; - int depth_ = 0; - algo_t algo_ = algo_t::NAIVE; - output_t output_ = output_t::RAW; - double threshold_ = 0.5; - double global_bias_ = 0; + int num_trees_ = 0; + int depth_ = 0; + algo_t algo_ = algo_t::NAIVE; + output_t output_ = output_t::RAW; shmem_size_params class_ssp_, proba_ssp_; int fixed_block_count_ = 0; int max_shm_ = 0; @@ -171,8 +169,8 @@ struct template_forest : public forest { num_trees_ = params->num_trees; algo_ = params->algo; output_ = params->output; - threshold_ = params->threshold; - global_bias_ = params->global_bias; + threshold_ = static_cast(params->threshold); + global_bias_ = static_cast(params->global_bias); proba_ssp_.n_items = params->n_items; proba_ssp_.log2_threads_per_tree = log2(params->threads_per_tree); proba_ssp_.leaf_algo = params->leaf_algo; @@ -298,7 +296,7 @@ struct template_forest : public forest { // Simulating treelite order, which cancels out bias. // If non-proba prediction used, it still will not matter // for the same reason softmax will not. 
- real_t global_bias = (ot & output_t::SOFTMAX) != 0 ? real_t(0.0) : global_bias_; + real_t global_bias = (ot & output_t::SOFTMAX) != 0 ? real_t(0) : global_bias_; bool complement_proba = false, do_transform; if (predict_proba) { @@ -316,24 +314,24 @@ struct template_forest : public forest { ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; do_transform = - (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0.0); + (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0); break; case leaf_algo_t::CATEGORICAL_LEAF: params.num_outputs = params.num_classes; - do_transform = ot != output_t::RAW || global_bias_ != real_t(0.0); + do_transform = ot != output_t::RAW || global_bias_ != real_t(0); break; case leaf_algo_t::VECTOR_LEAF: // for VECTOR_LEAF, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; do_transform = - (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0.0); + (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0); break; default: ASSERT(false, "internal error: predict: invalid leaf_algo %d", params.leaf_algo); } } else { if (params.leaf_algo == leaf_algo_t::FLOAT_UNARY_BINARY) { - do_transform = ot != output_t::RAW || global_bias_ != 0.0f; + do_transform = ot != output_t::RAW || global_bias_ != real_t(0); } else { // GROVE_PER_CLASS, CATEGORICAL_LEAF: moot since choosing best class and // all transforms are monotonic. also, would break current code @@ -352,9 +350,9 @@ struct template_forest : public forest { preds, num_values_to_transform, ot, - num_trees_ > 0 ? (real_t(1.0) / num_trees_) : real_t(1.0), - real_t(threshold_), - real_t(global_bias), + num_trees_ > 0 ? (real_t(1) / num_trees_) : real_t(1), + threshold_, + global_bias, complement_proba); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -366,6 +364,8 @@ struct template_forest : public forest { forest::free(h); } + real_t threshold_ = 0.5; + real_t global_bias_ = 0; // vector_leaf_ is only used if {class,proba}_ssp_.leaf_algo is VECTOR_LEAF, // otherwise it is empty rmm::device_uvector vector_leaf_; @@ -460,7 +460,7 @@ struct dense_forest> : template_forest { virtual void infer(predict_params params, cudaStream_t stream) override { storage forest(this->cat_sets_.accessor(), - reinterpret_cast(this->vector_leaf_.data()), + this->vector_leaf_.data(), nodes_.data(), this->num_trees_, this->algo_ == algo_t::NAIVE ? tree_num_nodes(this->depth_) : 1, @@ -522,7 +522,7 @@ struct sparse_forest : template_forest { virtual void infer(predict_params params, cudaStream_t stream) override { storage forest(this->cat_sets_.accessor(), - reinterpret_cast(this->vector_leaf_.data()), + this->vector_leaf_.data(), trees_.data(), nodes_.data(), this->num_trees_); @@ -678,7 +678,7 @@ void free(const raft::handle_t& h, forest_t f) delete f; } -/// part of C API - overload instead of template +/// part of C API - preds and data are either both pointers to float or to double void predict(const raft::handle_t& h, forest_t f, void* preds, diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 0f39270588..37b600d201 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -242,7 +242,7 @@ struct node_traits { using forest = sparse_forest; static const bool IS_DENSE = false; static const storage_type_t storage_type_enum = - std::is_same, node_t>() ? SPARSE : SPARSE8; + std::is_same, node_t>() ? 
SPARSE : SPARSE8; template static void check(const treelite::ModelImpl& model); }; @@ -431,7 +431,7 @@ struct tree_base { template __host__ __device__ __forceinline__ int child_index(const node_t& node, int node_idx, - float val) const + typename node_t::real_t val) const { bool cond; From 8c84cf7d0663ee8ff764792a2c1cb694eff70e6b Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Fri, 18 Mar 2022 03:14:00 +0100 Subject: [PATCH 29/45] Templatized BaseFilTest. --- cpp/test/sg/fil_test.cu | 195 +++++++++++++++++++++------------------- 1 file changed, 103 insertions(+), 92 deletions(-) diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 74ab0c4d00..893c6bf92a 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -131,7 +131,8 @@ std::ostream& operator<<(std::ostream& os, const FilTestParams& ps) return os; } -__global__ void nan_kernel(float* data, const bool* mask, int len, float nan) +template +__global__ void nan_kernel(real_t* data, const bool* mask, int len, real_t nan) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= len) return; @@ -139,6 +140,7 @@ __global__ void nan_kernel(float* data, const bool* mask, int len, float nan) } float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } +double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); } void hard_clipped_bernoulli( raft::random::Rng rng, float* d, std::size_t n_vals, float prob_of_zero, cudaStream_t stream) @@ -154,23 +156,24 @@ void hard_clipped_bernoulli( }); } +template struct replace_some_floating_with_categorical { float* fid_num_cats_d; int num_cols; - __device__ float operator()(float data, int data_idx) + __device__ real_t operator()(real_t data, int data_idx) { - float fid_num_cats = fid_num_cats_d[data_idx % num_cols]; - if (fid_num_cats == 0.0f) return data; + real_t fid_num_cats = fid_num_cats_d[data_idx % num_cols]; + if (fid_num_cats == real_t(0)) return data; // Transform `data` from (uniform on) [-1.0, 1.0] into [-fid_num_cats-3, fid_num_cats+3]. - float tmp = data * (fid_num_cats + 3.0f); + real_t tmp = data * (fid_num_cats + real_t(3)); // Also test invalid (negative and above fid_num_cats) categories: samples within // [fid_num_cats+2.5, fid_num_cats+3) and opposite will test infinite floats as categorical. - if (tmp + fid_num_cats < -2.5f) return -INFINITY; - if (tmp - fid_num_cats > +2.5f) return +INFINITY; + if (tmp + fid_num_cats < real_t(-2.5f)) return -INFINITY; + if (tmp - fid_num_cats > real_t(+2.5f)) return +INFINITY; // Samples within [fid_num_cats+2, fid_num_cats+2.5) (and their negative counterparts) will // test huge invalid categories. - if (tmp + fid_num_cats < -2.0f) tmp -= MAX_FIL_INT_FLOAT; - if (tmp - fid_num_cats > +2.0f) tmp += MAX_FIL_INT_FLOAT; + if (tmp + fid_num_cats < real_t(-2.0f)) tmp -= MAX_FIL_INT_FLOAT; + if (tmp - fid_num_cats > real_t(+2.0f)) tmp += MAX_FIL_INT_FLOAT; // Samples within [0, fid_num_cats+2) will be valid categories, rounded towards 0 with a cast. // Negative categories are always invalid. 
For correct interpretation, see // cpp/src/fil/internal.cuh `int category_matches(node_t node, float category)` @@ -178,7 +181,8 @@ struct replace_some_floating_with_categorical { } }; -__global__ void floats_to_bit_stream_k(uint8_t* dst, float* src, std::size_t size) +template +__global__ void floats_to_bit_stream_k(uint8_t* dst, real_t* src, std::size_t size) { std::size_t idx = std::size_t(blockIdx.x) * blockDim.x + threadIdx.x; if (idx >= size) return; @@ -221,6 +225,7 @@ void adjust_threshold_to_treelite( } } +template class BaseFilTest : public testing::TestWithParam { public: BaseFilTest() @@ -252,9 +257,9 @@ class BaseFilTest : public testing::TestWithParam { size_t num_nodes = forest_num_nodes(); // helper data - /// weights, used as float* or int* - rmm::device_uvector weights_d(num_nodes, stream); - rmm::device_uvector thresholds_d(num_nodes, stream); + rmm::device_uvector weights_int_d(num_nodes, stream); + rmm::device_uvector weights_real_d(num_nodes, stream); + rmm::device_uvector thresholds_d(num_nodes, stream); rmm::device_uvector def_lefts_d(num_nodes, stream); rmm::device_uvector is_leafs_d(num_nodes, stream); rmm::device_uvector is_categoricals_d(num_nodes, stream); @@ -266,25 +271,25 @@ class BaseFilTest : public testing::TestWithParam { raft::random::Rng r(ps.seed); if (ps.leaf_algo == fil::leaf_algo_t::CATEGORICAL_LEAF) { // [0..num_classes) - r.uniformInt((int*)weights_d.data(), num_nodes, 0, ps.num_classes, stream); + r.uniformInt(weights_int_d.data(), num_nodes, 0, ps.num_classes, stream); } else if (ps.leaf_algo == fil::leaf_algo_t::VECTOR_LEAF) { std::mt19937 gen(3); - std::uniform_real_distribution<> dist(0, 1); + std::uniform_real_distribution dist(0, 1); vector_leaf.resize(num_nodes * ps.num_classes); for (size_t i = 0; i < vector_leaf.size(); i++) { vector_leaf[i] = dist(gen); } // Normalise probabilities to 1 for (size_t i = 0; i < vector_leaf.size(); i += ps.num_classes) { - auto sum = std::accumulate(&vector_leaf[i], &vector_leaf[i + ps.num_classes], 0.0f); + auto sum = std::accumulate(&vector_leaf[i], &vector_leaf[i + ps.num_classes], real_t(0)); for (size_t j = i; j < i + ps.num_classes; j++) { vector_leaf[j] /= sum; } } } else { - r.uniform((float*)weights_d.data(), num_nodes, -1.0f, 1.0f, stream); + r.uniform(weights_real_d.data(), num_nodes, real_t(-1), real_t(1), stream); } - r.uniform(thresholds_d.data(), num_nodes, -1.0f, 1.0f, stream); + r.uniform(thresholds_d.data(), num_nodes, real_t(-1), real_t(1), stream); r.uniformInt(fids_d.data(), num_nodes, 0, ps.num_cols, stream); r.bernoulli(def_lefts_d.data(), num_nodes, 0.5f, stream); r.bernoulli(is_leafs_d.data(), num_nodes, 1.0f - ps.leaf_prob, stream); @@ -292,8 +297,9 @@ class BaseFilTest : public testing::TestWithParam { r, is_categoricals_d.data(), num_nodes, 1.0f - ps.node_categorical_prob, stream); // copy data to host - std::vector thresholds_h(num_nodes), is_categoricals_h(num_nodes); - std::vector weights_h(num_nodes), fids_h(num_nodes), node_cat_set(num_nodes); + std::vector thresholds_h(num_nodes), weights_real_h(num_nodes); + std::vector is_categoricals_h(num_nodes); + std::vector weights_int_h(num_nodes), fids_h(num_nodes), node_cat_set(num_nodes); std::vector fid_num_cats_h(ps.num_cols); std::vector feature_categorical(ps.num_cols); // bool vectors are not guaranteed to be stored byte-per-value @@ -320,7 +326,8 @@ class BaseFilTest : public testing::TestWithParam { cat_sets_h.fid_num_cats[fid] = 0.0f; } } - raft::update_host(weights_h.data(), (int*)weights_d.data(), num_nodes, stream); 
+ raft::update_host(weights_int_h.data(), weights_int_d.data(), num_nodes, stream); + raft::update_host(weights_real_h.data(), weights_real_d.data(), num_nodes, stream); raft::update_host(thresholds_h.data(), thresholds_d.data(), num_nodes, stream); raft::update_host(fids_h.data(), fids_d.data(), num_nodes, stream); raft::update_host(def_lefts_h, def_lefts_d.data(), num_nodes, stream); @@ -346,7 +353,7 @@ class BaseFilTest : public testing::TestWithParam { if (!feature_categorical[fid] || is_leafs_h[node_id]) is_categoricals_h[node_id] = 0.0f; - if (is_categoricals_h[node_id] == 1.0) { + if (is_categoricals_h[node_id] == 1.0f) { // might allocate a categorical set for an unreachable inner node. That's OK. ++cat_sets_h.n_nodes[fid]; node_cat_set[node_id] = bit_pool_size; @@ -378,27 +385,27 @@ class BaseFilTest : public testing::TestWithParam { // initialize nodes nodes.resize(num_nodes); for (size_t i = 0; i < num_nodes; ++i) { - fil::val_t w; + fil::val_t w; switch (ps.leaf_algo) { - case fil::leaf_algo_t::CATEGORICAL_LEAF: w.idx = weights_h[i]; break; + case fil::leaf_algo_t::CATEGORICAL_LEAF: w.idx = weights_int_h[i]; break; case fil::leaf_algo_t::FLOAT_UNARY_BINARY: case fil::leaf_algo_t::GROVE_PER_CLASS: // not relying on fil::val_t internals // merely that we copied floats into weights_h earlier - std::memcpy(&w.f, &weights_h[i], sizeof w.f); + w.f = weights_real_h[i]; break; case fil::leaf_algo_t::VECTOR_LEAF: w.idx = i; break; default: ASSERT(false, "internal error: invalid ps.leaf_algo"); } // make sure nodes are categorical only when their feature ID is categorical bool is_categorical = is_categoricals_h[i] == 1.0f; - val_t split; + val_t split; if (is_categorical) split.idx = node_cat_set[i]; else split.f = thresholds_h[i]; nodes[i] = - fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical); + fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical); } // clean up @@ -416,17 +423,18 @@ class BaseFilTest : public testing::TestWithParam { // generate random data raft::random::Rng r(ps.seed); - r.uniform(data_d.data(), num_data, -1.0f, 1.0f, stream); - thrust::transform(thrust::cuda::par.on(stream), - data_d.data(), - data_d.data() + num_data, - thrust::counting_iterator(0), - data_d.data(), - replace_some_floating_with_categorical{fid_num_cats_d.data(), ps.num_cols}); + r.uniform(data_d.data(), num_data, real_t(-1), real_t(1), stream); + thrust::transform( + thrust::cuda::par.on(stream), + data_d.data(), + data_d.data() + num_data, + thrust::counting_iterator(0), + data_d.data(), + replace_some_floating_with_categorical{fid_num_cats_d.data(), ps.num_cols}); r.bernoulli(mask_d.data(), num_data, ps.nan_prob, stream); int tpb = 256; nan_kernel<<>>( - data_d.data(), mask_d.data(), num_data, std::numeric_limits::quiet_NaN()); + data_d.data(), mask_d.data(), num_data, std::numeric_limits::quiet_NaN()); RAFT_CUDA_TRY(cudaPeekAtLastError()); // copy to host @@ -435,48 +443,48 @@ class BaseFilTest : public testing::TestWithParam { handle.sync_stream(); } - void apply_softmax(float* class_scores) + void apply_softmax(real_t* class_scores) { - float max = *std::max_element(class_scores, &class_scores[ps.num_classes]); + real_t max = *std::max_element(class_scores, &class_scores[ps.num_classes]); for (int i = 0; i < ps.num_classes; ++i) - class_scores[i] = expf(class_scores[i] - max); - float sum = std::accumulate(class_scores, &class_scores[ps.num_classes], 0.0f); + class_scores[i] = exp(class_scores[i] - max); + real_t sum = 
std::accumulate(class_scores, &class_scores[ps.num_classes], real_t(0)); for (int i = 0; i < ps.num_classes; ++i) class_scores[i] /= sum; } - void transform(float f, float& proba, float& output) + void transform(real_t f, real_t& proba, real_t& output) { if ((ps.output & fil::output_t::AVG) != 0) { if (ps.leaf_algo == fil::leaf_algo_t::GROVE_PER_CLASS) { f /= ps.num_trees / ps.num_classes; } else { - f *= 1.0f / ps.num_trees; + f *= real_t(1) / ps.num_trees; } } f += ps.global_bias; if ((ps.output & fil::output_t::SIGMOID) != 0) { f = sigmoid(f); } proba = f; - if ((ps.output & fil::output_t::CLASS) != 0) { f = f > ps.threshold ? 1.0f : 0.0f; } + if ((ps.output & fil::output_t::CLASS) != 0) { f = f > ps.threshold ? real_t(1) : real_t(0); } output = f; } - void complement(float* proba) { proba[0] = 1.0f - proba[1]; } + void complement(real_t* proba) { proba[0] = real_t(1) - proba[1]; } void predict_on_cpu() { auto stream = handle.get_stream(); // predict on host - std::vector want_preds_h(ps.num_preds_outputs()); + std::vector want_preds_h(ps.num_preds_outputs()); want_proba_h.resize(ps.num_proba_outputs()); int num_nodes = tree_num_nodes(); - std::vector class_scores(ps.num_classes); + std::vector class_scores(ps.num_classes); // we use tree_base::child_index() on CPU tree_base base{cat_sets_h.accessor()}; switch (ps.leaf_algo) { case fil::leaf_algo_t::FLOAT_UNARY_BINARY: for (int i = 0; i < ps.num_rows; ++i) { - float pred = 0.0f; + real_t pred = 0; for (int j = 0; j < ps.num_trees; ++j) { pred += infer_one_tree(&nodes[j * num_nodes], &data_h[i * ps.num_cols], base).f; } @@ -486,7 +494,7 @@ class BaseFilTest : public testing::TestWithParam { break; case fil::leaf_algo_t::GROVE_PER_CLASS: for (int row = 0; row < ps.num_rows; ++row) { - std::fill(class_scores.begin(), class_scores.end(), 0.0f); + std::fill(class_scores.begin(), class_scores.end(), real_t(0)); for (int tree = 0; tree < ps.num_trees; ++tree) { class_scores[tree % ps.num_classes] += infer_one_tree(&nodes[tree * num_nodes], &data_h[row * ps.num_cols], base).f; @@ -494,7 +502,7 @@ class BaseFilTest : public testing::TestWithParam { want_preds_h[row] = std::max_element(class_scores.begin(), class_scores.end()) - class_scores.begin(); for (int c = 0; c < ps.num_classes; ++c) { - float thresholded_proba; // not used; + real_t thresholded_proba; // not used; transform(class_scores[c], want_proba_h[row * ps.num_classes + c], thresholded_proba); } if ((ps.output & fil::output_t::SOFTMAX) != 0) @@ -511,7 +519,7 @@ class BaseFilTest : public testing::TestWithParam { ++class_votes[class_label]; } for (int c = 0; c < ps.num_classes; ++c) { - float thresholded_proba; // not used; do argmax instead + real_t thresholded_proba; // not used; do argmax instead transform(class_votes[c], want_proba_h[r * ps.num_classes + c], thresholded_proba); } want_preds_h[r] = @@ -521,16 +529,16 @@ class BaseFilTest : public testing::TestWithParam { } case fil::leaf_algo_t::VECTOR_LEAF: for (int r = 0; r < ps.num_rows; ++r) { - std::vector class_probabilities(ps.num_classes); + std::vector class_probabilities(ps.num_classes); for (int j = 0; j < ps.num_trees; ++j) { int vector_index = infer_one_tree(&nodes[j * num_nodes], &data_h[r * ps.num_cols], base).idx; - float sum = 0.0; + real_t sum = 0; for (int k = 0; k < ps.num_classes; k++) { class_probabilities[k] += vector_leaf[vector_index * ps.num_classes + k]; sum += vector_leaf[vector_index * ps.num_classes + k]; } - ASSERT_LE(std::abs(sum - 1.0f), 1e-5); + ASSERT_LE(std::abs(sum - real_t(1)), real_t(1e-5)); 
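          // note (added comment): each leaf's probability vector was normalised to
          // sum to 1 when the forest was generated, so drift beyond rounding error
          // here points at indexing the wrong leaf vector rather than at numerics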
} for (int c = 0; c < ps.num_classes; ++c) { @@ -564,8 +572,9 @@ class BaseFilTest : public testing::TestWithParam { // predict preds_d.resize(ps.num_preds_outputs(), stream); proba_d.resize(ps.num_proba_outputs(), stream); - fil::predict(handle, forest, preds_d.data(), data_d.data(), ps.num_rows); - fil::predict(handle, forest, proba_d.data(), data_d.data(), ps.num_rows, true); + fil::predict(handle, forest, (void*)preds_d.data(), (const void*)data_d.data(), ps.num_rows); + fil::predict( + handle, forest, (void*)proba_d.data(), (const void*)data_d.data(), ps.num_rows, true); handle.sync_stream(); // cleanup @@ -577,29 +586,31 @@ class BaseFilTest : public testing::TestWithParam { ASSERT_TRUE(raft::devArrMatch(want_proba_d.data(), proba_d.data(), ps.num_proba_outputs(), - raft::CompareApprox(ps.tolerance), + raft::CompareApprox(ps.tolerance), stream)); float tolerance = ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY ? ps.tolerance - : std::numeric_limits::epsilon(); + : std::numeric_limits::epsilon(); // in multi-class prediction, floats represent the most likely class // and would be generated by converting an int to float ASSERT_TRUE(raft::devArrMatch(want_preds_d.data(), preds_d.data(), ps.num_rows, - raft::CompareApprox(tolerance), + raft::CompareApprox(tolerance), stream)); } - fil::val_t infer_one_tree(fil::dense_node* root, float* data, const tree_base& tree) + fil::val_t infer_one_tree(fil::dense_node* root, + real_t* data, + const tree_base& tree) { int curr = 0; - fil::val_t output{.f = 0.0f}; + fil::val_t output{.f = 0.0f}; for (;;) { - const fil::dense_node& node = root[curr]; - if (node.is_leaf()) return node.template output>(); - float val = data[node.fid()]; - curr = tree.child_index(node, curr, val); + const fil::dense_node& node = root[curr]; + if (node.is_leaf()) return node.template output>(); + real_t val = data[node.fid()]; + curr = tree.child_index(node, curr, val); } return output; } @@ -614,26 +625,26 @@ class BaseFilTest : public testing::TestWithParam { cudaStream_t stream = 0; // predictions - rmm::device_uvector preds_d; - rmm::device_uvector proba_d; - rmm::device_uvector want_preds_d; - rmm::device_uvector want_proba_d; + rmm::device_uvector preds_d; + rmm::device_uvector proba_d; + rmm::device_uvector want_preds_d; + rmm::device_uvector want_proba_d; // input data - rmm::device_uvector data_d; - std::vector data_h; - std::vector want_proba_h; + rmm::device_uvector data_d; + std::vector data_h; + std::vector want_proba_h; // forest data - std::vector> nodes; - std::vector vector_leaf; + std::vector> nodes; + std::vector vector_leaf; cat_sets_owner cat_sets_h; rmm::device_uvector fids_d = rmm::device_uvector(0, cudaStream_t()); rmm::device_uvector fid_num_cats_d = rmm::device_uvector(0, cudaStream_t()); }; template -class BasePredictFilTest : public BaseFilTest { +class BasePredictFilTest : public BaseFilTest { protected: void dense2sparse_node(const fil::dense_node* dense_root, int i_dense, @@ -673,8 +684,8 @@ class BasePredictFilTest : public BaseFilTest { void dense2sparse() { - for (int tree = 0; tree < ps.num_trees; ++tree) { - dense2sparse_tree(&nodes[tree * tree_num_nodes()]); + for (int tree = 0; tree < this->ps.num_trees; ++tree) { + dense2sparse_tree(&this->nodes[tree * this->tree_num_nodes()]); } } @@ -686,31 +697,31 @@ class BasePredictFilTest : public BaseFilTest { dense2sparse(); init_nodes = sparse_nodes; } else { - init_nodes = nodes; + init_nodes = this->nodes; } ASSERT(init_nodes.size() < std::size_t(INT_MAX), "generated too many 
nodes"); // init FIL model fil::forest_params_t fil_params = { .num_nodes = static_cast(init_nodes.size()), - .depth = ps.depth, - .num_trees = ps.num_trees, - .num_cols = ps.num_cols, - .leaf_algo = ps.leaf_algo, - .algo = ps.algo, - .output = ps.output, - .threshold = ps.threshold, - .global_bias = ps.global_bias, - .num_classes = ps.num_classes, - .blocks_per_sm = ps.blocks_per_sm, - .threads_per_tree = ps.threads_per_tree, - .n_items = ps.n_items, + .depth = this->ps.depth, + .num_trees = this->ps.num_trees, + .num_cols = this->ps.num_cols, + .leaf_algo = this->ps.leaf_algo, + .algo = this->ps.algo, + .output = this->ps.output, + .threshold = this->ps.threshold, + .global_bias = this->ps.global_bias, + .num_classes = this->ps.num_classes, + .blocks_per_sm = this->ps.blocks_per_sm, + .threads_per_tree = this->ps.threads_per_tree, + .n_items = this->ps.n_items, }; - fil::init(handle, + fil::init(this->handle, pforest, - cat_sets_h.accessor(), - vector_leaf, + this->cat_sets_h.accessor(), + this->vector_leaf, trees.data(), init_nodes.data(), &fil_params); @@ -723,7 +734,7 @@ typedef BasePredictFilTest> PredictDenseFilTest; typedef BasePredictFilTest> PredictSparse16FilTest; typedef BasePredictFilTest PredictSparse8FilTest; -class TreeliteFilTest : public BaseFilTest { +class TreeliteFilTest : public BaseFilTest { protected: /** adds nodes[node] of tree starting at index root to builder at index at *pkey, increments *pkey, From 05de38dd89a7cdff0fdaf851365081692ee5ba9b Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Fri, 18 Mar 2022 22:42:30 +0100 Subject: [PATCH 30/45] Templatized child_index tests, added float64-only tests. --- cpp/test/sg/fil_child_index_test.cu | 135 +++++++++++++++++----------- 1 file changed, 81 insertions(+), 54 deletions(-) diff --git a/cpp/test/sg/fil_child_index_test.cu b/cpp/test/sg/fil_child_index_test.cu index 75eca6cba2..3425c51190 100644 --- a/cpp/test/sg/fil_child_index_test.cu +++ b/cpp/test/sg/fil_child_index_test.cu @@ -38,28 +38,39 @@ struct proto_inner_node { bool is_categorical = false; // see base_node::is_categorical int fid = 0; // feature id, see base_node::fid int set = 0; // which bit set represents the matching category list - float thresh = 0.0f; // threshold, see base_node::thresh + double thresh = 0.0; // threshold, see base_node::thresh int left = 1; // left child idx, see sparse_node*::left_index() - val_t split() + template + val_t split() { - val_t split; + val_t split; if (is_categorical) split.idx = set; + else if (std::isnan(thresh)) + split.f = std::numeric_limits::quiet_NaN(); else - split.f = thresh; + split.f = static_cast(thresh); return split; } + operator dense_node() + { + return dense_node({}, split(), fid, def_left, false, is_categorical); + } + operator dense_node() + { + return dense_node({}, split(), fid, def_left, false, is_categorical); + } operator sparse_node16() { - return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); + return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); } - operator sparse_node8() + operator sparse_node16() { - return sparse_node8({}, split(), fid, def_left, false, is_categorical, left); + return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); } - operator dense_node() + operator sparse_node8() { - return dense_node({}, split(), fid, def_left, false, is_categorical); + return sparse_node8({}, split(), fid, def_left, false, is_categorical, left); } }; @@ -108,8 +119,9 @@ struct ChildIndexTestParams { proto_inner_node node; 
  int parent_node_idx = 0;
  cat_sets_owner cso;
-  float input = 0.0f;
-  int correct = INT_MAX;
+  double input  = 0.0;
+  int correct   = INT_MAX;
+  bool skip_f32 = false;  // if true, the test only runs for float64
 };

 std::ostream& operator<<(std::ostream& os, const ChildIndexTestParams& ps)
@@ -136,29 +148,36 @@ std::ostream& operator<<(std::ostream& os, const ChildIndexTestParams& ps)
 template <typename fil_node_t>
 class ChildIndexTest : public testing::TestWithParam<ChildIndexTestParams> {
+  using real_t = typename fil_node_t::real_t;
+
 protected:
  void check()
  {
    ChildIndexTestParams param = GetParam();
+
+   // skip tests that require float64 to work correctly
+   if (std::is_same_v<real_t, float> && param.skip_f32) return;
+
    tree_base tree{param.cso.accessor()};
-   if (!std::is_same<fil_node_t, fil::dense_node<float>>::value) {
+   if (!std::is_same_v<fil_node_t, fil::dense_node<real_t>>) {
      // test that the logic uses node.left instead of parent_node_idx
      param.node.left       = param.parent_node_idx * 2 + 1;
      param.parent_node_idx = INT_MIN;
    }
+   real_t input = isnan(param.input) ? std::numeric_limits<real_t>::quiet_NaN()
+                                     : static_cast<real_t>(param.input);
    // nan -> !def_left, categorical -> if matches, numerical -> input >= threshold
-   int test_idx =
-     tree.child_index<true>((fil_node_t)param.node, param.parent_node_idx, param.input);
-   ASSERT(test_idx == param.correct,
-          "child index test: actual %d != correct %d",
-          test_idx,
-          param.correct);
+   int test_idx = tree.child_index<true>((fil_node_t)param.node, param.parent_node_idx, input);
+   ASSERT_EQ(test_idx, param.correct)
+     << "child index test: actual " << test_idx << " != correct " << param.correct;
  }
};

-typedef ChildIndexTest<fil::dense_node<float>> ChildIndexTestDense;
-typedef ChildIndexTest<fil::sparse_node16<float>> ChildIndexTestSparse16;
-typedef ChildIndexTest<fil::sparse_node8> ChildIndexTestSparse8;
+using ChildIndexTestDenseFloat32    = ChildIndexTest<fil::dense_node<float>>;
+using ChildIndexTestDenseFloat64    = ChildIndexTest<fil::dense_node<double>>;
+using ChildIndexTestSparse16Float32 = ChildIndexTest<fil::sparse_node16<float>>;
+using ChildIndexTestSparse16Float64 = ChildIndexTest<fil::sparse_node16<double>>;
+using ChildIndexTestSparse8         = ChildIndexTest<fil::sparse_node8>;

/* for dense nodes, left (false) == parent * 2 + 1, right (true) == parent * 2 + 2 E.g.
see tree below: @@ -168,48 +187,52 @@ typedef ChildIndexTest ChildIndexTestSparse8; 3 -> 7, 8 4 -> 9, 10 */ -const float INF = std::numeric_limits::infinity(); +const double INF = std::numeric_limits::infinity(); +const double QNAN = std::numeric_limits::quiet_NaN(); std::vector params = { - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = -INF, correct = 1), // val !>= thresh - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = 0.0f, correct = 2), // val >= thresh - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = +INF, correct = 2), // val >= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0), input = -INF, correct = 1), // val !>= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0), input = 0.0, correct = 2), // val >= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0), input = +INF, correct = 2), // val >= thresh + // the following two tests only work for float64 + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0), input = -1e-50, correct = 1, skip_f32 = true), + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 1e-50), input = 0.0, correct = 1, skip_f32 = true), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 1.0f), input = -3.141592f, correct = 1), // val !>= thresh - CHILD_INDEX_TEST_PARAMS( // val >= thresh (e**pi > pi**e) - node = NODE(thresh = 22.459158f), - input = 23.140693f, + node = NODE(thresh = 1.0), input = -3.141592, correct = 1), // val !>= thresh + CHILD_INDEX_TEST_PARAMS( // val >= thresh (e**pi > pi**e) + node = NODE(thresh = 22.459158), + input = 23.140693, correct = 2), CHILD_INDEX_TEST_PARAMS( // val >= thresh for both negative - node = NODE(thresh = -0.37f), - input = -0.36f, - correct = 2), // val >= thresh - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = -INF), input = 0.36f, correct = 2), // val >= thresh - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = NAN, correct = 2), // !def_left - CHILD_INDEX_TEST_PARAMS(node = NODE(def_left = true), input = NAN, correct = 1), // !def_left - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = NAN), input = NAN, correct = 2), // !def_left + node = NODE(thresh = -0.37), + input = -0.36, + correct = 2), // val >= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = -INF), input = 0.36, correct = 2), // val >= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = QNAN, correct = 2), // !def_left + CHILD_INDEX_TEST_PARAMS(node = NODE(def_left = true), input = QNAN, correct = 1), // !def_left + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = QNAN), input = QNAN, correct = 2), // !def_left CHILD_INDEX_TEST_PARAMS( - node = NODE(def_left = true, thresh = NAN), input = NAN, correct = 1), // !def_left - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = NAN), input = 0.0f, correct = 1), // val !>= thresh + node = NODE(def_left = true, thresh = QNAN), input = QNAN, correct = 1), // !def_left + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = QNAN), input = 0.0, correct = 1), // val !>= thresh CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 1, input = -INF, correct = 3), + node = NODE(thresh = 0.0), parent_node_idx = 1, input = -INF, correct = 3), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 1, input = 0.0f, correct = 4), + node = NODE(thresh = 0.0), parent_node_idx = 1, input = 0.0f, correct = 4), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 2, input = -INF, correct = 5), + node = NODE(thresh = 0.0), parent_node_idx = 2, input = -INF, correct = 5), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 
0.0f), parent_node_idx = 2, input = 0.0f, correct = 6), + node = NODE(thresh = 0.0), parent_node_idx = 2, input = 0.0f, correct = 6), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 3, input = -INF, correct = 7), + node = NODE(thresh = 0.0), parent_node_idx = 3, input = -INF, correct = 7), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 3, input = 0.0f, correct = 8), + node = NODE(thresh = 0.0), parent_node_idx = 3, input = 0.0f, correct = 8), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 4, input = -INF, correct = 9), + node = NODE(thresh = 0.0), parent_node_idx = 4, input = -INF, correct = 9), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 4, input = 0.0f, correct = 10), - CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = NAN, correct = 10), // !def_left + node = NODE(thresh = 0.0), parent_node_idx = 4, input = 0.0, correct = 10), + CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = QNAN, correct = 10), // !def_left CHILD_INDEX_TEST_PARAMS( - node = NODE(def_left = true), input = NAN, parent_node_idx = 4, correct = 9), // !def_left + node = NODE(def_left = true), input = QNAN, parent_node_idx = 4, correct = 9), // !def_left // cannot match ( < 0 and realistic fid_num_cats) CHILD_INDEX_TEST_PARAMS(node = NODE(is_categorical = true), cso.bits = {}, @@ -282,21 +305,25 @@ std::vector params = { CHILD_INDEX_TEST_PARAMS(node = NODE(is_categorical = true, def_left = true), cso.bits = {0b0000'0101}, cso.fid_num_cats = {3.0f}, - input = NAN, + input = QNAN, correct = 1), // default right CHILD_INDEX_TEST_PARAMS(node = NODE(is_categorical = true, def_left = false), cso.bits = {0b0000'0101}, cso.fid_num_cats = {3.0f}, - input = NAN, + input = QNAN, correct = 2), }; -TEST_P(ChildIndexTestDense, Predict) { check(); } -TEST_P(ChildIndexTestSparse16, Predict) { check(); } +TEST_P(ChildIndexTestDenseFloat32, Predict) { check(); } +TEST_P(ChildIndexTestDenseFloat64, Predict) { check(); } +TEST_P(ChildIndexTestSparse16Float32, Predict) { check(); } +TEST_P(ChildIndexTestSparse16Float64, Predict) { check(); } TEST_P(ChildIndexTestSparse8, Predict) { check(); } -INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestDense, testing::ValuesIn(params)); -INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse16, testing::ValuesIn(params)); +INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestDenseFloat32, testing::ValuesIn(params)); +INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestDenseFloat64, testing::ValuesIn(params)); +INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse16Float32, testing::ValuesIn(params)); +INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse16Float64, testing::ValuesIn(params)); INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse8, testing::ValuesIn(params)); } // namespace ML From 316e99a886bbefd6f650e8fb502c4d11fd9ba70b Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Sat, 19 Mar 2022 01:45:58 +0100 Subject: [PATCH 31/45] float64 versions of multi-sum and FIL predict tests. 
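multi_sum is now templatized on the element type T in addition to the radix R, so the same block-level reduction serves float32 and float64 forests. As a reference for the layout it assumes, here is a host-side sketch (illustrative only, not code from this patch) of the sum being computed: values to be added into the same output slot are strided by n_groups, per the comment in multi_sum.cuh.

    // out[g] accumulates the n_values elements of group g,
    // which sit at in[g + i * n_groups] for i in [0, n_values)
    template <typename T>
    void multi_sum_reference(const T* in, T* out, int n_groups, int n_values)
    {
      for (int g = 0; g < n_groups; ++g) {
        T acc = T();
        for (int i = 0; i < n_values; ++i)
          acc += in[g + i * n_groups];
        out[g] = acc;
      }
    }

This is intended to mirror what serial_multi_sum checks against in multi_sum_test.cu.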
--- cpp/include/cuml/fil/multi_sum.cuh | 2 +- cpp/test/sg/fil_test.cu | 46 ++++++++++++++++++-------- cpp/test/sg/multi_sum_test.cu | 53 +++++++++++++++++++++--------- 3 files changed, 71 insertions(+), 30 deletions(-) diff --git a/cpp/include/cuml/fil/multi_sum.cuh b/cpp/include/cuml/fil/multi_sum.cuh index 818ae225ec..d5d08766b9 100644 --- a/cpp/include/cuml/fil/multi_sum.cuh +++ b/cpp/include/cuml/fil/multi_sum.cuh @@ -34,7 +34,7 @@ @data[] values are ordered such that the stride is 1 for values belonging to the same group and @n_groups for values that are to be added together */ -template +template __device__ T multi_sum(T* data, int n_groups, int n_values) { T acc = threadIdx.x < n_groups * n_values ? data[threadIdx.x] : T(); diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 893c6bf92a..1531c1be74 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -645,17 +645,24 @@ class BaseFilTest : public testing::TestWithParam { template class BasePredictFilTest : public BaseFilTest { + using real_t = typename fil_node_t::real_t; + protected: - void dense2sparse_node(const fil::dense_node* dense_root, + void dense2sparse_node(const fil::dense_node* dense_root, int i_dense, int i_sparse_root, int i_sparse) { - const fil::dense_node& node = dense_root[i_dense]; + const fil::dense_node& node = dense_root[i_dense]; if (node.is_leaf()) { // leaf sparse node - sparse_nodes[i_sparse] = fil_node_t( - node.output>(), {}, node.fid(), node.def_left(), node.is_leaf(), false, 0); + sparse_nodes[i_sparse] = fil_node_t(node.template output>(), + {}, + node.fid(), + node.def_left(), + node.is_leaf(), + false, + 0); return; } // inner sparse node @@ -674,7 +681,7 @@ class BasePredictFilTest : public BaseFilTest { dense2sparse_node(dense_root, 2 * i_dense + 2, i_sparse_root, left_index + 1); } - void dense2sparse_tree(const fil::dense_node* dense_root) + void dense2sparse_tree(const fil::dense_node* dense_root) { int i_sparse_root = sparse_nodes.size(); sparse_nodes.push_back(fil_node_t()); @@ -730,9 +737,11 @@ class BasePredictFilTest : public BaseFilTest { std::vector trees; }; -typedef BasePredictFilTest> PredictDenseFilTest; -typedef BasePredictFilTest> PredictSparse16FilTest; -typedef BasePredictFilTest PredictSparse8FilTest; +using PredictDenseFloat32FilTest = BasePredictFilTest>; +using PredictDenseFloat64FilTest = BasePredictFilTest>; +using PredictSparse16Float32FilTest = BasePredictFilTest>; +using PredictSparse16Float64FilTest = BasePredictFilTest>; +using PredictSparse8FilTest = BasePredictFilTest; class TreeliteFilTest : public BaseFilTest { protected: @@ -1097,9 +1106,15 @@ std::vector predict_dense_inputs = { max_magnitude_of_matching_cat = 5), }; -TEST_P(PredictDenseFilTest, Predict) { compare(); } +TEST_P(PredictDenseFloat32FilTest, Predict) { compare(); } +TEST_P(PredictDenseFloat64FilTest, Predict) { compare(); } -INSTANTIATE_TEST_CASE_P(FilTests, PredictDenseFilTest, testing::ValuesIn(predict_dense_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + PredictDenseFloat32FilTest, + testing::ValuesIn(predict_dense_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + PredictDenseFloat64FilTest, + testing::ValuesIn(predict_dense_inputs)); std::vector predict_sparse_inputs = { FIL_TEST_PARAMS(), @@ -1175,10 +1190,15 @@ std::vector predict_sparse_inputs = { max_magnitude_of_matching_cat = 5), }; -TEST_P(PredictSparse16FilTest, Predict) { compare(); } +TEST_P(PredictSparse16Float32FilTest, Predict) { compare(); } +TEST_P(PredictSparse16Float64FilTest, Predict) { compare(); } 
-// Temporarily disabled, see https://github.com/rapidsai/cuml/issues/3205 -INSTANTIATE_TEST_CASE_P(FilTests, PredictSparse16FilTest, testing::ValuesIn(predict_sparse_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + PredictSparse16Float32FilTest, + testing::ValuesIn(predict_sparse_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + PredictSparse16Float64FilTest, + testing::ValuesIn(predict_sparse_inputs)); TEST_P(PredictSparse8FilTest, Predict) { compare(); } diff --git a/cpp/test/sg/multi_sum_test.cu b/cpp/test/sg/multi_sum_test.cu index 6d960b8a4d..5adb18a1fc 100644 --- a/cpp/test/sg/multi_sum_test.cu +++ b/cpp/test/sg/multi_sum_test.cu @@ -44,7 +44,7 @@ __device__ void serial_multi_sum(const T* in, T* out, int n_groups, int n_values } // the most threads a block can have -const int max_threads = 1024; +const int MAX_THREADS = 1024; struct MultiSumTestParams { int radix; // number of elements summed to 1 at each stage of the sum @@ -52,14 +52,22 @@ struct MultiSumTestParams { int n_values; // number of elements to add in each sum }; +template +struct multi_sum_test_shmem { + T work[MAX_THREADS]; + T correct_result[MAX_THREADS]; +}; + template -__device__ void test_single_radix(T thread_value, MultiSumTestParams p, int* block_error_flag) +__device__ void test_single_radix(multi_sum_test_shmem& s, + T thread_value, + MultiSumTestParams p, + int* block_error_flag) { - __shared__ T work[max_threads], correct_result[max_threads]; - work[threadIdx.x] = thread_value; - serial_multi_sum(work, correct_result, p.n_groups, p.n_values); - T sum = multi_sum(work, p.n_groups, p.n_values); - if (threadIdx.x < p.n_groups && 1e-4 < fabsf(sum - correct_result[threadIdx.x])) { + s.work[threadIdx.x] = thread_value; + serial_multi_sum(s.work, s.correct_result, p.n_groups, p.n_values); + T sum = multi_sum(s.work, p.n_groups, p.n_values); + if (threadIdx.x < p.n_groups && 1e-4 < fabsf(sum - s.correct_result[threadIdx.x])) { atomicAdd(block_error_flag, 1); } } @@ -67,13 +75,14 @@ __device__ void test_single_radix(T thread_value, MultiSumTestParams p, int* blo template __global__ void test_multi_sum_k(T* data, MultiSumTestParams* params, int* error_flags) { + __shared__ multi_sum_test_shmem s; MultiSumTestParams p = params[blockIdx.x]; switch (p.radix) { - case 2: test_single_radix<2>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; - case 3: test_single_radix<3>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; - case 4: test_single_radix<4>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; - case 5: test_single_radix<5>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; - case 6: test_single_radix<6>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 2: test_single_radix<2>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 3: test_single_radix<3>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 4: test_single_radix<4>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 5: test_single_radix<5>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 6: test_single_radix<6>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; } } @@ -142,12 +151,12 @@ std::vector block_sizes = []() { std::vector res; for (int i = 2; i < 50; ++i) res.push_back(i); - for (int i = max_threads - 50; i <= max_threads; ++i) + for (int i = MAX_THREADS - 50; i <= MAX_THREADS; ++i) res.push_back(i); return res; }(); -class MultiSumTestFloat : public MultiSumTest { +class MultiSumTestFloat32 : public MultiSumTest { public: void generate_data() { @@ 
-155,8 +164,20 @@ class MultiSumTestFloat : public MultiSumTest { r.uniform(data_d.data().get(), data_d.size(), -1.0f, 1.0f, cudaStreamDefault); } }; -TEST_P(MultiSumTestFloat, Import) { check(); } -INSTANTIATE_TEST_CASE_P(FilTests, MultiSumTestFloat, testing::ValuesIn(block_sizes)); +TEST_P(MultiSumTestFloat32, Import) { check(); } +INSTANTIATE_TEST_CASE_P(FilTests, MultiSumTestFloat32, testing::ValuesIn(block_sizes)); + +class MultiSumTestFloat64 : public MultiSumTest { + public: + void generate_data() + { + raft::random::Rng r(4321); + r.uniform(data_d.data().get(), data_d.size(), -1.0, 1.0, cudaStreamDefault); + } +}; + +TEST_P(MultiSumTestFloat64, Import) { check(); } +INSTANTIATE_TEST_CASE_P(FilTests, MultiSumTestFloat64, testing::ValuesIn(block_sizes)); class MultiSumTestInt : public MultiSumTest { public: From 2ba5eed15da51f0ef43729ca1b9334c6f6a48201 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Sat, 19 Mar 2022 03:23:51 +0100 Subject: [PATCH 32/45] compute_smem_footprint() uses float or double, based on sizeof_real. --- cpp/src/fil/infer.cu | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 233b1bec5d..ccc09edfda 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -863,7 +863,20 @@ size_t shmem_size_params::get_smem_footprint() template int compute_smem_footprint::run(predict_params ssp) { - return ssp.template get_smem_footprint(); + switch (ssp.sizeof_real) { + case 4: + return ssp + .template get_smem_footprint(); + case 8: + return ssp + .template get_smem_footprint(); + default: + ASSERT(ssp.sizeof_real == 4 || ssp.sizeof_real == 8, + "internal error: sizeof_real == %d, but must be 4 or 8", + static_cast(ssp.sizeof_real)); + // unreachable + return 0; + } } // make sure to instantiate all possible get_smem_footprint instantiations From ac92be785e788baf6dd88d8df8f41ac1593f3e9e Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 22 Mar 2022 13:36:19 +0100 Subject: [PATCH 33/45] Removed stray static_asserts. --- cpp/src/fil/fil.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index d1dc8cdc25..43ec9fcad4 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -379,8 +379,6 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); auto kernel = infer_k::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } From 92c44af6f32c76946056fab457a000216625f9ab Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 4 Apr 2022 20:14:59 +0200 Subject: [PATCH 34/45] Finish merge. 
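The conflict in common.cuh is resolved in favour of the branch-22.06 spelling: the member alias is real_type, not real_t_. Stripped of FIL specifics, the pattern that survives the merge is a partial specialization keyed on the node type, with an alias that lets generic code recover the floating-point type (self-contained illustration with stand-in names, not code from this patch):

    #include <type_traits>

    template <typename real_t> struct dense_node {};  // stand-in for FIL's dense node
    template <typename node_t> struct tree;           // primary template, left undefined

    template <typename real_t>
    struct tree<dense_node<real_t>> {  // the specialization the merge keeps
      using real_type = real_t;        // alias read back by generic code
    };

    static_assert(std::is_same_v<tree<dense_node<double>>::real_type, double>);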
--- cpp/src/fil/common.cuh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 3f31db199e..c4877216a8 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -46,15 +46,9 @@ struct storage_base { }; /** represents a dense tree */ -<<<<<<< HEAD -template -struct tree> : tree_base { - using real_t = real_t_; -======= template struct tree> : tree_base { using real_type = real_t; ->>>>>>> branch-22.06 __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { From 938e02a846ed1a111b6fa0ba0b951e27c560c19a Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 4 Apr 2022 21:13:00 +0200 Subject: [PATCH 35/45] Fixed compilation errors. --- cpp/test/sg/fil_test.cu | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index d08aaa22a1..a3cc8ed75d 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -162,7 +162,7 @@ struct replace_some_floating_with_categorical { int num_cols; __device__ real_t operator()(real_t data, int data_idx) { - real_t fid_num_cats = fid_num_cats_d[data_idx % num_cols]; + real_t fid_num_cats = static_cast(fid_num_cats_d[data_idx % num_cols]); if (fid_num_cats == real_t(0)) return data; // Transform `data` from (uniform on) [-1.0, 1.0] into [-fid_num_cats-3, fid_num_cats+3]. real_t tmp = data * (fid_num_cats + real_t(3)); @@ -561,20 +561,19 @@ class BaseFilTest : public testing::TestWithParam { handle.sync_stream(); } - virtual void init_forest(fil::forest_t* pforest) = 0; + virtual void init_forest(fil::forest_t* pforest) = 0; void predict_on_gpu() { - auto stream = handle.get_stream(); - fil::forest_t forest = nullptr; + auto stream = handle.get_stream(); + fil::forest_t forest = nullptr; init_forest(&forest); // predict preds_d.resize(ps.num_preds_outputs(), stream); proba_d.resize(ps.num_proba_outputs(), stream); - fil::predict(handle, forest, (void*)preds_d.data(), (const void*)data_d.data(), ps.num_rows); - fil::predict( - handle, forest, (void*)proba_d.data(), (const void*)data_d.data(), ps.num_rows, true); + fil::predict(handle, forest, preds_d.data(), data_d.data(), ps.num_rows); + fil::predict(handle, forest, proba_d.data(), data_d.data(), ps.num_rows, true); handle.sync_stream(); // cleanup @@ -644,8 +643,8 @@ class BaseFilTest : public testing::TestWithParam { }; template -class BasePredictFilTest : public BaseFilTest { - using real_t = typename fil_node_t::real_t; +class BasePredictFilTest : public BaseFilTest { + using real_t = typename fil_node_t::real_type; protected: void dense2sparse_node(const fil::dense_node* dense_root, @@ -696,7 +695,7 @@ class BasePredictFilTest : public BaseFilTest { } } - void init_forest(fil::forest_t* pforest) override + void init_forest(fil::forest_t* pforest) override { constexpr bool IS_DENSE = node_traits::IS_DENSE; std::vector init_nodes; From c665bbfa05ac88f72bee37cabd246ee630b33f24 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 4 Apr 2022 21:16:24 +0200 Subject: [PATCH 36/45] Fixed endless recursion in forest::free(). 
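The recursion: the base forest::free() released vector_leaf_ and then re-invoked forest::free(h) on itself; an explicitly qualified call is resolved statically, so the base implementation simply called itself forever. The fix has the base release only its own members, while derived classes clean up their extras first and then call the base implementation exactly once, last. A minimal standalone sketch of the corrected shape (names simplified, not code from this patch):

    struct forest {
      virtual void free()
      {
        // release base-class members here; no qualified self-call
      }
      virtual ~forest() = default;
    };

    struct sparse_forest : forest {
      void free() override
      {
        // release derived members first, ...
        forest::free();  // ... then the base implementation, exactly once
      }
    };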
--- cpp/src/fil/fil.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index afdd2fd467..887a79e7dc 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -324,8 +324,8 @@ struct forest { virtual void free(const raft::handle_t& h) { + cat_sets_.release(); vector_leaf_.release(); - forest::free(h); } virtual ~forest() {} @@ -504,9 +504,9 @@ struct sparse_forest : forest { void free(const raft::handle_t& h) override { - forest::free(h); trees_.release(); nodes_.release(); + forest::free(h); } int num_nodes_ = 0; From 175837a2bcc642edfdee07eb2a24cc00098dc222 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 5 Apr 2022 00:38:01 +0200 Subject: [PATCH 37/45] Removed changes to fil.h. --- cpp/include/cuml/fil/fil.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index 22b7b53059..581fe3eb13 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -135,7 +135,6 @@ void free(const raft::handle_t& h, forest_t f); * size = predict_proba ? (2*num_rows) : num_rows * @param data array of size n * cols (cols is the number of columns * for the forest f) from which to predict - * type = the type used for the forest representation (float or double) * @param num_rows number of data rows * @param predict_proba for classifier models, this forces to output both class probabilities * instead of binary class prediction. format matches scikit-learn API From 886c64959b036e9521f75c4878759e57cdc15977 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 5 Apr 2022 00:53:14 +0200 Subject: [PATCH 38/45] Refactored tests. --- cpp/test/sg/fil_test.cu | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index a3cc8ed75d..4a727efdc0 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -139,8 +139,11 @@ __global__ void nan_kernel(real_t* data, const bool* mask, int len, real_t nan) if (!mask[tid]) data[tid] = nan; } -float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } -double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); } +template +real_t sigmoid(real_t x) +{ + return real_t(1) / (real_t(1) + exp(-x)); +} void hard_clipped_bernoulli( raft::random::Rng rng, float* d, std::size_t n_vals, float prob_of_zero, cudaStream_t stream) @@ -168,12 +171,12 @@ struct replace_some_floating_with_categorical { real_t tmp = data * (fid_num_cats + real_t(3)); // Also test invalid (negative and above fid_num_cats) categories: samples within // [fid_num_cats+2.5, fid_num_cats+3) and opposite will test infinite floats as categorical. - if (tmp + fid_num_cats < real_t(-2.5f)) return -INFINITY; - if (tmp - fid_num_cats > real_t(+2.5f)) return +INFINITY; + if (tmp + fid_num_cats < real_t(-2.5f)) return -std::numeric_limits::infinity(); + if (tmp - fid_num_cats > real_t(+2.5f)) return +std::numeric_limits::infinity(); // Samples within [fid_num_cats+2, fid_num_cats+2.5) (and their negative counterparts) will // test huge invalid categories. - if (tmp + fid_num_cats < real_t(-2.0f)) tmp -= MAX_FIL_INT_FLOAT; - if (tmp - fid_num_cats > real_t(+2.0f)) tmp += MAX_FIL_INT_FLOAT; + if (tmp + fid_num_cats < real_t(-2.0f)) tmp -= real_t(MAX_FIL_INT_FLOAT); + if (tmp - fid_num_cats > real_t(+2.0f)) tmp += real_t(MAX_FIL_INT_FLOAT); // Samples within [0, fid_num_cats+2) will be valid categories, rounded towards 0 with a cast. // Negative categories are always invalid. 
For correct interpretation, see // cpp/src/fil/internal.cuh `int category_matches(node_t node, float category)` From 1426c14b7f08745828e59183f446b0304ec0465f Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 5 Apr 2022 01:00:40 +0200 Subject: [PATCH 39/45] noinline -> forceinline. --- cpp/src/fil/infer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 7166c600e6..5af31aa76e 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __noinline__ +#define INLINE_CONFIG __forceinline__ namespace ML { namespace fil { From 56ecd5119a91a314400a0d8df79b981c48947088 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Wed, 6 Apr 2022 23:55:01 +0200 Subject: [PATCH 40/45] Addressed review comments. --- cpp/test/sg/fil_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 4a727efdc0..cec210280a 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -165,7 +165,7 @@ struct replace_some_floating_with_categorical { int num_cols; __device__ real_t operator()(real_t data, int data_idx) { - real_t fid_num_cats = static_cast(fid_num_cats_d[data_idx % num_cols]); + auto fid_num_cats = static_cast(fid_num_cats_d[data_idx % num_cols]); if (fid_num_cats == real_t(0)) return data; // Transform `data` from (uniform on) [-1.0, 1.0] into [-fid_num_cats-3, fid_num_cats+3]. real_t tmp = data * (fid_num_cats + real_t(3)); From 9098b86d9c1a808493876dddeb0454383de7dd4e Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 7 Apr 2022 02:50:21 +0200 Subject: [PATCH 41/45] float64 support in treelite->FIL import. --- cpp/bench/sg/fil.cu | 4 +- cpp/include/cuml/fil/fil.h | 7 +- cpp/src/fil/infer.cu | 2 +- cpp/src/fil/treelite_import.cu | 69 ++++++++------ cpp/test/sg/fil_test.cu | 166 ++++++++++++++++++++------------- cpp/test/sg/rf_test.cu | 15 +-- 6 files changed, 157 insertions(+), 106 deletions(-) diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu index adf283fbaf..67017fd9f5 100644 --- a/cpp/bench/sg/fil.cu +++ b/cpp/bench/sg/fil.cu @@ -91,7 +91,9 @@ class FIL : public RegressionFixture { .threads_per_tree = 1, .n_items = 0, .pforest_shape_str = nullptr}; - ML::fil::from_treelite(*handle, &forest, model, &tl_params); + ML::fil::forest_variant forest_variant; + ML::fil::from_treelite(*handle, &forest_variant, model, &tl_params); + forest = std::get>(forest_variant); // only time prediction this->loopOnState(state, [this]() { diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index 581fe3eb13..5cc4a9f748 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -20,6 +20,8 @@ #include +#include // for std::get<>, std::variant<> + #include namespace raft { @@ -76,6 +78,9 @@ struct forest; template using forest_t = forest*; +/** forest_variant is used to get a forest represented with either float or double. 
*/ +using forest_variant = std::variant, forest_t>; + /** MAX_N_ITEMS determines the maximum allowed value for tl_params::n_items */ constexpr int MAX_N_ITEMS = 4; @@ -116,7 +121,7 @@ struct treelite_params_t { */ // TODO (canonizer): use std::variant forest_t>* for pforest void from_treelite(const raft::handle_t& handle, - forest_t* pforest, + forest_variant* pforest, ModelHandle model, const treelite_params_t* tl_params); diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 5af31aa76e..7166c600e6 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __forceinline__ +#define INLINE_CONFIG __noinline__ namespace ML { namespace fil { diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 68634fe26a..2b9e320c95 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -40,6 +40,7 @@ #include // for std::size_t #include // for uint8_t #include // for ios, stringstream +#include // for std::numeric_limits #include // for std::stack #include // for std::string #include // for std::is_same @@ -223,7 +224,8 @@ cat_sets_owner allocate_cat_sets_owner(const tl::ModelImpl& model) return cat_sets; } -void adjust_threshold(float* pthreshold, bool* swap_child_nodes, tl::Operator comparison_op) +template +void adjust_threshold(real_t* pthreshold, bool* swap_child_nodes, tl::Operator comparison_op) { // in treelite (take left node if val [op] threshold), // the meaning of the condition is reversed compared to FIL; @@ -237,12 +239,12 @@ void adjust_threshold(float* pthreshold, bool* swap_child_nodes, tl::Operator co case tl::Operator::kLT: break; case tl::Operator::kLE: // x <= y is equivalent to x < y', where y' is the next representable float - *pthreshold = std::nextafterf(*pthreshold, std::numeric_limits::infinity()); + *pthreshold = std::nextafterf(*pthreshold, std::numeric_limits::infinity()); break; case tl::Operator::kGT: // x > y is equivalent to x >= y', where y' is the next representable float // left and right still need to be swapped - *pthreshold = std::nextafterf(*pthreshold, std::numeric_limits::infinity()); + *pthreshold = std::nextafterf(*pthreshold, std::numeric_limits::infinity()); case tl::Operator::kGE: // swap left and right *swap_child_nodes = !*swap_child_nodes; @@ -279,7 +281,7 @@ void tl2fil_leaf_payload(fil_node_t* fil_node, const tl::Tree& tl_tree, int tl_node_id, const forest_params_t& forest_params, - std::vector* vector_leaf, + std::vector* vector_leaf, size_t* leaf_counter) { auto vec = tl_tree.LeafVector(tl_node_id); @@ -301,7 +303,7 @@ void tl2fil_leaf_payload(fil_node_t* fil_node, } case leaf_algo_t::FLOAT_UNARY_BINARY: case leaf_algo_t::GROVE_PER_CLASS: - fil_node->val.f = static_cast(tl_tree.LeafValue(tl_node_id)); + fil_node->val.f = static_cast(tl_tree.LeafValue(tl_node_id)); ASSERT(!tl_tree.HasLeafVector(tl_node_id), "some but not all treelite leaves have leaf_vector()"); break; @@ -323,14 +325,15 @@ conversion_state tl2fil_inner_node(int fil_left_child, cat_sets_owner* cat_sets, std::size_t* bit_pool_offset) { + using real_t = typename fil_node_t::real_type; int tl_left = tree.LeftChild(tl_node_id), tl_right = tree.RightChild(tl_node_id); - val_t split = {.f = NAN}; // yes there's a default initializer already + val_t split = {.f = std::numeric_limits::quiet_NaN()}; int feature_id = tree.SplitIndex(tl_node_id); bool is_categorical = tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical && 
tree.MatchingCategories(tl_node_id).size() > 0; bool swap_child_nodes = false; if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kNumerical) { - split.f = static_cast(tree.Threshold(tl_node_id)); + split.f = static_cast(tree.Threshold(tl_node_id)); adjust_threshold(&split.f, &swap_child_nodes, tree.ComparisonOp(tl_node_id)); } else if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical) { // for FIL, the list of categories is always for the right child @@ -346,14 +349,14 @@ conversion_state tl2fil_inner_node(int fil_left_child, } } else { // always branch left in FIL. Already accounted for Treelite branching direction above. - split.f = NAN; + split.f = std::numeric_limits::quiet_NaN(); } } else { ASSERT(false, "only numerical and categorical split nodes are supported"); } bool default_left = tree.DefaultLeft(tl_node_id) ^ swap_child_nodes; fil_node_t node( - val_t{}, split, feature_id, default_left, false, is_categorical, fil_left_child); + val_t{}, split, feature_id, default_left, false, is_categorical, fil_left_child); return conversion_state{node, swap_child_nodes}; } @@ -363,7 +366,7 @@ int tree2fil(std::vector& nodes, const tl::Tree& tree, std::size_t tree_idx, const forest_params_t& forest_params, - std::vector* vector_leaf, + std::vector* vector_leaf, std::size_t* leaf_counter, cat_sets_owner* cat_sets) { @@ -443,10 +446,11 @@ std::stringstream depth_hist_and_max(const tl::ModelImpl& model) forest_shape << "Total: branches: " << total_branches << " leaves: " << total_leaves << " nodes: " << total_nodes << endl; forest_shape << "Avg nodes per tree: " << setprecision(2) - << total_nodes / (float)hist[0].n_branch_nodes << endl; + << total_nodes / static_cast(hist[0].n_branch_nodes) << endl; forest_shape.copyfmt(default_state); forest_shape << "Leaf depth: min: " << min_leaf_depth << " avg: " << setprecision(2) << fixed - << leaves_times_depth / (float)total_leaves << " max: " << hist.size() - 1 << endl; + << leaves_times_depth / static_cast(total_leaves) + << " max: " << hist.size() - 1 << endl; forest_shape.copyfmt(default_state); vector hist_bytes(hist.size() * sizeof(hist[0])); @@ -575,9 +579,10 @@ void node_traits::check(const treelite::ModelImpl& template struct tl2fil_t { + using real_t = typename fil_node_t::real_type; std::vector roots_; std::vector nodes_; - std::vector vector_leaf_; + std::vector vector_leaf_; forest_params_t params_; cat_sets_owner cat_sets_; const tl::ModelImpl& model_; @@ -631,7 +636,7 @@ struct tl2fil_t { } /// initializes FIL forest object, to be ready to infer - void init_forest(const raft::handle_t& handle, forest_t* pforest) + void init_forest(const raft::handle_t& handle, forest_t* pforest) { ML::fil::init( handle, pforest, cat_sets_.accessor(), vector_leaf_, roots_.data(), nodes_.data(), ¶ms_); @@ -646,7 +651,7 @@ struct tl2fil_t { template void convert(const raft::handle_t& handle, - forest_t* pforest, + forest_t* pforest, const tl::ModelImpl& model, const treelite_params_t& tl_params) { @@ -664,24 +669,21 @@ constexpr bool type_supported() template void from_treelite(const raft::handle_t& handle, - forest_t* pforest, + forest_variant* pforest_variant, const tl::ModelImpl& model, const treelite_params_t* tl_params) { + // floating-point type used for model representation + using real_t = decltype(threshold_t(0) + leaf_t(0)); + + // get the pointer to the right forest variant + *pforest_variant = (forest_t)nullptr; + forest_t* pforest = &std::get>(*pforest_variant); + // Invariants on threshold and leaf types 
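  // Illustration (added comment, not in the original patch): real_t above
  // follows the usual arithmetic conversions, same as std::common_type:
  //   threshold_t = float,  leaf_t = float   ->  real_t = float
  //   threshold_t = double, leaf_t = double  ->  real_t = double
  //   float + double = double; float + int64_t = float, though integer
  //   leaf outputs are rejected by the ASSERT just below.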
static_assert(type_supported(), "Model must contain float32 or float64 thresholds for splits"); ASSERT(type_supported(), "Models with integer leaf output are not yet supported"); - // Display appropriate warnings when float64 values are being casted into - // float32, as FIL only supports inferencing with float32 for the time being - if (std::is_same::value || std::is_same::value) { - CUML_LOG_WARN( - "Casting all thresholds and leaf values to float32, as FIL currently " - "doesn't support inferencing models with float64 values. " - "This may lead to predictions with reduced accuracy."); - } - // same as std::common_type: float+double=double, float+int64_t=float - using real_t = decltype(threshold_t(0) + leaf_t(0)); storage_type_t storage_type = tl_params->storage_type; // build dense trees by default @@ -702,18 +704,25 @@ void from_treelite(const raft::handle_t& handle, switch (storage_type) { case storage_type_t::DENSE: - convert>(handle, pforest, model, *tl_params); + convert>(handle, pforest, model, *tl_params); break; case storage_type_t::SPARSE: - convert>(handle, pforest, model, *tl_params); + convert>(handle, pforest, model, *tl_params); + break; + case storage_type_t::SPARSE8: + // SPARSE8 is only supported for float32 + if constexpr (std::is_same_v) { + convert(handle, pforest, model, *tl_params); + } else { + ASSERT(false, "SPARSE8 is only supported for float32 treelite models"); + } break; - case storage_type_t::SPARSE8: convert(handle, pforest, model, *tl_params); break; default: ASSERT(false, "tl_params->sparse must be one of AUTO, DENSE or SPARSE"); } } void from_treelite(const raft::handle_t& handle, - forest_t* pforest, + forest_variant* pforest, ModelHandle model, const treelite_params_t* tl_params) { diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index cec210280a..cb3bcf8495 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -196,8 +196,9 @@ __global__ void floats_to_bit_stream_k(uint8_t* dst, real_t* src, std::size_t si dst[idx] = byte; } +template void adjust_threshold_to_treelite( - float* pthreshold, int* tl_left, int* tl_right, bool* default_left, tl::Operator comparison_op) + real_t* pthreshold, int* tl_left, int* tl_right, bool* default_left, tl::Operator comparison_op) { // in treelite (take left node if val [op] threshold), // the meaning of the condition is reversed compared to FIL; @@ -213,12 +214,12 @@ void adjust_threshold_to_treelite( case tl::Operator::kLT: break; case tl::Operator::kLE: // x <= y is equivalent to x < y', where y' is the next representable float - *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits::infinity()); + *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits::infinity()); break; case tl::Operator::kGT: // x > y is equivalent to x >= y', where y' is the next representable float // left and right still need to be swapped - *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits::infinity()); + *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits::infinity()); case tl::Operator::kGE: // swap left and right std::swap(*tl_left, *tl_right); @@ -745,7 +746,8 @@ using PredictSparse16Float32FilTest = BasePredictFilTest>; using PredictSparse8FilTest = BasePredictFilTest; -class TreeliteFilTest : public BaseFilTest { +template +class TreeliteFilTest : public BaseFilTest { protected: /** adds nodes[node] of tree starting at index root to builder at index at *pkey, increments *pkey, @@ -754,28 +756,29 @@ class TreeliteFilTest : public BaseFilTest { { int key = 
(*pkey)++; builder->CreateNode(key); - const fil::dense_node& dense_node = nodes[node]; + const fil::dense_node& dense_node = this->nodes[node]; std::vector left_categories; if (dense_node.is_leaf()) { - switch (ps.leaf_algo) { + switch (this->ps.leaf_algo) { case fil::leaf_algo_t::FLOAT_UNARY_BINARY: case fil::leaf_algo_t::GROVE_PER_CLASS: // default is fil::FLOAT_UNARY_BINARY - builder->SetLeafNode(key, tlf::Value::Create(dense_node.output())); + builder->SetLeafNode(key, tlf::Value::Create(dense_node.template output())); break; case fil::leaf_algo_t::CATEGORICAL_LEAF: { - std::vector vec(ps.num_classes); - for (int i = 0; i < ps.num_classes; ++i) { - vec[i] = tlf::Value::Create(i == dense_node.output() ? 1.0f : 0.0f); + std::vector vec(this->ps.num_classes); + for (int i = 0; i < this->ps.num_classes; ++i) { + vec[i] = + tlf::Value::Create(i == dense_node.template output() ? real_t(1) : real_t(0)); } builder->SetLeafVectorNode(key, vec); break; } case fil::leaf_algo_t::VECTOR_LEAF: { - std::vector vec(ps.num_classes); - for (int i = 0; i < ps.num_classes; ++i) { - auto idx = dense_node.output(); - vec[i] = tlf::Value::Create(vector_leaf[idx * ps.num_classes + i]); + std::vector vec(this->ps.num_classes); + for (int i = 0; i < this->ps.num_classes; ++i) { + auto idx = dense_node.template output(); + vec[i] = tlf::Value::Create(this->vector_leaf[idx * this->ps.num_classes + i]); } builder->SetLeafVectorNode(key, vec); break; @@ -787,14 +790,15 @@ class TreeliteFilTest : public BaseFilTest { int left = root + 2 * (node - root) + 1; int right = root + 2 * (node - root) + 2; bool default_left = dense_node.def_left(); - float threshold = dense_node.is_categorical() ? NAN : dense_node.thresh(); + real_t threshold = dense_node.is_categorical() ? std::numeric_limits::quiet_NaN() + : dense_node.thresh(); if (dense_node.is_categorical()) { uint8_t byte = 0; for (int category = 0; - category < static_cast(cat_sets_h.fid_num_cats[dense_node.fid()]); + category < static_cast(this->cat_sets_h.fid_num_cats[dense_node.fid()]); ++category) { if (category % BITS_PER_BYTE == 0) { - byte = cat_sets_h.bits[dense_node.set() + category / BITS_PER_BYTE]; + byte = this->cat_sets_h.bits[dense_node.set() + category / BITS_PER_BYTE]; } if ((byte & (1 << (category % BITS_PER_BYTE))) != 0) { left_categories.push_back(category); @@ -815,10 +819,10 @@ class TreeliteFilTest : public BaseFilTest { builder->SetCategoricalTestNode( key, dense_node.fid(), left_categories, default_left, left_key, right_key); } else { - adjust_threshold_to_treelite(&threshold, &left_key, &right_key, &default_left, ps.op); + adjust_threshold_to_treelite(&threshold, &left_key, &right_key, &default_left, this->ps.op); builder->SetNumericalTestNode(key, dense_node.fid(), - ps.op, + this->ps.op, tlf::Value::Create(threshold), default_left, left_key, @@ -828,28 +832,27 @@ class TreeliteFilTest : public BaseFilTest { return key; } - void init_forest_impl(fil::forest_t* pforest, fil::storage_type_t storage_type) + void init_forest_impl(fil::forest_t* pforest, fil::storage_type_t storage_type) { - auto stream = handle.get_stream(); - bool random_forest_flag = (ps.output & fil::output_t::AVG) != 0; + auto stream = this->handle.get_stream(); + bool random_forest_flag = (this->ps.output & fil::output_t::AVG) != 0; + tl::TypeInfo tl_type_info = + std::is_same_v ? tl::TypeInfo::kFloat32 : tl::TypeInfo::kFloat64; int treelite_num_classes = - ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY ? 
1 : ps.num_classes; - std::unique_ptr model_builder(new tlf::ModelBuilder(ps.num_cols, - treelite_num_classes, - random_forest_flag, - tl::TypeInfo::kFloat32, - tl::TypeInfo::kFloat32)); + this->ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY ? 1 : this->ps.num_classes; + std::unique_ptr model_builder(new tlf::ModelBuilder( + this->ps.num_cols, treelite_num_classes, random_forest_flag, tl_type_info, tl_type_info)); // prediction transform - if ((ps.output & fil::output_t::SIGMOID) != 0) { - if (ps.num_classes > 2) + if ((this->ps.output & fil::output_t::SIGMOID) != 0) { + if (this->ps.num_classes > 2) model_builder->SetModelParam("pred_transform", "multiclass_ova"); else model_builder->SetModelParam("pred_transform", "sigmoid"); - } else if (ps.leaf_algo != fil::leaf_algo_t::FLOAT_UNARY_BINARY) { + } else if (this->ps.leaf_algo != fil::leaf_algo_t::FLOAT_UNARY_BINARY) { model_builder->SetModelParam("pred_transform", "max_index"); - ps.output = fil::output_t(ps.output | fil::output_t::CLASS); - } else if (ps.leaf_algo == GROVE_PER_CLASS) { + this->ps.output = fil::output_t(this->ps.output | fil::output_t::CLASS); + } else if (this->ps.leaf_algo == GROVE_PER_CLASS) { model_builder->SetModelParam("pred_transform", "identity_multiclass"); } else { model_builder->SetModelParam("pred_transform", "identity"); @@ -857,18 +860,17 @@ class TreeliteFilTest : public BaseFilTest { // global bias char* global_bias_str = nullptr; - ASSERT(asprintf(&global_bias_str, "%f", double(ps.global_bias)) > 0, + ASSERT(asprintf(&global_bias_str, "%f", double(this->ps.global_bias)) > 0, "cannot convert global_bias into a string"); model_builder->SetModelParam("global_bias", global_bias_str); ::free(global_bias_str); // build the trees - for (int i_tree = 0; i_tree < ps.num_trees; ++i_tree) { - tlf::TreeBuilder* tree_builder = - new tlf::TreeBuilder(tl::TypeInfo::kFloat32, tl::TypeInfo::kFloat32); - int key_counter = 0; - int root = i_tree * tree_num_nodes(); - int root_key = node_to_treelite(tree_builder, &key_counter, root, root); + for (int i_tree = 0; i_tree < this->ps.num_trees; ++i_tree) { + tlf::TreeBuilder* tree_builder = new tlf::TreeBuilder(tl_type_info, tl_type_info); + int key_counter = 0; + int root = i_tree * this->tree_num_nodes(); + int root_key = node_to_treelite(tree_builder, &key_counter, root, root); tree_builder->SetRootNode(root_key); // InsertTree() consumes tree_builder TL_CPP_CHECK(model_builder->InsertTree(tree_builder)); @@ -880,17 +882,19 @@ class TreeliteFilTest : public BaseFilTest { // init FIL forest with the model char* forest_shape_str = nullptr; fil::treelite_params_t params; - params.algo = ps.algo; - params.threshold = ps.threshold; - params.output_class = (ps.output & fil::output_t::CLASS) != 0; + params.algo = this->ps.algo; + params.threshold = this->ps.threshold; + params.output_class = (this->ps.output & fil::output_t::CLASS) != 0; params.storage_type = storage_type; - params.blocks_per_sm = ps.blocks_per_sm; - params.threads_per_tree = ps.threads_per_tree; - params.n_items = ps.n_items; - params.pforest_shape_str = ps.print_forest_shape ? &forest_shape_str : nullptr; - fil::from_treelite(handle, pforest, (ModelHandle)model.get(), ¶ms); - handle.sync_stream(stream); - if (ps.print_forest_shape) { + params.blocks_per_sm = this->ps.blocks_per_sm; + params.threads_per_tree = this->ps.threads_per_tree; + params.n_items = this->ps.n_items; + params.pforest_shape_str = this->ps.print_forest_shape ? 
&forest_shape_str : nullptr; + fil::forest_variant forest_variant; + fil::from_treelite(this->handle, &forest_variant, (ModelHandle)model.get(), ¶ms); + *pforest = std::get>(forest_variant); + this->handle.sync_stream(stream); + if (this->ps.print_forest_shape) { std::string str(forest_shape_str); for (const char* substr : {"model size", " MB", @@ -908,38 +912,48 @@ class TreeliteFilTest : public BaseFilTest { } }; -class TreeliteDenseFilTest : public TreeliteFilTest { +template +class TreeliteDenseFilTest : public TreeliteFilTest { protected: - void init_forest(fil::forest_t* pforest) override + void init_forest(fil::forest_t* pforest) override { - init_forest_impl(pforest, fil::storage_type_t::DENSE); + this->init_forest_impl(pforest, fil::storage_type_t::DENSE); } }; -class TreeliteSparse16FilTest : public TreeliteFilTest { +template +class TreeliteSparse16FilTest : public TreeliteFilTest { protected: - void init_forest(fil::forest_t* pforest) override + void init_forest(fil::forest_t* pforest) override { - init_forest_impl(pforest, fil::storage_type_t::SPARSE); + this->init_forest_impl(pforest, fil::storage_type_t::SPARSE); } }; -class TreeliteSparse8FilTest : public TreeliteFilTest { +class TreeliteSparse8FilTest : public TreeliteFilTest { protected: void init_forest(fil::forest_t* pforest) override { - init_forest_impl(pforest, fil::storage_type_t::SPARSE8); + this->init_forest_impl(pforest, fil::storage_type_t::SPARSE8); } }; -class TreeliteAutoFilTest : public TreeliteFilTest { +template +class TreeliteAutoFilTest : public TreeliteFilTest { protected: - void init_forest(fil::forest_t* pforest) override + void init_forest(fil::forest_t* pforest) override { - init_forest_impl(pforest, fil::storage_type_t::AUTO); + this->init_forest_impl(pforest, fil::storage_type_t::AUTO); } }; +using TreeliteDenseFloat32FilTest = TreeliteDenseFilTest; +using TreeliteDenseFloat64FilTest = TreeliteDenseFilTest; +using TreeliteSparse16Float32FilTest = TreeliteDenseFilTest; +using TreeliteSparse16Float64FilTest = TreeliteDenseFilTest; +using TreeliteAutoFloat32FilTest = TreeliteAutoFilTest; +using TreeliteAutoFloat64FilTest = TreeliteAutoFilTest; + // test for failures; currently only supported for sparse8 nodes class TreeliteThrowSparse8FilTest : public TreeliteSparse8FilTest { protected: @@ -1300,9 +1314,15 @@ std::vector import_dense_inputs = { max_magnitude_of_matching_cat = 5), }; -TEST_P(TreeliteDenseFilTest, Import) { compare(); } +TEST_P(TreeliteDenseFloat32FilTest, Import) { compare(); } +TEST_P(TreeliteDenseFloat64FilTest, Import) { compare(); } -INSTANTIATE_TEST_CASE_P(FilTests, TreeliteDenseFilTest, testing::ValuesIn(import_dense_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + TreeliteDenseFloat32FilTest, + testing::ValuesIn(import_dense_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + TreeliteDenseFloat64FilTest, + testing::ValuesIn(import_dense_inputs)); std::vector import_sparse_inputs = { FIL_TEST_PARAMS(), @@ -1353,9 +1373,15 @@ std::vector import_sparse_inputs = { max_magnitude_of_matching_cat = 5), }; -TEST_P(TreeliteSparse16FilTest, Import) { compare(); } +TEST_P(TreeliteSparse16Float32FilTest, Import) { compare(); } +TEST_P(TreeliteSparse16Float64FilTest, Import) { compare(); } -INSTANTIATE_TEST_CASE_P(FilTests, TreeliteSparse16FilTest, testing::ValuesIn(import_sparse_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + TreeliteSparse16Float32FilTest, + testing::ValuesIn(import_sparse_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + TreeliteSparse16Float64FilTest, + 
 
 TEST_P(TreeliteSparse8FilTest, Import) { compare(); }
 
@@ -1381,9 +1407,15 @@ std::vector<FilTestParams> import_auto_inputs = {
 #endif
 };
 
-TEST_P(TreeliteAutoFilTest, Import) { compare(); }
+TEST_P(TreeliteAutoFloat32FilTest, Import) { compare(); }
+TEST_P(TreeliteAutoFloat64FilTest, Import) { compare(); }
 
-INSTANTIATE_TEST_CASE_P(FilTests, TreeliteAutoFilTest, testing::ValuesIn(import_auto_inputs));
+INSTANTIATE_TEST_CASE_P(FilTests,
+                        TreeliteAutoFloat32FilTest,
+                        testing::ValuesIn(import_auto_inputs));
+INSTANTIATE_TEST_CASE_P(FilTests,
+                        TreeliteAutoFloat64FilTest,
+                        testing::ValuesIn(import_auto_inputs));
 
 // adjust test parameters if the sparse8 format changes
 std::vector<FilTestParams> import_throw_sparse8_inputs = {
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 345770efa1..18923c4baa 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -172,8 +172,9 @@ auto FilPredict(const raft::handle_t& handle,
                  1,
                  0,
                  nullptr};
-  fil::forest_t fil_forest;
-  fil::from_treelite(handle, &fil_forest, model, &tl_params);
+  fil::forest_variant forest_variant;
+  fil::from_treelite(handle, &forest_variant, model, &tl_params);
+  fil::forest_t<float> fil_forest = std::get<fil::forest_t<float>>(forest_variant);
   fil::predict(handle, fil_forest, pred->data().get(), X_transpose, params.n_rows, false);
   return pred;
 }
@@ -191,8 +192,9 @@ auto FilPredictProba(const raft::handle_t& handle,
   build_treelite_forest(&model, forest, params.n_cols);
   fil::treelite_params_t tl_params{
     fil::algo_t::ALGO_AUTO, 0, 0.0f, fil::storage_type_t::AUTO, 8, 1, 0, nullptr};
-  fil::forest_t fil_forest;
-  fil::from_treelite(handle, &fil_forest, model, &tl_params);
+  fil::forest_variant forest_variant;
+  fil::from_treelite(handle, &forest_variant, model, &tl_params);
+  fil::forest_t<float> fil_forest = std::get<fil::forest_t<float>>(forest_variant);
   fil::predict(handle, fil_forest, pred->data().get(), X_transpose, params.n_rows, true);
   return pred;
 }
@@ -557,8 +559,9 @@ TEST(RfTests, IntegerOverflow)
                  1,
                  0,
                  nullptr};
-  fil::forest_t fil_forest;
-  fil::from_treelite(handle, &fil_forest, model, &tl_params);
+  fil::forest_variant forest_variant;
+  fil::from_treelite(handle, &forest_variant, model, &tl_params);
+  fil::forest_t<float> fil_forest = std::get<fil::forest_t<float>>(forest_variant);
   fil::predict(handle, fil_forest, pred.data().get(), X.data().get(), m, false);
 }
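
The rf_test.cu hunks above always extract the float alternative with std::get, since these tests only build single-precision forests. When the precision is known only at runtime, std::visit is the idiomatic way to dispatch over forest_variant; a sketch under the same assumptions (with_forest is hypothetical, not part of the patch):

    #include <variant>

    // Invokes the callable with whichever forest_t<float> or forest_t<double>
    // the variant currently holds.
    template <typename F>
    void with_forest(fil::forest_variant& v, F&& run_inference)
    {
      std::visit([&](auto forest) { run_inference(forest); }, v);
    }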
From 1fd862401fb28e16772c83005aee936b5a4b18fc Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Thu, 7 Apr 2022 23:41:45 +0200
Subject: [PATCH 42/45] float64 support in FIL Python layer.

---
 cpp/include/cuml/fil/fil.h |  4 +++
 python/cuml/fil/fil.pyx    | 73 ++++++++++++++++++++++++++++----------
 2 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h
index 5cc4a9f748..c842a68313 100644
--- a/cpp/include/cuml/fil/fil.h
+++ b/cpp/include/cuml/fil/fil.h
@@ -78,6 +78,10 @@ struct forest;
 template <typename real_t>
 using forest_t = forest<real_t>*;
 
+/** forest32_t and forest64_t are definitions required in Cython */
+using forest32_t = forest<float>*;
+using forest64_t = forest<double>*;
+
 /** forest_variant is used to get a forest represented with either float or double. */
 using forest_variant = std::variant<forest_t<float>, forest_t<double>>;
 
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index a9e0b79e1a..f084190e69 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -177,6 +177,13 @@ cdef class TreeliteModel():
         model.set_handle(handle)
         return model
 
+cdef extern from "<variant>" namespace "std":
+    cdef cppclass variant[T1, T2]:
+        variant()
+        size_t index()
+
+    cdef T& get[T, T1, T2](variant[T1, T2]& v)
+
 cdef extern from "cuml/fil/fil.h" namespace "ML::fil":
     cdef enum algo_t:
         ALGO_AUTO,
@@ -193,6 +200,10 @@ cdef extern from "cuml/fil/fil.h" namespace "ML::fil":
     cdef cppclass forest[real_t]:
         pass
 
+    ctypedef forest[float]* forest32_t
+    ctypedef forest[double]* forest64_t
+    ctypedef variant[forest32_t, forest64_t] forest_variant
+
     # TODO(canonizer): use something like
     # ctypedef forest[real_t]* forest_t[real_t]
     # once it is supported in Cython
@@ -227,15 +238,15 @@ cdef extern from "cuml/fil/fil.h" namespace "ML::fil":
                  size_t,
                  bool) except +
 
-    cdef forest[float]* from_treelite(handle_t& handle,
-                                      forest[float]**,
-                                      ModelHandle,
-                                      treelite_params_t*) except +
+    cdef void from_treelite(handle_t& handle,
+                            forest_variant*,
+                            ModelHandle,
+                            treelite_params_t*) except +
 
 cdef class ForestInference_impl():
     cdef object handle
-    cdef forest[float]* forest_data
+    cdef forest_variant forest_data
     cdef size_t num_class
     cdef bool output_class
     cdef char* shape_str
 
     def __cinit__(self,
                   handle=None):
         self.handle = handle
-        self.forest_data = NULL
+        self.forest_data = forest_variant()
         self.shape_str = NULL
 
     def get_shape_str(self):
         if self.shape_str:
             return unicode(self.shape_str, 'utf-8')
         return None
 
+    def get_dtype(self):
+        dtype_array = [np.float32, np.float64]
+        return dtype_array[self.forest_data.index()]
+
     def get_algo(self, algo_str):
         algo_dict={'AUTO': algo_t.ALGO_AUTO,
                    'auto': algo_t.ALGO_AUTO,
@@ -327,12 +342,13 @@ cdef class ForestInference_impl():
                              " using a Classification model, please "
                              " set `output_class=True` while creating"
                              " the FIL model.")
+        fil_dtype = self.get_dtype()
         cdef uintptr_t X_ptr
         X_m, n_rows, n_cols, dtype = \
             input_to_cuml_array(X, order='C',
-                                convert_to_dtype=np.float32,
+                                convert_to_dtype=fil_dtype,
                                 safe_dtype_conversion=safe_dtype_conversion,
-                                check_dtype=np.float32)
+                                check_dtype=fil_dtype)
         X_ptr = X_m.ptr
 
         cdef handle_t* handle_ =\
             <handle_t*><size_t>self.handle.getHandle()
 
         if preds is None:
             shape = (n_rows, )
             if predict_proba:
                 shape += (2,)
             else:
                 shape += (self.num_class,)
-            preds = CumlArray.empty(shape=shape, dtype=np.float32, order='C',
+            preds = CumlArray.empty(shape=shape, dtype=fil_dtype, order='C',
                                     index=X_m.index)
         else:
             if not hasattr(preds, "__cuda_array_interface__"):
 
         cdef uintptr_t preds_ptr
         preds_ptr = preds.ptr
 
-        predict(handle_[0],
-                self.forest_data,
-                preds_ptr,
-                X_ptr,
-                n_rows,
-                predict_proba)
+        if fil_dtype == np.float32:
+            predict(handle_[0],
+                    get[forest32_t, forest32_t, forest64_t](self.forest_data),
+                    preds_ptr,
+                    X_ptr,
+                    n_rows,
+                    predict_proba)
+        elif fil_dtype == np.float64:
+            predict(handle_[0],
+                    get[forest64_t, forest32_t, forest64_t](self.forest_data),
+                    preds_ptr,
+                    X_ptr,
+                    n_rows,
+                    predict_proba)
+        else:
+            # should not reach here
+            assert False, 'invalid fil_dtype, must be np.float32 or np.float64'
+
         self.handle.sync()
 
         # special case due to predict and predict_proba
@@ -372,7 +400,7 @@
         return preds
 
     def load_from_treelite_model_handle(self, **kwargs):
-        self.forest_data = NULL
+        self.forest_data = forest_variant()
         return self.load_using_treelite_handle(**kwargs)
 
     def load_from_treelite_model(self, **kwargs):
@@ -413,9 +441,16 @@
 
     def __dealloc__(self):
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
-
-        if self.forest_data != NULL:
-            free(handle_[0], self.forest_data)
+        fil_dtype = self.get_dtype()
+        if fil_dtype == np.float32:
+            if get[forest32_t, forest32_t, forest64_t](self.forest_data) != NULL:
+                free[float](handle_[0], get[forest32_t, forest32_t, forest64_t](self.forest_data))
+        elif fil_dtype == np.float64:
+            if get[forest64_t, forest32_t, forest64_t](self.forest_data) != NULL:
+                free[double](handle_[0], get[forest64_t, forest32_t, forest64_t](self.forest_data))
+        else:
+            # should not reach here
+            assert False, 'invalid fil_dtype, must be np.float32 or np.float64'
 
 
 class ForestInference(Base,

From aaa326182c6d8599cc77f7806e9887681c2adf93 Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Fri, 8 Apr 2022 01:21:27 +0200
Subject: [PATCH 43/45] get_forest{32,64} in fil.pyx.

---
 python/cuml/fil/fil.pyx | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index f084190e69..4d4d1f8950 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -251,6 +251,12 @@ cdef class ForestInference_impl():
     cdef bool output_class
     cdef char* shape_str
 
+    cdef forest32_t get_forest32(self):
+        return get[forest32_t, forest32_t, forest64_t](self.forest_data)
+
+    cdef forest64_t get_forest64(self):
+        return get[forest64_t, forest32_t, forest64_t](self.forest_data)
+
     def __cinit__(self,
                   handle=None):
         self.handle = handle
@@ -374,14 +380,14 @@ cdef class ForestInference_impl():
 
         if fil_dtype == np.float32:
             predict(handle_[0],
-                    get[forest32_t, forest32_t, forest64_t](self.forest_data),
+                    self.get_forest32(),
                     preds_ptr,
                     X_ptr,
                     n_rows,
                     predict_proba)
         elif fil_dtype == np.float64:
             predict(handle_[0],
-                    get[forest64_t, forest32_t, forest64_t](self.forest_data),
+                    self.get_forest64(),
                     preds_ptr,
                     X_ptr,
                     n_rows,
                     predict_proba)
         else:
             # should not reach here
             assert False, 'invalid fil_dtype, must be np.float32 or np.float64'
-
+
         self.handle.sync()
 
         # special case due to predict and predict_proba
@@ -443,11 +449,11 @@ cdef class ForestInference_impl():
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
         fil_dtype = self.get_dtype()
         if fil_dtype == np.float32:
-            if get[forest32_t, forest32_t, forest64_t](self.forest_data) != NULL:
-                free[float](handle_[0], get[forest32_t, forest32_t, forest64_t](self.forest_data))
+            if self.get_forest32() != NULL:
+                free[float](handle_[0], self.get_forest32())
         elif fil_dtype == np.float64:
-            if get[forest64_t, forest32_t, forest64_t](self.forest_data) != NULL:
-                free[double](handle_[0], get[forest64_t, forest32_t, forest64_t](self.forest_data))
+            if self.get_forest64() != NULL:
+                free[double](handle_[0], self.get_forest64())
         else:
             # should not reach here
             assert False, 'invalid fil_dtype, must be np.float32 or np.float64'
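
The next commit revisits how forest_data is initialized. A default-constructed std::variant already value-initializes its first alternative, so a variant over two pointer types starts out holding a null float-forest pointer; the explicit constructor argument below mainly spells that intent out for Cython. A standalone C++ illustration of the underlying rule (independent of the patch):

    #include <cassert>
    #include <variant>

    int main()
    {
      std::variant<float*, double*> v;         // default construction
      assert(v.index() == 0);                  // holds the first alternative,
      assert(std::get<float*>(v) == nullptr);  // value-initialized to null
      return 0;
    }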
From 606ee1d01676fc7474f2572c4ac98b07496f5340 Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Fri, 8 Apr 2022 01:43:53 +0200
Subject: [PATCH 44/45] Initializing forest_data with forest_variant(NULL).

---
 python/cuml/fil/fil.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 4d4d1f8950..8908b0fc4a 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -180,6 +180,7 @@ cdef class TreeliteModel():
 cdef extern from "<variant>" namespace "std":
     cdef cppclass variant[T1, T2]:
         variant()
+        variant(T1)
         size_t index()
 
     cdef T& get[T, T1, T2](variant[T1, T2]& v)
@@ -260,7 +261,7 @@ cdef class ForestInference_impl():
     def __cinit__(self,
                   handle=None):
         self.handle = handle
-        self.forest_data = forest_variant()
+        self.forest_data = forest_variant(<forest32_t> NULL)
         self.shape_str = NULL
 
     def get_shape_str(self):
@@ -406,7 +407,7 @@ cdef class ForestInference_impl():
         return preds
 
     def load_from_treelite_model_handle(self, **kwargs):
-        self.forest_data = forest_variant()
+        self.forest_data = forest_variant(<forest32_t> NULL)
         return self.load_using_treelite_handle(**kwargs)
 
     def load_from_treelite_model(self, **kwargs):

From 0d2c51299b0b86a339d3ee954edbb68f838a3c11 Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Sat, 9 Apr 2022 03:55:01 +0200
Subject: [PATCH 45/45] Addressed review comments.

---
 cpp/include/cuml/fil/fil.h | 7 ++-----
 cpp/src/fil/infer.cu       | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h
index c842a68313..2d5d786520 100644
--- a/cpp/include/cuml/fil/fil.h
+++ b/cpp/include/cuml/fil/fil.h
@@ -31,10 +31,8 @@ class handle_t;
 namespace ML {
 namespace fil {
 
-/** @note FIL only supports inference with single precision.
- *  TODO(canonizer): parameterize the functions and structures by the data type
- *  and the threshold/weight type.
- */
+/** @note FIL supports inference with both single and double precision. However,
+    the floating-point type used in the data and model must be the same. */
 
 /** Inference algorithm to use. */
 enum algo_t {
@@ -123,7 +121,6 @@ struct treelite_params_t {
 * @param model treelite model used to initialize the forest
 * @param tl_params additional parameters for the forest
 */
-// TODO (canonizer): use std::variant<forest_t<float>, forest_t<double>>* for pforest
void from_treelite(const raft::handle_t& handle,
                   forest_variant* pforest,
                   ModelHandle model,
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 7166c600e6..5af31aa76e 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -36,7 +36,7 @@
 #endif  // __CUDA_ARCH__
 #endif  // CUDA_PRAGMA_UNROLL
 
-#define INLINE_CONFIG __noinline__
+#define INLINE_CONFIG __forceinline__
 
 namespace ML {
 namespace fil {
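
Taken together, the series means a C++ caller commits to one precision at import time and must pass predict() data of the same type, per the updated fil.h note above. A rough usage sketch for the double-precision path (not taken from the patch; handle, model, preds_d, data_d and num_rows are assumed to be set up elsewhere, and the treelite_params_t values are copied from the rf_test.cu hunks earlier in the series):

    // Import the treelite model; from_treelite selects float or double storage
    // from the model's threshold/leaf precision.
    ML::fil::treelite_params_t tl_params{
      ML::fil::algo_t::ALGO_AUTO, 0, 0.0f, ML::fil::storage_type_t::AUTO, 8, 1, 0, nullptr};
    ML::fil::forest_variant forest_variant;
    ML::fil::from_treelite(handle, &forest_variant, model, &tl_params);

    // Holds forest_t<double> only if the model itself is double precision;
    // std::get throws std::bad_variant_access otherwise.
    auto forest = std::get<ML::fil::forest_t<double>>(forest_variant);
    ML::fil::predict(handle, forest, preds_d, data_d, num_rows, /*predict_proba=*/false);
    ML::fil::free(handle, forest);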