From 86bc6b5ef96506318a87ce88e7ce97921000335d Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Tue, 8 Feb 2022 19:59:40 -0800 Subject: [PATCH 01/45] templatized node, forest and storage types --- cpp/include/cuml/fil/fil.h | 9 ++- cpp/src/fil/common.cuh | 46 ++++++------ cpp/src/fil/fil.cu | 68 ++++++++--------- cpp/src/fil/infer.cu | 8 +- cpp/src/fil/internal.cuh | 130 ++++++++++++++++++--------------- cpp/src/fil/treelite_import.cu | 4 +- 6 files changed, 136 insertions(+), 129 deletions(-) diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index 4cecb8aef4..dfa66ad1a8 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -55,15 +55,16 @@ enum algo_t { enum storage_type_t { /** decide automatically; currently always builds dense forests */ AUTO, - /** import the forest as dense */ + /** import the forest as dense (8 or 16-bytes nodes, depending on model precision */ DENSE, /** import the forest as sparse (currently always with 16-byte nodes) */ SPARSE, /** (experimental) import the forest as sparse with 8-byte nodes; can fail if 8-byte nodes are not enough to store the forest, e.g. there are too many - nodes in a tree or too many features; note that the number of bits used to - store the child or feature index can change in the future; this can affect - whether a particular forest can be imported as SPARSE8 */ + nodes in a tree or too many features or the thresholds are double precision; + note that the number of bits used to store the child or feature index can + change in the future; this can affect whether a particular forest can be + imported as SPARSE8 */ SPARSE8, }; static const char* storage_type_repr[] = {"AUTO", "DENSE", "SPARSE", "SPARSE8"}; diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 3174b63c26..56a98b5ac0 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -44,23 +44,27 @@ struct storage_base { bool cats_present() const { return sets_.cats_present(); } }; -/** dense_tree represents a dense tree */ - -struct dense_tree : tree_base { - __host__ __device__ dense_tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) +/** represents a dense tree */ +template +struct tree> : tree_base { + __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { } - __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; } - dense_node* nodes_ = nullptr; + __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; } + dense_node* nodes_ = nullptr; int node_pitch_ = 0; }; +using std::enable_if; + /** dense_storage stores the forest as a collection of dense nodes */ -struct dense_storage : storage_base { - __host__ __device__ dense_storage(categorical_sets cat_sets, +template +struct storage> : storage_base { + using node_t = dense_node; + __host__ __device__ storage(categorical_sets cat_sets, float* vector_leaf, - dense_node* nodes, + node_t* nodes, int num_trees, int tree_stride, int node_pitch) @@ -72,20 +76,20 @@ struct dense_storage : storage_base { { } __host__ __device__ int num_trees() const { return num_trees_; } - __host__ __device__ dense_tree operator[](int i) const + __host__ __device__ tree operator[](int i) const { - return dense_tree(sets_, nodes_ + i * tree_stride_, node_pitch_); + return tree(sets_, nodes_ + i * tree_stride_, node_pitch_); } - dense_node* nodes_ = nullptr; + node_t* nodes_ = nullptr; int num_trees_ = 0; int 
tree_stride_ = 0; int node_pitch_ = 0; }; -/** sparse_tree is a sparse tree */ +/** sparse tree */ template -struct sparse_tree : tree_base { - __host__ __device__ sparse_tree(categorical_sets cat_sets, node_t* nodes) +struct tree : tree_base { + __host__ __device__ tree(categorical_sets cat_sets, node_t* nodes) : tree_base{cat_sets}, nodes_(nodes) { } @@ -95,24 +99,24 @@ struct sparse_tree : tree_base { /** sparse_storage stores the forest as a collection of sparse nodes */ template -struct sparse_storage : storage_base { +struct storage : storage_base { int* trees_ = nullptr; node_t* nodes_ = nullptr; int num_trees_ = 0; - __host__ __device__ sparse_storage( + __host__ __device__ storage( categorical_sets cat_sets, float* vector_leaf, int* trees, node_t* nodes, int num_trees) : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees) { } __host__ __device__ int num_trees() const { return num_trees_; } - __host__ __device__ sparse_tree operator[](int i) const + __host__ __device__ tree operator[](int i) const { - return sparse_tree(sets_, &nodes_[trees_[i]]); + return tree(sets_, &nodes_[trees_[i]]); } }; -typedef sparse_storage sparse_storage16; -typedef sparse_storage sparse_storage8; +typedef storage> sparse_storage16; +typedef storage sparse_storage8; /// all model parameters mostly required to compute shared memory footprint, /// also the footprint itself diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index f00e310a6b..148fe845b2 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -17,7 +17,7 @@ /** @file fil.cu fil.cu implements the forest data types (dense and sparse), including their creation and prediction (the main inference kernel is defined in infer.cu). */ -#include "common.cuh" // for predict_params, sparse_storage, dense_storage +#include "common.cuh" // for predict_params, storage, storage #include "internal.cuh" // for cat_sets_device_owner, categorical_sets, output_t, #include // for algo_t, @@ -352,10 +352,12 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { } }; -struct dense_forest : forest { +template +struct dense_forest> : forest { + using node_t = dense_node; dense_forest(const raft::handle_t& h) : forest(h), nodes_(0, h.get_stream()) {} - void transform_trees(const dense_node* nodes) + void transform_trees(const node_t* nodes) { /* Populate node information: For each tree, the nodes are still stored in the breadth-first, @@ -385,9 +387,9 @@ struct dense_forest : forest { /// sparse_forest::init() void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, - const dense_node* nodes, + const node_t* nodes, const forest_params_t* params) { init_common(h, cat_sets, vector_leaf, params); @@ -403,12 +405,12 @@ struct dense_forest : forest { } RAFT_CUDA_TRY(cudaMemcpyAsync(nodes_.data(), h_nodes_.data(), - num_nodes * sizeof(dense_node), + num_nodes * sizeof(node_t), cudaMemcpyHostToDevice, h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem(max_shm_), + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem(max_shm_), static_cast(class_ssp_)); // copy must be finished before freeing the host data RAFT_CUDA_TRY(cudaStreamSynchronize(h.get_stream())); @@ -418,7 +420,7 @@ struct dense_forest : forest { virtual void infer(predict_params params, cudaStream_t stream) override { - dense_storage forest(cat_sets_.accessor(), 
+ storage forest(cat_sets_.accessor(), vector_leaf_.data(), nodes_.data(), num_trees_, @@ -433,8 +435,8 @@ struct dense_forest : forest { forest::free(h); } - rmm::device_uvector nodes_; - thrust::host_vector h_nodes_; + rmm::device_uvector nodes_; + thrust::host_vector h_nodes_; }; template @@ -446,7 +448,7 @@ struct sparse_forest : forest { void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const node_t* nodes, const forest_params_t* params) @@ -467,13 +469,13 @@ struct sparse_forest : forest { nodes_.data(), nodes, sizeof(node_t) * num_nodes_, cudaMemcpyHostToDevice, h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), static_cast(class_ssp_)); } virtual void infer(predict_params params, cudaStream_t stream) override { - sparse_storage forest( + storage forest( cat_sets_.accessor(), vector_leaf_.data(), trees_.data(), nodes_.data(), num_trees_); fil::infer(forest, params, stream); } @@ -572,7 +574,7 @@ template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params) @@ -584,30 +586,20 @@ void init(const raft::handle_t& h, *pf = f; } -// explicit instantiations for init_sparse() -template void init(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node16* nodes, - const forest_params_t* params); - -template void init(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node8* nodes, - const forest_params_t* params); - -template void init(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const dense_node* nodes, - const forest_params_t* params); +struct instantiate_forest_init { + template + void operator()(fil_node_t) { + init(raft::handle_t{}, + (forest_t*)nullptr, + categorical_sets{}, + std::vector{}, + (int*)nullptr, + (fil_node_t*)nullptr, + (forest_params_t*)nullptr); + } +}; + +template void instantiate_for_all_node_types(instantiate_forest_init); void free(const raft::handle_t& h, forest_t f) { diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index c51d9b3dc4..3e4d5eba56 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __forceinline__ +#define INLINE_CONFIG __noinline__ namespace ML { namespace fil { @@ -893,13 +893,13 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } -template void infer(dense_storage forest, +template void inferstorage>(storage> forest, predict_params params, cudaStream_t stream); -template void infer(sparse_storage16 forest, +template void infer>>(storage> forest, predict_params params, cudaStream_t stream); -template void infer(sparse_storage8 forest, +template void infer>(storage forest, predict_params params, cudaStream_t stream); diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 
253936be96..71e36e0ced 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -28,6 +28,7 @@ #include #include #include +#include namespace raft { class handle_t; @@ -89,19 +90,22 @@ enum output_t { }; /** val_t is the payload within a FIL leaf */ +template union val_t { - /** threshold value for parent node or output value (e.g. class - probability or regression summand) for leaf node */ - float f = NAN; + /** floating-point threshold value for parent node or output value + (e.g. class probability or regression summand) for leaf node */ + F f = NAN; /** class label, leaf vector index or categorical node set offset */ int idx; }; /** base_node contains common implementation details for dense and sparse nodes */ +template struct base_node { + using F = F_; // floating-point type /** val, for parent nodes, is a threshold or category list offset. For leaf nodes, it is the tree prediction (see see leaf_output_t::T) */ - val_t val; + val_t val; /** bits encode various information about the node, with the exact nature of this information depending on the node type; it includes e.g. whether the node is a leaf or inner node, and for inner nodes, additional information, @@ -115,16 +119,24 @@ struct base_node { static const int IS_CATEGORICAL_MASK = 1 << IS_CATEGORICAL_OFFSET; static const int FID_MASK = (1 << IS_CATEGORICAL_OFFSET) - 1; template - __host__ __device__ o_t output() const; + __host__ __device__ o_t output() const { + if constexpr(std::is_same()) { + return val.idx; + } else if constexpr(std::is_same()) { + return val.f; + } else { + return val; + } + } __host__ __device__ int set() const { return val.idx; } - __host__ __device__ float thresh() const { return val.f; } - __host__ __device__ val_t split() const { return val; } + __host__ __device__ F thresh() const { return val.f; } + __host__ __device__ val_t split() const { return val; } __host__ __device__ int fid() const { return bits & FID_MASK; } __host__ __device__ bool def_left() const { return bits & DEF_LEFT_MASK; } __host__ __device__ bool is_leaf() const { return bits & IS_LEAF_MASK; } __host__ __device__ bool is_categorical() const { return bits & IS_CATEGORICAL_MASK; } __host__ __device__ base_node() : val{}, bits(0) {} - base_node(val_t output, val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical) + base_node(val_t output, val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical) { RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into base_node"); bits = (fid & FID_MASK) | (def_left ? DEF_LEFT_MASK : 0) | (is_leaf ? 
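/* Worked example of the packing on this line (illustration only; the flag
   masks are the constants defined just above in base_node): an inner
   numerical node testing feature 5 with def_left set stores
   bits = 5 | DEF_LEFT_MASK; fid() then recovers 5 via bits & FID_MASK,
   and is_leaf() / is_categorical() each read back a single flag bit. */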
IS_LEAF_MASK : 0) | @@ -136,28 +148,13 @@ struct base_node { } }; -template <> -__host__ __device__ __forceinline__ float base_node::output() const -{ - return val.f; -} -template <> -__host__ __device__ __forceinline__ int base_node::output() const -{ - return val.idx; -} -template <> -__host__ __device__ __forceinline__ val_t base_node::output() const -{ - return val; -} - /** dense_node is a single node of a dense forest */ -struct alignas(8) dense_node : base_node { +template +struct alignas(8) dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite - dense_node(val_t output, - val_t split, + dense_node(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, @@ -171,20 +168,19 @@ struct alignas(8) dense_node : base_node { }; /** sparse_node16 is a 16-byte node in a sparse forest */ -struct alignas(16) sparse_node16 : base_node { +template +struct alignas(16) sparse_node16 : base_node { int left_idx; - int dummy; // make alignment explicit and reserve for future use __host__ __device__ sparse_node16() : left_idx(0), dummy(0) {} - sparse_node16(val_t output, - val_t split, + sparse_node16(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical, int left_index) : base_node(output, split, fid, def_left, is_leaf, is_categorical), - left_idx(left_index), - dummy(0) + left_idx(left_index) { } __host__ __device__ int left_index() const { return left_idx; } @@ -193,7 +189,7 @@ struct alignas(16) sparse_node16 : base_node { }; /** sparse_node8 is a node of reduced size (8 bytes) in a sparse forest */ -struct alignas(8) sparse_node8 : base_node { +struct alignas(8) sparse_node8 : base_node { static const int LEFT_NUM_BITS = 16; static const int FID_NUM_BITS = IS_CATEGORICAL_OFFSET - LEFT_NUM_BITS; static const int LEFT_OFFSET = FID_NUM_BITS; @@ -202,8 +198,8 @@ struct alignas(8) sparse_node8 : base_node { __host__ __device__ int fid() const { return bits & FID_MASK; } __host__ __device__ int left_index() const { return (bits & LEFT_MASK) >> LEFT_OFFSET; } sparse_node8() = default; - sparse_node8(val_t output, - val_t split, + sparse_node8(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, @@ -221,29 +217,40 @@ struct alignas(8) sparse_node8 : base_node { __host__ __device__ int left(int curr) const { return left_index(); } }; -struct dense_forest; +/// pass a functor with a templated operator()(fil_node_t), i.e. accepting one node as parameter +template +void instantiate_for_all_node_types(Stuff stuff) { + stuff(dense_node{}); + stuff(dense_node{}); + stuff(sparse_node16{}); + stuff(sparse_node16{}); + stuff(sparse_node8{}); +} + template -struct sparse_forest; +struct storage; -struct dense_storage; template -struct sparse_storage; +struct dense_forest; +template +struct sparse_forest; template struct node_traits { - using storage = sparse_storage; + using F = typename node_t::F; + using storage = ML::fil::storage; using forest = sparse_forest; static const bool IS_DENSE = false; static const storage_type_t storage_type_enum = - std::is_same() ? SPARSE : SPARSE8; + std::is_same, node_t>() ? 
SPARSE : SPARSE8; template static void check(const treelite::ModelImpl& model); }; -template <> -struct node_traits { - using storage = dense_storage; - using forest = dense_forest; +template +struct node_traits> { + using storage = storage>; + using forest = dense_forest>; static const bool IS_DENSE = true; static const storage_type_t storage_type_enum = DENSE; template @@ -288,27 +295,30 @@ enum leaf_algo_t { MAX_LEAF_ALGO = 5 }; -template +template +struct tree; + +template struct leaf_output_t { }; -template <> -struct leaf_output_t { - typedef float T; +template +struct leaf_output_t> { + typedef typename node_t::F T; }; -template <> -struct leaf_output_t { +template +struct leaf_output_t> { typedef int T; }; -template <> -struct leaf_output_t { - typedef float T; +template +struct leaf_output_t> { + typedef typename node_t::F T; }; -template <> -struct leaf_output_t { - typedef float T; +template +struct leaf_output_t> { + typedef typename node_t::F T; }; -template <> -struct leaf_output_t { +template +struct leaf_output_t> { typedef int T; }; diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 8a3d2121cd..22216a348b 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -692,8 +692,8 @@ void from_treelite(const raft::handle_t& handle, } switch (storage_type) { - case storage_type_t::DENSE: convert(handle, pforest, model, *tl_params); break; - case storage_type_t::SPARSE: convert(handle, pforest, model, *tl_params); break; + case storage_type_t::DENSE: convert>(handle, pforest, model, *tl_params); break; + case storage_type_t::SPARSE: convert>(handle, pforest, model, *tl_params); break; case storage_type_t::SPARSE8: convert(handle, pforest, model, *tl_params); break; default: ASSERT(false, "tl_params->sparse must be one of AUTO, DENSE or SPARSE"); } From abfb605f218119a41e457e65a254b704949018eb Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Thu, 10 Feb 2022 17:55:54 -0800 Subject: [PATCH 02/45] compiles? --- cpp/src/fil/common.cuh | 61 +++++++++++++++++++--------------- cpp/src/fil/fil.cu | 51 +++++++++++++++------------- cpp/src/fil/infer.cu | 20 +++++------ cpp/src/fil/internal.cuh | 38 +++++++++++---------- cpp/src/fil/treelite_import.cu | 13 +++++--- 5 files changed, 100 insertions(+), 83 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 56a98b5ac0..d5ec24b2af 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -38,9 +38,10 @@ __host__ __device__ __forceinline__ int forest_num_nodes(int num_trees, int dept return num_trees * tree_num_nodes(depth); } +template struct storage_base { categorical_sets sets_; - float* vector_leaf_; + F* vector_leaf_; bool cats_present() const { return sets_.cats_present(); } }; @@ -51,24 +52,25 @@ struct tree> : tree_base { : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { } - __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; } + __host__ __device__ const dense_node& operator[](int i) const + { + return nodes_[i * node_pitch_]; + } dense_node* nodes_ = nullptr; - int node_pitch_ = 0; + int node_pitch_ = 0; }; -using std::enable_if; - -/** dense_storage stores the forest as a collection of dense nodes */ +/** partial specialization of storage. 
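(Selected when the node type argument is dense_node<F>; sparse node types fall through to the primary template further below.)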
Stores the forest on GPU as a collection of dense nodes */ template -struct storage> : storage_base { +struct storage> : storage_base { using node_t = dense_node; __host__ __device__ storage(categorical_sets cat_sets, - float* vector_leaf, - node_t* nodes, - int num_trees, - int tree_stride, - int node_pitch) - : storage_base{cat_sets, vector_leaf}, + F* vector_leaf, + node_t* nodes, + int num_trees, + int tree_stride, + int node_pitch) + : storage_base{cat_sets, vector_leaf}, nodes_(nodes), num_trees_(num_trees), tree_stride_(tree_stride), @@ -78,12 +80,13 @@ struct storage> : storage_base { __host__ __device__ int num_trees() const { return num_trees_; } __host__ __device__ tree operator[](int i) const { - return tree(sets_, nodes_ + i * tree_stride_, node_pitch_); + // sets_ is a dependent name (in template sense) + return tree(this->sets_, nodes_ + i * tree_stride_, node_pitch_); } - node_t* nodes_ = nullptr; - int num_trees_ = 0; - int tree_stride_ = 0; - int node_pitch_ = 0; + node_t* nodes_ = nullptr; + int num_trees_ = 0; + int tree_stride_ = 0; + int node_pitch_ = 0; }; /** sparse tree */ @@ -97,21 +100,23 @@ struct tree : tree_base { node_t* nodes_ = nullptr; }; -/** sparse_storage stores the forest as a collection of sparse nodes */ +/** storage stores the forest on GPU as a collection of sparse nodes */ template -struct storage : storage_base { +struct storage : storage_base { + using F = typename node_t::F; int* trees_ = nullptr; node_t* nodes_ = nullptr; int num_trees_ = 0; - __host__ __device__ storage( - categorical_sets cat_sets, float* vector_leaf, int* trees, node_t* nodes, int num_trees) - : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees) + __host__ __device__ + storage(categorical_sets cat_sets, F* vector_leaf, int* trees, node_t* nodes, int num_trees) + : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees) { } __host__ __device__ int num_trees() const { return num_trees_; } __host__ __device__ tree operator[](int i) const { - return tree(sets_, &nodes_[trees_[i]]); + // sets_ is a dependent name (in template sense) + return tree(this->sets_, &nodes_[trees_[i]]); } }; @@ -147,6 +152,8 @@ struct shmem_size_params { int block_dim_x = 0; /// shm_sz is the associated shared memory footprint int shm_sz = INT_MAX; + /// sizeof_fp_vars is the size in bytes of all floating-point variables during inference + std::size_t sizeof_fp_vars = 4; __host__ __device__ int sdata_stride() { @@ -154,7 +161,7 @@ struct shmem_size_params { } __host__ __device__ int cols_shmem_size() { - return cols_in_shmem ? sizeof(float) * sdata_stride() * n_items << log2_threads_per_tree : 0; + return cols_in_shmem ? sizeof_fp_vars * sdata_stride() * n_items << log2_threads_per_tree : 0; } template size_t get_smem_footprint(); @@ -169,8 +176,8 @@ struct predict_params : shmem_size_params { int num_outputs; // Data parameters. - float* preds; - const float* data; + void* preds; + const void* data; // number of data rows (instances) to predict on int64_t num_rows; diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 148fe845b2..594d90ad0d 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -87,7 +87,7 @@ struct forest { we would have otherwise silently overflowed the index calculation due to short division. It would have failed cpp tests, but we might forget about this source of bugs, if not for the failing assert. 
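As a concrete illustration of the limit being guarded: with an unsigned 16-bit index and 8-byte double-precision values, the assert below tolerates up to 8 * 65535 = 524,280 bytes of dynamic shared memory before the index arithmetic could overflow.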
*/ - ASSERT(max_shm_ < int(sizeof(float)) * std::numeric_limits::max(), + ASSERT(max_shm_ < int(proba_ssp_.sizeof_fp_vars) * std::numeric_limits::max(), "internal error: please use a larger type inside" " infer_k for column count"); } @@ -128,9 +128,10 @@ struct forest { fixed_block_count_ = blocks_per_sm * sm_count; } + template void init_common(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const forest_params_t* params) { depth_ = params->depth; @@ -145,6 +146,7 @@ struct forest { proba_ssp_.num_cols = params->num_cols; proba_ssp_.num_classes = params->num_classes; proba_ssp_.cats_present = cat_sets.cats_present(); + proba_ssp_.sizeof_fp_vars = sizeof(F); class_ssp_ = proba_ssp_; int device = h.get_device(); @@ -155,11 +157,11 @@ struct forest { // vector leaf if (!vector_leaf.empty()) { - vector_leaf_.resize(vector_leaf.size(), stream); + vector_leaf_.resize(vector_leaf.size() * sizeof(F), stream); RAFT_CUDA_TRY(cudaMemcpyAsync(vector_leaf_.data(), vector_leaf.data(), - vector_leaf.size() * sizeof(float), + vector_leaf.size() * sizeof(F), cudaMemcpyHostToDevice, stream)); } @@ -328,7 +330,7 @@ struct forest { int fixed_block_count_ = 0; int max_shm_ = 0; // Optionally used - rmm::device_uvector vector_leaf_; + rmm::device_uvector vector_leaf_; cat_sets_device_owner cat_sets_; }; @@ -410,7 +412,7 @@ struct dense_forest> : forest { h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem(max_shm_), + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), static_cast(class_ssp_)); // copy must be finished before freeing the host data RAFT_CUDA_TRY(cudaStreamSynchronize(h.get_stream())); @@ -420,12 +422,12 @@ struct dense_forest> : forest { virtual void infer(predict_params params, cudaStream_t stream) override { - storage forest(cat_sets_.accessor(), - vector_leaf_.data(), - nodes_.data(), - num_trees_, - algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, - algo_ == algo_t::NAIVE ? 1 : num_trees_); + storage forest(cat_sets_.accessor(), + reinterpret_cast(vector_leaf_.data()), + nodes_.data(), + num_trees_, + algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, + algo_ == algo_t::NAIVE ? 
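/* Layout note for the two ternaries in this constructor call: NAIVE keeps
   each tree contiguous (tree_stride = nodes-per-tree, node_pitch = 1),
   while the tree-reorg algorithms interleave the trees so that node i of
   every tree is adjacent (tree_stride = 1, node_pitch = num_trees_). */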
1 : num_trees_); fil::infer(forest, params, stream); } @@ -475,8 +477,11 @@ struct sparse_forest : forest { virtual void infer(predict_params params, cudaStream_t stream) override { - storage forest( - cat_sets_.accessor(), vector_leaf_.data(), trees_.data(), nodes_.data(), num_trees_); + storage forest(cat_sets_.accessor(), + reinterpret_cast(vector_leaf_.data()), + trees_.data(), + nodes_.data(), + num_trees_); fil::infer(forest, params, stream); } @@ -570,11 +575,11 @@ void check_params(const forest_params_t* params, bool dense) /** initializes a forest of any type * When fil_node_t == dense_node, const int* trees is ignored */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params) @@ -588,14 +593,12 @@ void init(const raft::handle_t& h, struct instantiate_forest_init { template - void operator()(fil_node_t) { - init(raft::handle_t{}, - (forest_t*)nullptr, - categorical_sets{}, - std::vector{}, - (int*)nullptr, - (fil_node_t*)nullptr, - (forest_params_t*)nullptr); + void operator()(fil_node_t) + { + if constexpr (std::is_same()) + init({}, {}, {}, std::vector(), {}, {}, {}); + else + init({}, {}, {}, std::vector(), {}, {}, {}); } }; diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 3e4d5eba56..88cf63375e 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -800,7 +800,7 @@ __global__ void infer_k(storage_type forest, predict_params params) block_row0 += rows_per_block * gridDim.x) { int block_num_rows = max(0, (int)min((int64_t)rows_per_block, (int64_t)params.num_rows - block_row0)); - const float* block_input = params.data + block_row0 * num_cols; + const float* block_input = reinterpret_cast(params.data) + block_row0 * num_cols; if constexpr (cols_in_shmem) load_data(sdata, block_input, params, rows_per_block, block_num_rows); @@ -820,7 +820,7 @@ __global__ void infer_k(storage_type forest, predict_params params) and is made exact below. Same with thread_num_rows > 0 */ - typedef typename leaf_output_t::T pred_t; + typedef typename leaf_output_t::T pred_t; vec prediction; if (tree < forest.num_trees() && thread_num_rows != 0) { prediction = infer_one_tree( @@ -833,7 +833,7 @@ __global__ void infer_k(storage_type forest, predict_params params) // Dummy threads can be marked as having 0 rows acc.accumulate(prediction, tree, tree < forest.num_trees() ? 
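/* Aside on the instantiate_forest_init functor above (a sketch of the
   idiom, not a new API): instantiating
   instantiate_for_all_node_types(instantiate_forest_init{}) makes the
   compiler emit init<fil_node_t>() for every supported node type in one
   place, replacing the previous hand-written list of explicit
   instantiations of init(). */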
thread_num_rows : 0); }
-  acc.finalize(params.preds + params.num_outputs * block_row0,
+  acc.finalize(reinterpret_cast<float*>(params.preds) + params.num_outputs * block_row0,
               block_num_rows,
               params.num_outputs,
               params.transform,
@@ -893,15 +893,15 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream)
   dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params);
 }
-template void infer<storage<dense_node<float>>>(storage<dense_node<float>> forest,
-                                                predict_params params,
-                                                cudaStream_t stream);
+template void infer<storage<dense_node<float>>>(storage<dense_node<float>> forest,
+                                                 predict_params params,
+                                                 cudaStream_t stream);
 template void infer<storage<sparse_node16<float>>>(storage<sparse_node16<float>> forest,
-                                                   predict_params params,
-                                                   cudaStream_t stream);
+                                                    predict_params params,
+                                                    cudaStream_t stream);
 template void infer<storage<sparse_node8>>(storage<sparse_node8> forest,
-                                           predict_params params,
-                                           cudaStream_t stream);
+                                            predict_params params,
+                                            cudaStream_t stream);
 } // namespace fil
 } // namespace ML
diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh
index 71e36e0ced..b75947046d 100644
--- a/cpp/src/fil/internal.cuh
+++ b/cpp/src/fil/internal.cuh
@@ -27,8 +27,8 @@
 #include
 #include
 #include
-#include
 #include
+#include
 namespace raft {
 class handle_t;
@@ -90,7 +90,7 @@ enum output_t {
 };
 /** val_t is the payload within a FIL leaf */
-template
+template
 union val_t {
   /** floating-point threshold value for parent node or output value
       (e.g. class probability or regression summand) for leaf node */
@@ -100,9 +100,9 @@ union val_t {
 };
 /** base_node contains common implementation details for dense and sparse nodes */
-template
+template
 struct base_node {
-  using F = F_; // floating-point type
+  using F = F_;  // floating-point type
   /** val, for parent nodes, is a threshold or category list offset. For leaf
       nodes, it is the tree prediction (see see leaf_output_t::T) */
   val_t<F> val;
@@ -119,10 +119,11 @@ struct base_node {
   static const int IS_CATEGORICAL_MASK = 1 << IS_CATEGORICAL_OFFSET;
   static const int FID_MASK = (1 << IS_CATEGORICAL_OFFSET) - 1;
   template <typename o_t>
-  __host__ __device__ o_t output() const {
-    if constexpr(std::is_same<o_t, int>()) {
+  __host__ __device__ o_t output() const
+  {
+    if constexpr (std::is_same<o_t, int>()) {
       return val.idx;
-    } else if constexpr(std::is_same<o_t, F>()) {
+    } else if constexpr (std::is_same<o_t, F>()) {
       return val.f;
     } else {
       return val;
     }
   }
   __host__ __device__ int set() const { return val.idx; }
   __host__ __device__ base_node() : val{}, bits(0) {}
-  base_node(val_t<F> output, val_t<F> split, int fid, bool def_left, bool is_leaf, bool is_categorical)
+  base_node(
+    val_t<F> output, val_t<F> split, int fid, bool def_left, bool is_leaf, bool is_categorical)
   {
     RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into base_node");
     bits = (fid & FID_MASK) | (def_left ? DEF_LEFT_MASK : 0) | (is_leaf ? 
IS_LEAF_MASK : 0) | @@ -149,7 +151,7 @@ struct base_node { }; /** dense_node is a single node of a dense forest */ -template +template struct alignas(8) dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite @@ -160,7 +162,7 @@ struct alignas(8) dense_node : base_node { bool is_leaf, bool is_categorical, int left_index = -1) - : base_node(output, split, fid, def_left, is_leaf, is_categorical) + : base_node(output, split, fid, def_left, is_leaf, is_categorical) { } /** index of the left child, where curr is the index of the current node */ @@ -168,10 +170,10 @@ struct alignas(8) dense_node : base_node { }; /** sparse_node16 is a 16-byte node in a sparse forest */ -template +template struct alignas(16) sparse_node16 : base_node { int left_idx; - __host__ __device__ sparse_node16() : left_idx(0), dummy(0) {} + __host__ __device__ sparse_node16() : left_idx(0) {} sparse_node16(val_t output, val_t split, int fid, @@ -179,8 +181,7 @@ struct alignas(16) sparse_node16 : base_node { bool is_leaf, bool is_categorical, int left_index) - : base_node(output, split, fid, def_left, is_leaf, is_categorical), - left_idx(left_index) + : base_node(output, split, fid, def_left, is_leaf, is_categorical), left_idx(left_index) { } __host__ __device__ int left_index() const { return left_idx; } @@ -205,7 +206,7 @@ struct alignas(8) sparse_node8 : base_node { bool is_leaf, bool is_categorical, int left_index) - : base_node(output, split, fid, def_left, is_leaf, is_categorical) + : base_node(output, split, fid, def_left, is_leaf, is_categorical) { RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into sparse_node8"); @@ -218,8 +219,9 @@ struct alignas(8) sparse_node8 : base_node { }; /// pass a functor with a templated operator()(fil_node_t), i.e. 
accepting one node as parameter -template -void instantiate_for_all_node_types(Stuff stuff) { +template +void instantiate_for_all_node_types(Stuff stuff) +{ stuff(dense_node{}); stuff(dense_node{}); stuff(sparse_node16{}); @@ -237,7 +239,7 @@ struct sparse_forest; template struct node_traits { - using F = typename node_t::F; + using F = typename node_t::F; using storage = ML::fil::storage; using forest = sparse_forest; static const bool IS_DENSE = false; diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 22216a348b..ede325a09d 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -324,7 +324,7 @@ conversion_state tl2fil_inner_node(int fil_left_child, std::size_t* bit_pool_offset) { int tl_left = tree.LeftChild(tl_node_id), tl_right = tree.RightChild(tl_node_id); - val_t split = {.f = NAN}; // yes there's a default initializer already + val_t split = {.f = NAN}; // yes there's a default initializer already int feature_id = tree.SplitIndex(tl_node_id); bool is_categorical = tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical && tree.MatchingCategories(tl_node_id).size() > 0; @@ -352,7 +352,8 @@ conversion_state tl2fil_inner_node(int fil_left_child, ASSERT(false, "only numerical and categorical split nodes are supported"); } bool default_left = tree.DefaultLeft(tl_node_id) ^ swap_child_nodes; - fil_node_t node(val_t{}, split, feature_id, default_left, false, is_categorical, fil_left_child); + fil_node_t node( + val_t{}, split, feature_id, default_left, false, is_categorical, fil_left_child); return conversion_state{node, swap_child_nodes}; } @@ -692,8 +693,12 @@ void from_treelite(const raft::handle_t& handle, } switch (storage_type) { - case storage_type_t::DENSE: convert>(handle, pforest, model, *tl_params); break; - case storage_type_t::SPARSE: convert>(handle, pforest, model, *tl_params); break; + case storage_type_t::DENSE: + convert>(handle, pforest, model, *tl_params); + break; + case storage_type_t::SPARSE: + convert>(handle, pforest, model, *tl_params); + break; case storage_type_t::SPARSE8: convert(handle, pforest, model, *tl_params); break; default: ASSERT(false, "tl_params->sparse must be one of AUTO, DENSE or SPARSE"); } From 96e2ae6ce85f798417ad90dfca58f36b19068d40 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Thu, 10 Feb 2022 18:54:56 -0800 Subject: [PATCH 03/45] simplify leaf_output_t --- cpp/src/fil/infer.cu | 2 +- cpp/src/fil/internal.cuh | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 88cf63375e..98afb656f2 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -820,7 +820,7 @@ __global__ void infer_k(storage_type forest, predict_params params) and is made exact below. 
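For example (illustration only): with NITEMS = 4 and block_num_rows = 10, the thread group that starts at row 8 initially claims 4 rows and is then clamped to the 2 rows that actually exist.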
Same with thread_num_rows > 0 */ - typedef typename leaf_output_t::T pred_t; + typedef typename leaf_output_t::T pred_t; vec prediction; if (tree < forest.num_trees() && thread_num_rows != 0) { prediction = infer_one_tree( diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index b75947046d..0295cde8d1 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -300,27 +300,27 @@ enum leaf_algo_t { template struct tree; -template +template struct leaf_output_t { }; -template -struct leaf_output_t> { - typedef typename node_t::F T; +template +struct leaf_output_t { + typedef F T; }; -template -struct leaf_output_t> { +template +struct leaf_output_t { typedef int T; }; -template -struct leaf_output_t> { - typedef typename node_t::F T; +template +struct leaf_output_t { + typedef F T; }; -template -struct leaf_output_t> { - typedef typename node_t::F T; +template +struct leaf_output_t { + typedef F T; }; -template -struct leaf_output_t> { +template +struct leaf_output_t { typedef int T; }; From 35ad5d9fcd5e42c8b4ae8de00913084dca7b3834 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Tue, 15 Feb 2022 16:43:24 -0800 Subject: [PATCH 04/45] draft --- cpp/src/fil/common.cuh | 8 +- cpp/src/fil/fil.cu | 45 +++++--- cpp/src/fil/infer.cu | 183 +++++++++++++++++---------------- cpp/src/fil/internal.cuh | 8 +- cpp/src/fil/treelite_import.cu | 11 +- 5 files changed, 143 insertions(+), 112 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index d5ec24b2af..18d935213a 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -46,8 +46,9 @@ struct storage_base { }; /** represents a dense tree */ -template -struct tree> : tree_base { +template +struct tree> : tree_base { + using F = F_; __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { @@ -92,6 +93,7 @@ struct storage> : storage_base { /** sparse tree */ template struct tree : tree_base { + using F = typename node_t::F; __host__ __device__ tree(categorical_sets cat_sets, node_t* nodes) : tree_base{cat_sets}, nodes_(nodes) { @@ -163,7 +165,7 @@ struct shmem_size_params { { return cols_in_shmem ? sizeof_fp_vars * sdata_stride() * n_items << log2_threads_per_tree : 0; } - template + template size_t get_smem_footprint(); }; diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index e534d16733..19c0aa64b1 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -34,6 +34,7 @@ creation and prediction (the main inference kernel is defined in infer.cu). */ namespace ML { namespace fil { +__host__ __device__ double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); } __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } /** performs additional transformations on the array of forest predictions @@ -41,19 +42,20 @@ __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } averaging (multiplying by inv_num_trees), adding global_bias (always done), sigmoid and applying threshold. 
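For example, with output == output_t(AVG | SIGMOID), a raw margin m becomes sigmoid(m * inv_num_trees + global_bias);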
in case of complement_proba, fills in the complement probability */ -__global__ void transform_k(float* preds, +template +__global__ void transform_k(F* preds, size_t n, output_t output, - float inv_num_trees, - float threshold, - float global_bias, + F inv_num_trees, + F threshold, + F global_bias, bool complement_proba) { size_t i = threadIdx.x + size_t(blockIdx.x) * blockDim.x; if (i >= n) return; if (complement_proba && i % 2 != 0) return; - float result = preds[i]; + F result = preds[i]; if ((output & output_t::AVG) != 0) result *= inv_num_trees; result += global_bias; if ((output & output_t::SIGMOID) != 0) result = sigmoid(result); @@ -172,8 +174,9 @@ struct forest { virtual void infer(predict_params params, cudaStream_t stream) = 0; + template void predict( - const raft::handle_t& h, float* preds, const float* data, size_t num_rows, bool predict_proba) + const raft::handle_t& h, F* preds, const F* data, size_t num_rows, bool predict_proba) { // Initialize prediction parameters. predict_params params(predict_proba ? proba_ssp_ : class_ssp_); @@ -252,7 +255,7 @@ struct forest { // Simulating treelite order, which cancels out bias. // If non-proba prediction used, it still will not matter // for the same reason softmax will not. - float global_bias = (ot & output_t::SOFTMAX) != 0 ? 0.0f : global_bias_; + F global_bias = (ot & output_t::SOFTMAX) != 0 ? F(0.0) : global_bias_; bool complement_proba = false, do_transform; if (predict_proba) { @@ -269,17 +272,17 @@ struct forest { // for GROVE_PER_CLASS, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; - do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != 0.0f; + do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != F(0.0); break; case leaf_algo_t::CATEGORICAL_LEAF: params.num_outputs = params.num_classes; - do_transform = ot != output_t::RAW || global_bias_ != 0.0f; + do_transform = ot != output_t::RAW || global_bias_ != F(0.0); break; case leaf_algo_t::VECTOR_LEAF: // for VECTOR_LEAF, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; - do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != 0.0f; + do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != F(0.0); break; default: ASSERT(false, "internal error: predict: invalid leaf_algo %d", params.leaf_algo); } @@ -304,9 +307,9 @@ struct forest { preds, num_values_to_transform, ot, - num_trees_ > 0 ? (1.0f / num_trees_) : 1.0f, - threshold_, - global_bias, + num_trees_ > 0 ? 
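/* the expression below implements output_t::AVG: transform_k receives
   inv_num_trees and multiplies by it, turning the accumulated sum of
   per-tree outputs into their mean */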
(F(1.0) / num_trees_) : F(1.0), + F(threshold_), + F(global_bias), complement_proba); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -324,8 +327,8 @@ struct forest { int depth_ = 0; algo_t algo_ = algo_t::NAIVE; output_t output_ = output_t::RAW; - float threshold_ = 0.5; - float global_bias_ = 0; + double threshold_ = 0.5; + double global_bias_ = 0; shmem_size_params class_ssp_, proba_ssp_; int fixed_block_count_ = 0; int max_shm_ = 0; @@ -610,6 +613,18 @@ void free(const raft::handle_t& h, forest_t f) delete f; } +/// part of C API - overload instead of template +void predict(const raft::handle_t& h, + forest_t f, + double* preds, + const double* data, + size_t num_rows, + bool predict_proba) +{ + f->predict(h, preds, data, num_rows, predict_proba); +} + +/// part of C API - overload instead of template void predict(const raft::handle_t& h, forest_t f, float* preds, diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 98afb656f2..383c0641c6 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -41,7 +41,7 @@ namespace ML { namespace fil { -// vec wraps float[N] for cub::BlockReduce +// vec wraps float[N], int[N] or double[N] for cub::BlockReduce template struct vec; @@ -96,33 +96,34 @@ struct vec { } }; -struct best_margin_label : cub::KeyValuePair { - __host__ __device__ best_margin_label(cub::KeyValuePair pair) - : cub::KeyValuePair(pair) +template +struct best_margin_label : cub::KeyValuePair { + __host__ __device__ best_margin_label(cub::KeyValuePair pair) + : cub::KeyValuePair(pair) { } - __host__ __device__ best_margin_label(int c = 0, float f = -INFINITY) - : cub::KeyValuePair({c, f}) + __host__ __device__ best_margin_label(int c = 0, F f = -INFINITY) + : cub::KeyValuePair({c, f}) { } }; -template -__device__ __forceinline__ vec to_vec(int c, vec margin) +template +__device__ __forceinline__ vec> to_vec(int c, vec margin) { - vec ret; + vec> ret; CUDA_PRAGMA_UNROLL for (int i = 0; i < NITEMS; ++i) - ret[i] = best_margin_label(c, margin[i]); + ret[i] = best_margin_label(c, margin[i]); return ret; } struct ArgMax { - template - __host__ __device__ __forceinline__ vec operator()( - vec a, vec b) const + template + __host__ __device__ __forceinline__ vec> operator()( + vec> a, vec> b) const { - vec c; + vec> c; CUDA_PRAGMA_UNROLL for (int i = 0; i < NITEMS; i++) c[i] = cub::ArgMax()(a[i], b[i]); @@ -155,7 +156,7 @@ __device__ __forceinline__ vec tree_leaf_output(tree_type t template __device__ __forceinline__ vec infer_one_tree(tree_type tree, - const float* input, + const typename tree_type::F* input, int cols, int n_rows) { @@ -186,7 +187,7 @@ __device__ __forceinline__ vec infer_one_tree(tree_type tre template __device__ __forceinline__ vec<1, output_type> infer_one_tree(tree_type tree, - const float* input, + const typename tree_type::F* input, int cols, int rows) { @@ -219,19 +220,19 @@ host code. See https://rapids.ai/start.html as well as cmake defaults. */ // values below are defaults as of this change. 
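// A minimal sketch (illustration only, not part of the patch; the name
// example_scratch_bytes is hypothetical) of what the two helpers below
// measure: the cub::BlockReduce TempStorage needed when each thread
// contributes one vec<NITEMS, F>. Doubling sizeof(F) roughly doubles this
// footprint, which is why F joins NITEMS as a template parameter.
//
//   template <int NITEMS, typename F>
//   size_t example_scratch_bytes()
//   {
//     using reduce_t = cub::
//       BlockReduce<vec<NITEMS, F>, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>;
//     return sizeof(typename reduce_t::TempStorage);  // compare F = float vs F = double
//   }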
-template +template size_t block_reduce_footprint_host() { return sizeof( typename cub:: - BlockReduce, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>:: + BlockReduce, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>:: TempStorage); } -template +template size_t block_reduce_best_class_footprint_host() { - return sizeof(typename cub::BlockReduce, + return sizeof(typename cub::BlockReduce>, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, @@ -249,9 +250,10 @@ __device__ __forceinline__ T block_reduce(T value, BinaryOp op, void* storage) } template // = FLOAT_UNARY_BINARY struct tree_aggregator_t { - vec acc; + vec acc; void* tmp_storage; /** shared memory footprint of the accumulator during @@ -263,7 +265,7 @@ struct tree_aggregator_t { int log2_threads_per_tree, bool predict_proba) { - return log2_threads_per_tree != 0 ? FIL_TPB * NITEMS * sizeof(float) + return log2_threads_per_tree != 0 ? FIL_TPB * NITEMS * sizeof(F) : block_reduce_footprint_host(); } @@ -278,19 +280,19 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) + F* vector_leaf) : tmp_storage(finalize_workspace) { } - __device__ __forceinline__ void accumulate(vec single_tree_prediction, + __device__ __forceinline__ void accumulate(vec single_tree_prediction, int tree, int thread_num_rows) { acc += single_tree_prediction; } - __device__ INLINE_CONFIG void finalize(float* block_out, + __device__ INLINE_CONFIG void finalize(F* block_out, int block_num_rows, int output_stride, output_t transform, @@ -303,7 +305,7 @@ struct tree_aggregator_t { if (log2_threads_per_tree == 0) { acc = block_reduce(acc, vectorized(cub::Sum()), tmp_storage); } else { - auto per_thread = (vec*)tmp_storage; + auto per_thread = (vec*)tmp_storage; per_thread[threadIdx.x] = acc; __syncthreads(); // We have two pertinent cases for splitting FIL_TPB == 256 values: @@ -349,13 +351,13 @@ __device__ __forceinline__ auto allreduce_shmem(Iterator begin, // *begin and *end shall be struct vec // tmp_storage may overlap shared memory addressed by [begin, end) -template +template __device__ __forceinline__ void write_best_class( - Iterator begin, Iterator end, void* tmp_storage, float* out, int num_rows) + Iterator begin, Iterator end, void* tmp_storage, F* out, int num_rows) { // reduce per-class candidate margins to one best class candidate // per thread (for each of the NITEMS rows) - auto best = vecNITEMS, best_margin_label>(); + auto best = vecNITEMS, best_margin_label>(); for (int c = threadIdx.x; c < end - begin; c += blockDim.x) best = vectorized(cub::ArgMax())(best, to_vec(c, begin[c])); // [begin, end) may overlap tmp_storage @@ -370,6 +372,7 @@ __device__ __forceinline__ void write_best_class( } /// needed for softmax +__device__ double shifted_exp(double margin, double max) { return exp(margin - max); } __device__ float shifted_exp(float margin, float max) { return expf(margin - max); } // *begin and *end shall be struct vec @@ -391,13 +394,13 @@ __device__ __forceinline__ void block_softmax(Iterator begin, Iterator end, void // *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory addressed by [begin, end) -template +template __device__ __forceinline__ void normalize_softmax_and_write(Iterator begin, Iterator end, output_t transform, int trees_per_class, void* tmp_storage, - float* out, + F* out, int num_rows) { if ((transform & output_t::AVG) != 0) { @@ -416,13 +419,13 @@ __device__ 
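/* Numerical-stability note on block_softmax above: shifted_exp subtracts
   the block-wide maximum before exponentiating, i.e. it computes
   softmax(x)_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j),
   which cannot overflow for large margins yet yields the same result. */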
__forceinline__ void normalize_softmax_and_write(Iterator begin, // *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory addressed by [begin, end) // in case num_outputs > 1 -template +template __device__ __forceinline__ void class_margins_to_global_memory(Iterator begin, Iterator end, output_t transform, int trees_per_class, void* tmp_storage, - float* out, + F* out, int num_rows, int num_outputs) { @@ -435,11 +438,11 @@ __device__ __forceinline__ void class_margins_to_global_memory(Iterator begin, } } -template -struct tree_aggregator_t { - vec acc; +template +struct tree_aggregator_t { + vec acc; int num_classes; - vec* per_thread; + vec* per_thread; void* tmp_storage; static size_t smem_finalize_footprint(size_t data_row_size, @@ -447,7 +450,7 @@ struct tree_aggregator_t { int log2_threads_per_tree, bool predict_proba) { - size_t phase1 = (FIL_TPB - FIL_TPB % num_classes) * sizeof(vec); + size_t phase1 = (FIL_TPB - FIL_TPB % num_classes) * sizeof(vec); size_t phase2 = predict_proba ? block_reduce_footprint_host() : block_reduce_best_class_footprint_host(); return predict_proba ? phase1 + phase2 : std::max(phase1, phase2); @@ -458,21 +461,21 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) + F* vector_leaf) : num_classes(params.num_classes), - per_thread((vec*)finalize_workspace), + per_thread((vec*)finalize_workspace), tmp_storage(params.predict_proba ? per_thread + num_classes : finalize_workspace) { } - __device__ __forceinline__ void accumulate(vec single_tree_prediction, + __device__ __forceinline__ void accumulate(vec single_tree_prediction, int tree, int thread_num_rows) { acc += single_tree_prediction; } - __device__ INLINE_CONFIG void finalize(float* out, + __device__ INLINE_CONFIG void finalize(F* out, int num_rows, int num_outputs, output_t transform, @@ -498,11 +501,11 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { - vec acc; +template +struct tree_aggregator_t { + vec acc; /// at first, per class margin, then, possibly, different softmax partials - vec* per_class_margin; + vec* per_class_margin; void* tmp_storage; int num_classes; @@ -519,23 +522,23 @@ struct tree_aggregator_t { static __host__ __device__ size_t smem_accumulate_footprint(int num_classes) { - return num_classes * sizeof(vec); + return num_classes * sizeof(vec); } __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) - : per_class_margin((vec*)accumulate_workspace), + F* vector_leaf) + : per_class_margin((vec*)accumulate_workspace), tmp_storage(params.predict_proba ? 
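/* aliasing note for the ternary on the next line: when predict_proba is
   set, the softmax pass still needs per_class_margin after the reduction,
   so tmp_storage is placed just past it; otherwise the two regions may
   share shared memory, matching smem_finalize_footprint above */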
per_class_margin + num_classes : finalize_workspace), num_classes(params.num_classes) { for (int c = threadIdx.x; c < num_classes; c += blockDim.x) - per_class_margin[c] = vec(0); + per_class_margin[c] = vec(0); // __syncthreads() is called in infer_k } - __device__ __forceinline__ void accumulate(vec single_tree_prediction, + __device__ __forceinline__ void accumulate(vec single_tree_prediction, int tree, int thread_num_rows) { @@ -544,7 +547,7 @@ struct tree_aggregator_t { __syncthreads(); } - __device__ INLINE_CONFIG void finalize(float* out, + __device__ INLINE_CONFIG void finalize(F* out, int num_rows, int num_outputs, output_t transform, @@ -562,17 +565,17 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { +template +struct tree_aggregator_t { // per_class_margin is a row-major matrix // of size num_threads_per_class * num_classes // used to acccumulate class values - vec* per_class_margin; + vec* per_class_margin; vec* vector_leaf_indices; int* thread_num_rows; int num_classes; int num_threads_per_class; - float* vector_leaf; + F* vector_leaf; void* tmp_storage; static size_t smem_finalize_footprint(size_t data_row_size, @@ -587,14 +590,14 @@ struct tree_aggregator_t { } static size_t smem_accumulate_footprint(int num_classes) { - return sizeof(vec) * num_classes * max(1, FIL_TPB / num_classes) + + return sizeof(vec) * num_classes * max(1, FIL_TPB / num_classes) + sizeof(vec) * FIL_TPB + sizeof(int) * FIL_TPB; } __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) + F* vector_leaf) : num_classes(params.num_classes), num_threads_per_class(max(1, blockDim.x / params.num_classes)), vector_leaf(vector_leaf), @@ -602,15 +605,15 @@ struct tree_aggregator_t { { // Assign workspace char* ptr = (char*)accumulate_workspace; - per_class_margin = (vec*)ptr; - ptr += sizeof(vec) * num_classes * num_threads_per_class; + per_class_margin = (vec*)ptr; + ptr += sizeof(vec) * num_classes * num_threads_per_class; vector_leaf_indices = (vec*)ptr; ptr += sizeof(vec) * blockDim.x; thread_num_rows = (int*)ptr; // Initialise shared memory for (int i = threadIdx.x; i < num_classes * num_threads_per_class; i += blockDim.x) { - per_class_margin[i] = vec(); + per_class_margin[i] = vec(); } vector_leaf_indices[threadIdx.x] = vec(); thread_num_rows[threadIdx.x] = 0; @@ -637,13 +640,13 @@ struct tree_aggregator_t { // we have num_classes threads for each j for (int j = i / num_classes; j < blockDim.x; j += num_threads_per_class) { for (int item = 0; item < thread_num_rows[j]; ++item) { - float pred = vector_leaf[vector_leaf_indices[j][item] * num_classes + c]; + F pred = vector_leaf[vector_leaf_indices[j][item] * num_classes + c]; per_class_margin[i][item] += pred; } } } } - __device__ INLINE_CONFIG void finalize(float* out, + __device__ INLINE_CONFIG void finalize(F* out, int num_rows, int num_outputs, output_t transform, @@ -668,8 +671,8 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { +template +struct tree_aggregator_t { // could switch to uint16_t to save shared memory // provided raft::myAtomicAdd(short*) simulated with appropriate shifts int* votes; @@ -691,7 +694,7 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - float* vector_leaf) + F* vector_leaf) : num_classes(params.num_classes), votes((int*)accumulate_workspace) { for (int c = threadIdx.x; c < num_classes; c += 
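/* votes layout (as consumed by finalize_class_label below): one integer
   counter per (class, row-in-block) pair; each tree increments the counter
   of its predicted class label, and the class with the most votes wins */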
FIL_TPB * NITEMS) @@ -712,7 +715,7 @@ struct tree_aggregator_t { } // class probabilities or regression. for regression, num_classes // is just the number of outputs for each data instance - __device__ __forceinline__ void finalize_multiple_outputs(float* out, int num_rows) + __device__ __forceinline__ void finalize_multiple_outputs(F* out, int num_rows) { __syncthreads(); for (int c = threadIdx.x; c < num_classes; c += blockDim.x) { @@ -723,7 +726,7 @@ struct tree_aggregator_t { } // using this when predicting a single class label, as opposed to sparse class vector // or class probabilities or regression - __device__ __forceinline__ void finalize_class_label(float* out, int num_rows) + __device__ __forceinline__ void finalize_class_label(F* out, int num_rows) { __syncthreads(); // make sure all votes[] are final int item = threadIdx.x; @@ -740,7 +743,7 @@ struct tree_aggregator_t { out[row] = best_class; } } - __device__ INLINE_CONFIG void finalize(float* out, + __device__ INLINE_CONFIG void finalize(F* out, int num_rows, int num_outputs, output_t transform, @@ -756,8 +759,9 @@ struct tree_aggregator_t { } }; -__device__ INLINE_CONFIG void load_data(float* sdata, - const float* block_input, +template +__device__ INLINE_CONFIG void load_data(F* sdata, + const F* block_input, predict_params params, int rows_per_block, int block_num_rows) @@ -784,6 +788,7 @@ __device__ INLINE_CONFIG void load_data(float* sdata, } template (params.data) + block_row0 * num_cols; + const F* block_input = reinterpret_cast(params.data) + block_row0 * num_cols; if constexpr (cols_in_shmem) load_data(sdata, block_input, params, rows_per_block, block_num_rows); - tree_aggregator_t acc( + tree_aggregator_t acc( params, (char*)sdata + params.cols_shmem_size(), sdata, forest.vector_leaf_); __syncthreads(); // for both row cache init and acc init @@ -820,7 +825,7 @@ __global__ void infer_k(storage_type forest, predict_params params) and is made exact below. Same with thread_num_rows > 0 */ - typedef typename leaf_output_t::T pred_t; + typedef typename leaf_output_t::T pred_t; vec prediction; if (tree < forest.num_trees() && thread_num_rows != 0) { prediction = infer_one_tree( @@ -833,7 +838,7 @@ __global__ void infer_k(storage_type forest, predict_params params) // Dummy threads can be marked as having 0 rows acc.accumulate(prediction, tree, tree < forest.num_trees() ? 
thread_num_rows : 0); } - acc.finalize(reinterpret_cast(params.preds) + params.num_outputs * block_row0, + acc.finalize(reinterpret_cast(params.preds) + params.num_outputs * block_row0, block_num_rows, params.num_outputs, params.transform, @@ -843,13 +848,13 @@ __global__ void infer_k(storage_type forest, predict_params params) } } -template +template size_t shmem_size_params::get_smem_footprint() { - size_t finalize_footprint = tree_aggregator_t::smem_finalize_footprint( + size_t finalize_footprint = tree_aggregator_t::smem_finalize_footprint( cols_shmem_size(), num_classes, log2_threads_per_tree, predict_proba); size_t accumulate_footprint = - tree_aggregator_t::smem_accumulate_footprint(num_classes) + + tree_aggregator_t::smem_accumulate_footprint(num_classes) + cols_shmem_size(); return std::max(accumulate_footprint, finalize_footprint); } @@ -893,15 +898,17 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } -template void infer>>(storage> forest, - predict_params params, - cudaStream_t stream); -template void infer>>(storage> forest, - predict_params params, - cudaStream_t stream); -template void infer>(storage forest, - predict_params params, - cudaStream_t stream); +struct instantiate_infer { + template + void operator()(fil_node_t) + { + infer>(storage forest, + predict_params params, + cudaStream_t stream); + } +}; + +template void instantiate_for_all_node_types(instantiate_infer); } // namespace fil } // namespace ML diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 0295cde8d1..1d1c3a516a 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -394,7 +394,7 @@ struct categorical_sets { // set count is due to tree_idx + node_within_tree_idx are both ints, hence uint32_t result template - __host__ __device__ __forceinline__ int category_matches(node_t node, float category) const + __host__ __device__ __forceinline__ int category_matches(node_t node, typename node_t::F category) const { // standard boolean packing. This layout has better ILP // node.set() is global across feature IDs and is an offset (as opposed @@ -410,7 +410,7 @@ struct categorical_sets { FIL will reject a model where an integer within [0, fid_num_cats] cannot be represented precisely as a 32-bit float. 
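For instance, 2^24 + 1 = 16,777,217 is the smallest positive integer that float32 cannot represent exactly, so category counts must stay below that scale.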
*/ - return category < fid_num_cats[node.fid()] && category >= 0.0f && + return static_cast<float>(category) < fid_num_cats[node.fid()] && category >= 0.0f && fetch_bit(bits + node.set(), static_cast<uint32_t>(static_cast<int>(category))); } static int sizeof_mask_from_num_cats(int num_cats) @@ -561,11 +561,11 @@ struct cat_sets_device_owner { * @param params pointer to parameters used to initialize the forest * @param vector_leaf optional vector leaves */ -template <typename fil_node_t> +template <typename fil_node_t, typename F> void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector<float>& vector_leaf, + const std::vector<F>& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params);
diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 56536802f5..12e03fd102 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -655,6 +655,11 @@ void convert(const raft::handle_t& handle, tl2fil.init_forest(handle, pforest); } +template <typename real_t> constexpr bool type_supported() { + // not using std::is_floating_point because we did not instantiate fp16-based nodes/trees/forests + return std::is_same<real_t, float>() || std::is_same<real_t, double>(); +} + template <typename threshold_t, typename leaf_t> void from_treelite(const raft::handle_t& handle, forest_t* pforest, @@ -662,9 +667,9 @@ void from_treelite(const raft::handle_t& handle, const treelite_params_t* tl_params) { // Invariants on threshold and leaf types - static_assert(std::is_same<threshold_t, float>::value || std::is_same<threshold_t, double>::value, + static_assert(type_supported<threshold_t>(), "Model must contain float32 or float64 thresholds for splits"); - ASSERT((std::is_same<leaf_t, float>::value || std::is_same<leaf_t, double>::value), + ASSERT(type_supported<leaf_t>(), "Models with integer leaf output are not yet supported"); // Display appropriate warnings when float64 values are being casted into // float32, as FIL only supports inferencing with float32 for the time being @@ -674,6 +679,8 @@ void from_treelite(const raft::handle_t& handle, "doesn't support inferencing models with float64 values. " "This may lead to predictions with reduced accuracy."); } + // same as std::common_type: float+double=double, float+int64_t=float + typedef decltype(threshold_t{} + leaf_t{}) F; storage_type_t storage_type = tl_params->storage_type; // build dense trees by default
From f6efb88f38a2f56c03f3c9af8154d629b3cf67df Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Tue, 15 Feb 2022 21:46:41 -0800 Subject: [PATCH 05/45] fixed extra/missing instantiations --- cpp/src/fil/common.cuh | 10 +++++---- cpp/src/fil/fil.cu | 26 ++++++++++++---------- cpp/src/fil/internal.cuh | 4 ++-- cpp/test/sg/fil_child_index_test.cu | 19 +++++++++------- cpp/test/sg/fil_test.cu | 34 ++++++++++++++--------------- 5 files changed, 51 insertions(+), 42 deletions(-)
diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index d5ec24b2af..6c89e1e01f 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -61,8 +61,9 @@ struct tree<dense_node<F>> : tree_base { }; /** partial specialization of storage.
Stores the forest on GPU as a collection of dense nodes */ -template -struct storage> : storage_base { +template +struct storage> : storage_base { + using F = F_; using node_t = dense_node; __host__ __device__ storage(categorical_sets cat_sets, F* vector_leaf, @@ -101,8 +102,9 @@ struct tree : tree_base { }; /** storage stores the forest on GPU as a collection of sparse nodes */ -template -struct storage : storage_base { +template +struct storage : storage_base { + using node_t = node_t_; using F = typename node_t::F; int* trees_ = nullptr; node_t* nodes_ = nullptr; diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index e534d16733..152198fb62 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -342,15 +342,17 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - auto kernel = infer_k; - // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because - // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick. - RAFT_CUDA_TRY( - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm)); + if constexpr (std::is_same()) { + auto kernel = infer_k; + // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because + // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick. + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm)); + } } }; @@ -428,7 +430,8 @@ struct dense_forest> : forest { num_trees_, algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, algo_ == algo_t::NAIVE ? 1 : num_trees_); - fil::infer(forest, params, stream); + if constexpr (std::is_same()) // to remove in next PR + fil::infer(forest, params, stream); } virtual void free(const raft::handle_t& h) override @@ -482,7 +485,8 @@ struct sparse_forest : forest { trees_.data(), nodes_.data(), num_trees_); - fil::infer(forest, params, stream); + if constexpr (std::is_same()) // to remove in next PR + fil::infer(forest, params, stream); } void free(const raft::handle_t& h) override diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 0295cde8d1..b382ade1a3 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -561,11 +561,11 @@ struct cat_sets_device_owner { * @param params pointer to parameters used to initialize the forest * @param vector_leaf optional vector leaves */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params); diff --git a/cpp/test/sg/fil_child_index_test.cu b/cpp/test/sg/fil_child_index_test.cu index cdc87603be..8b1af7d1b2 100644 --- a/cpp/test/sg/fil_child_index_test.cu +++ b/cpp/test/sg/fil_child_index_test.cu @@ -40,24 +40,27 @@ struct proto_inner_node { int set = 0; // which bit set represents the matching category list float thresh = 0.0f; // threshold, see base_node::thresh int left = 1; // left child idx, see sparse_node*::left_index() - val_t split() + val_t split() { - val_t split; + val_t split; if (is_categorical) split.idx = set; else split.f = thresh; return split; } - operator sparse_node16() + operator sparse_node16() { - return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); + return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); } operator sparse_node8() { return 
sparse_node8({}, split(), fid, def_left, false, is_categorical, left); } - operator dense_node() { return dense_node({}, split(), fid, def_left, false, is_categorical); } + operator dense_node() + { + return dense_node({}, split(), fid, def_left, false, is_categorical); + } }; std::ostream& operator<<(std::ostream& os, const proto_inner_node& node) @@ -138,7 +141,7 @@ class ChildIndexTest : public testing::TestWithParam { { ChildIndexTestParams param = GetParam(); tree_base tree{param.cso.accessor()}; - if (!std::is_same::value) { + if (!std::is_same>::value) { // test that the logic uses node.left instead of parent_node_idx param.node.left = param.parent_node_idx * 2 + 1; param.parent_node_idx = INT_MIN; @@ -153,8 +156,8 @@ class ChildIndexTest : public testing::TestWithParam { } }; -typedef ChildIndexTest ChildIndexTestDense; -typedef ChildIndexTest ChildIndexTestSparse16; +typedef ChildIndexTest> ChildIndexTestDense; +typedef ChildIndexTest> ChildIndexTestSparse16; typedef ChildIndexTest ChildIndexTestSparse8; /* for dense nodes, left (false) == parent * 2 + 1, right (true) == parent * 2 + 2 diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 9627e79e6a..74ab0c4d00 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -378,12 +378,12 @@ class BaseFilTest : public testing::TestWithParam { // initialize nodes nodes.resize(num_nodes); for (size_t i = 0; i < num_nodes; ++i) { - fil::val_t w; + fil::val_t w; switch (ps.leaf_algo) { case fil::leaf_algo_t::CATEGORICAL_LEAF: w.idx = weights_h[i]; break; case fil::leaf_algo_t::FLOAT_UNARY_BINARY: case fil::leaf_algo_t::GROVE_PER_CLASS: - // not relying on fil::val_t internals + // not relying on fil::val_t internals // merely that we copied floats into weights_h earlier std::memcpy(&w.f, &weights_h[i], sizeof w.f); break; @@ -392,13 +392,13 @@ class BaseFilTest : public testing::TestWithParam { } // make sure nodes are categorical only when their feature ID is categorical bool is_categorical = is_categoricals_h[i] == 1.0f; - val_t split; + val_t split; if (is_categorical) split.idx = node_cat_set[i]; else split.f = thresholds_h[i]; nodes[i] = - fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical); + fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical); } // clean up @@ -591,13 +591,13 @@ class BaseFilTest : public testing::TestWithParam { stream)); } - fil::val_t infer_one_tree(fil::dense_node* root, float* data, const tree_base& tree) + fil::val_t infer_one_tree(fil::dense_node* root, float* data, const tree_base& tree) { int curr = 0; - fil::val_t output{.f = 0.0f}; + fil::val_t output{.f = 0.0f}; for (;;) { - const fil::dense_node& node = root[curr]; - if (node.is_leaf()) return node.template output(); + const fil::dense_node& node = root[curr]; + if (node.is_leaf()) return node.template output>(); float val = data[node.fid()]; curr = tree.child_index(node, curr, val); } @@ -625,7 +625,7 @@ class BaseFilTest : public testing::TestWithParam { std::vector want_proba_h; // forest data - std::vector nodes; + std::vector> nodes; std::vector vector_leaf; cat_sets_owner cat_sets_h; rmm::device_uvector fids_d = rmm::device_uvector(0, cudaStream_t()); @@ -635,16 +635,16 @@ class BaseFilTest : public testing::TestWithParam { template class BasePredictFilTest : public BaseFilTest { protected: - void dense2sparse_node(const fil::dense_node* dense_root, + void dense2sparse_node(const fil::dense_node* dense_root, int i_dense, int i_sparse_root, int i_sparse) { - 
const fil::dense_node& node = dense_root[i_dense]; + const fil::dense_node& node = dense_root[i_dense]; if (node.is_leaf()) { // leaf sparse node - sparse_nodes[i_sparse] = - fil_node_t(node.output(), {}, node.fid(), node.def_left(), node.is_leaf(), false, 0); + sparse_nodes[i_sparse] = fil_node_t( + node.output>(), {}, node.fid(), node.def_left(), node.is_leaf(), false, 0); return; } // inner sparse node @@ -663,7 +663,7 @@ class BasePredictFilTest : public BaseFilTest { dense2sparse_node(dense_root, 2 * i_dense + 2, i_sparse_root, left_index + 1); } - void dense2sparse_tree(const fil::dense_node* dense_root) + void dense2sparse_tree(const fil::dense_node* dense_root) { int i_sparse_root = sparse_nodes.size(); sparse_nodes.push_back(fil_node_t()); @@ -719,8 +719,8 @@ class BasePredictFilTest : public BaseFilTest { std::vector trees; }; -typedef BasePredictFilTest PredictDenseFilTest; -typedef BasePredictFilTest PredictSparse16FilTest; +typedef BasePredictFilTest> PredictDenseFilTest; +typedef BasePredictFilTest> PredictSparse16FilTest; typedef BasePredictFilTest PredictSparse8FilTest; class TreeliteFilTest : public BaseFilTest { @@ -732,7 +732,7 @@ class TreeliteFilTest : public BaseFilTest { { int key = (*pkey)++; builder->CreateNode(key); - const fil::dense_node& dense_node = nodes[node]; + const fil::dense_node& dense_node = nodes[node]; std::vector left_categories; if (dense_node.is_leaf()) { switch (ps.leaf_algo) { From 1bebcca53609cfab7421730e7f22047c65ce47d0 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Wed, 16 Feb 2022 11:14:53 -0800 Subject: [PATCH 06/45] style --- cpp/src/fil/fil.cu | 14 +++++++------- cpp/src/fil/infer.cu | 24 +++++++++--------------- cpp/src/fil/internal.cuh | 3 ++- cpp/src/fil/treelite_import.cu | 7 ++++--- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index ff50bd37ca..1a738b70a7 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -42,7 +42,7 @@ __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } averaging (multiplying by inv_num_trees), adding global_bias (always done), sigmoid and applying threshold. in case of complement_proba, fills in the complement probability */ -template +template __global__ void transform_k(F* preds, size_t n, output_t output, @@ -174,7 +174,7 @@ struct forest { virtual void infer(predict_params params, cudaStream_t stream) = 0; - template + template void predict( const raft::handle_t& h, F* preds, const F* data, size_t num_rows, bool predict_proba) { @@ -255,7 +255,7 @@ struct forest { // Simulating treelite order, which cancels out bias. // If non-proba prediction used, it still will not matter // for the same reason softmax will not. - F global_bias = (ot & output_t::SOFTMAX) != 0 ? F(0.0) : global_bias_; + F global_bias = (ot & output_t::SOFTMAX) != 0 ? 
F(0.0) : global_bias_; bool complement_proba = false, do_transform; if (predict_proba) { @@ -323,10 +323,10 @@ struct forest { virtual ~forest() {} - int num_trees_ = 0; - int depth_ = 0; - algo_t algo_ = algo_t::NAIVE; - output_t output_ = output_t::RAW; + int num_trees_ = 0; + int depth_ = 0; + algo_t algo_ = algo_t::NAIVE; + output_t output_ = output_t::RAW; double threshold_ = 0.5; double global_bias_ = 0; shmem_size_params class_ssp_, proba_ssp_;
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 1d9aaccb80..8930834fbb 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -96,7 +96,7 @@ struct vec { } }; -template<typename F> +template <typename F> struct best_margin_label : cub::KeyValuePair<int, F> { __host__ __device__ best_margin_label(cub::KeyValuePair<int, F> pair) : cub::KeyValuePair<int, F>(pair) @@ -155,10 +155,8 @@ __device__ __forceinline__ vec<NITEMS, output_type> tree_leaf_output(tree_type t } template <int NITEMS, typename output_type, typename tree_type> -__device__ __forceinline__ vec<NITEMS, output_type> infer_one_tree(tree_type tree, - const typename tree_type::F* input, - int cols, - int n_rows) +__device__ __forceinline__ vec<NITEMS, output_type> infer_one_tree( + tree_type tree, const typename tree_type::F* input, int cols, int n_rows) { // find the leaf nodes for each row int curr[NITEMS]; @@ -759,12 +757,9 @@ struct tree_aggregator_t<F, NITEMS, GROVE_PER_CLASS_MANY_CLASSES> { } }; -template <typename F> -__device__ INLINE_CONFIG void load_data(F* sdata, - const F* block_input, - predict_params params, - int rows_per_block, - int block_num_rows) +template <typename F> +__device__ INLINE_CONFIG void load_data( + F* sdata, const F* block_input, predict_params params, int rows_per_block, int block_num_rows) { int num_cols = params.num_cols; int sdata_stride = params.sdata_stride(); @@ -796,7 +791,7 @@ template <typename fil_node_t> void operator()(fil_node_t) { - infer<storage<fil_node_t>>(storage<fil_node_t>{}, - predict_params{}, - cudaStream_t{}); + infer<storage<fil_node_t>>(storage<fil_node_t>{}, predict_params{}, cudaStream_t{}); } };
diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 1d1c3a516a..16c450899b 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -394,7 +394,8 @@ struct categorical_sets { // set count is due to tree_idx + node_within_tree_idx are both ints, hence uint32_t result template <typename node_t> - __host__ __device__ __forceinline__ int category_matches(node_t node, typename node_t::F category) const + __host__ __device__ __forceinline__ int category_matches(node_t node, + typename node_t::F category) const { // standard boolean packing. This layout has better ILP // node.set() is global across feature IDs and is an offset (as opposed
diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 12e03fd102..ae941452e6 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -655,7 +655,9 @@ void convert(const raft::handle_t& handle, tl2fil.init_forest(handle, pforest); } -template <typename real_t> constexpr bool type_supported() { +template <typename real_t> +constexpr bool type_supported() +{ // not using std::is_floating_point because we did not instantiate fp16-based nodes/trees/forests return std::is_same<real_t, float>() || std::is_same<real_t, double>(); } @@ -669,8 +671,7 @@ void from_treelite(const raft::handle_t& handle, // Invariants on threshold and leaf types static_assert(type_supported<threshold_t>(), "Model must contain float32 or float64 thresholds for splits"); - ASSERT(type_supported<leaf_t>(), - "Models with integer leaf output are not yet supported"); + ASSERT(type_supported<leaf_t>(), "Models with integer leaf output are not yet supported"); // Display appropriate warnings when float64 values are being casted into // float32, as FIL only supports inferencing with float32 for the time being if (std::is_same<threshold_t, double>::value || std::is_same<leaf_t, double>::value) {
From aab158b231fdc221addaf80bdf4e07ad80c8a7f2 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Wed, 16 Feb 2022 11:16:34 -0800 Subject: [PATCH 07/45] style --- cpp/test/sg/fil_child_index_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/test/sg/fil_child_index_test.cu b/cpp/test/sg/fil_child_index_test.cu index 8b1af7d1b2..75eca6cba2 100644 --- a/cpp/test/sg/fil_child_index_test.cu +++ b/cpp/test/sg/fil_child_index_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
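Aside on the type deduction used in from_treelite() in patch 04 above: the `typedef decltype(threshold_t{} + leaf_t{}) F;` line relies on the usual arithmetic conversions, which is why its comment likens it to std::common_type. Below is a minimal, self-contained sketch (illustrative only, not part of any patch in this series; the alias name inference_real_t is made up for the example) showing that any double operand widens the result to double, while a float threshold paired with an integer leaf type stays float:

#include <cstdint>
#include <type_traits>

// inference_real_t<T, L> is the type of T{} + L{} under the usual
// arithmetic conversions, mirroring the typedef in from_treelite()
template <typename threshold_t, typename leaf_t>
using inference_real_t = decltype(threshold_t{} + leaf_t{});

static_assert(std::is_same_v<inference_real_t<float, float>, float>);
static_assert(std::is_same_v<inference_real_t<float, double>, double>);
static_assert(std::is_same_v<inference_real_t<double, double>, double>);
// an integer leaf type does not widen a float threshold to double
static_assert(std::is_same_v<inference_real_t<float, std::int64_t>, float>);

int main() { return 0; }
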
From 1236c52bf9356cde5d23941261dea4daed7f869e Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 00:28:51 -0800 Subject: [PATCH 08/45] removed ML::fil::init templatization, added KeyValuePair templatization --- cpp/CMakeLists.txt | 2 -- cpp/src/fil/fil.cu | 20 ++++---------------- cpp/src/fil/infer.cu | 13 +++++++------ cpp/src/fil/internal.cuh | 19 ++++--------------- 4 files changed, 15 insertions(+), 39 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4cc7a593c3..273c0da76e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -267,8 +267,6 @@ if(BUILD_CUML_CPP_LIBRARY) src/explainer/kernel_shap.cu src/explainer/permutation_shap.cu src/explainer/tree_shap.cu - src/fil/fil.cu - src/fil/infer.cu src/glm/glm.cu src/genetic/genetic.cu src/genetic/program.cu diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 152198fb62..f4f13998b2 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -579,11 +579,11 @@ void check_params(const forest_params_t* params, bool dense) /** initializes a forest of any type * When fil_node_t == dense_node, const int* trees is ignored */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params) @@ -591,23 +591,11 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - f->init(h, cat_sets, vector_leaf, trees, nodes, params); + if constexpr (std::is_same()) // to remove in next PR + f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } -struct instantiate_forest_init { - template - void operator()(fil_node_t) - { - if constexpr (std::is_same()) - init({}, {}, {}, std::vector(), {}, {}, {}); - else - init({}, {}, {}, std::vector(), {}, {}, {}); - } -}; - -template void instantiate_for_all_node_types(instantiate_forest_init); - void free(const raft::handle_t& h, forest_t f) { f->free(h); diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 52d435b1e1..454734b135 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -41,7 +41,7 @@ namespace ML { namespace fil { -// vec wraps float[N] for cub::BlockReduce +// vec wraps float[N], int[N] or double[N] for cub::BlockReduce template struct vec; @@ -96,13 +96,14 @@ struct vec { } }; -struct best_margin_label : cub::KeyValuePair { - __host__ __device__ best_margin_label(cub::KeyValuePair pair) - : cub::KeyValuePair(pair) +template +struct best_margin_label : cub::KeyValuePair { + __host__ __device__ best_margin_label(cub::KeyValuePair pair) + : cub::KeyValuePair(pair) { } - __host__ __device__ best_margin_label(int c = 0, float f = -INFINITY) - : cub::KeyValuePair({c, f}) + __host__ __device__ best_margin_label(int c = 0, F f = -INFINITY) + : cub::KeyValuePair({c, f}) { } }; diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index b382ade1a3..48b87b81d8 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -101,7 +101,7 @@ union val_t { /** base_node contains common implementation details for dense and sparse nodes */ template -struct base_node { +struct alignas(std::is_same() ? 8 : 16) base_node { using F = F_; // floating-point type /** val, for parent nodes, is a threshold or category list offset. 
For leaf nodes, it is the tree prediction (see see leaf_output_t::T) */ @@ -152,7 +152,7 @@ struct base_node { /** dense_node is a single node of a dense forest */ template -struct alignas(8) dense_node : base_node { +struct dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite dense_node(val_t output, @@ -218,17 +218,6 @@ struct alignas(8) sparse_node8 : base_node { __host__ __device__ int left(int curr) const { return left_index(); } }; -/// pass a functor with a templated operator()(fil_node_t), i.e. accepting one node as parameter -template -void instantiate_for_all_node_types(Stuff stuff) -{ - stuff(dense_node{}); - stuff(dense_node{}); - stuff(sparse_node16{}); - stuff(sparse_node16{}); - stuff(sparse_node8{}); -} - template struct storage; @@ -561,11 +550,11 @@ struct cat_sets_device_owner { * @param params pointer to parameters used to initialize the forest * @param vector_leaf optional vector leaves */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params); From cc946cdaa2b6b5096603d976593a5adf68c5d708 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 00:29:37 -0800 Subject: [PATCH 09/45] style --- cpp/src/fil/infer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 454734b135..a05029390a 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __noinline__ +#define INLINE_CONFIG __forceinline__ namespace ML { namespace fil { From 91ecb205a532e19d50a01bfb4682e3e58c883a02 Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 00:30:59 -0800 Subject: [PATCH 10/45] add the old instantiations back --- cpp/src/fil/fil.cu | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index f4f13998b2..1c991a5960 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -596,6 +596,31 @@ void init(const raft::handle_t& h, *pf = f; } +// explicit instantiations for init_sparse() +template void init(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node16* nodes, + const forest_params_t* params); + +template void init(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node8* nodes, + const forest_params_t* params); + +template void init(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); + void free(const raft::handle_t& h, forest_t f) { f->free(h); From 67f900e2aa76fbf93ccfa45807929c501aebf2eb Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 15:14:19 -0800 Subject: [PATCH 11/45] fixed --- cpp/src/fil/fil.cu | 8 ++++---- cpp/src/fil/infer.cu | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 1c991a5960..81b848f1ed 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -597,12 +597,12 @@ void init(const raft::handle_t& h, } // explicit instantiations for init_sparse() 
-template void init(const raft::handle_t& h, +template void init>(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, const std::vector& vector_leaf, const int* trees, - const sparse_node16* nodes, + const sparse_node16* nodes, const forest_params_t* params); template void init(const raft::handle_t& h, @@ -613,12 +613,12 @@ template void init(const raft::handle_t& h, const sparse_node8* nodes, const forest_params_t* params); -template void init(const raft::handle_t& h, +template void init>(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, const std::vector& vector_leaf, const int* trees, - const dense_node* nodes, + const dense_node* nodes, const forest_params_t* params); void free(const raft::handle_t& h, forest_t f) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index a05029390a..8b34ea0447 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -109,21 +109,21 @@ struct best_margin_label : cub::KeyValuePair { }; template -__device__ __forceinline__ vec to_vec(int c, vec margin) +__device__ __forceinline__ vec> to_vec(int c, vec margin) { - vec ret; + vec> ret; CUDA_PRAGMA_UNROLL for (int i = 0; i < NITEMS; ++i) - ret[i] = best_margin_label(c, margin[i]); + ret[i] = best_margin_label(c, margin[i]); return ret; } struct ArgMax { template - __host__ __device__ __forceinline__ vec operator()( - vec a, vec b) const + __host__ __device__ __forceinline__ vec> operator()( + vec> a, vec> b) const { - vec c; + vec> c; CUDA_PRAGMA_UNROLL for (int i = 0; i < NITEMS; i++) c[i] = cub::ArgMax()(a[i], b[i]); @@ -232,7 +232,7 @@ size_t block_reduce_footprint_host() template size_t block_reduce_best_class_footprint_host() { - return sizeof(typename cub::BlockReduce, + return sizeof(typename cub::BlockReduce>, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, @@ -356,7 +356,7 @@ __device__ __forceinline__ void write_best_class( { // reduce per-class candidate margins to one best class candidate // per thread (for each of the NITEMS rows) - auto best = vecNITEMS, best_margin_label>(); + auto best = vecNITEMS, best_margin_label>(); for (int c = threadIdx.x; c < end - begin; c += blockDim.x) best = vectorized(cub::ArgMax())(best, to_vec(c, begin[c])); // [begin, end) may overlap tmp_storage From c3c4deb0d2c774197ea9e72ed11a1cebcc6e379a Mon Sep 17 00:00:00 2001 From: Levs Dolgovs Date: Fri, 18 Feb 2022 15:31:52 -0800 Subject: [PATCH 12/45] style --- cpp/src/fil/fil.cu | 24 ++++++++++++------------ cpp/src/fil/infer.cu | 3 ++- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 81b848f1ed..90e2aa18c3 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -598,12 +598,12 @@ void init(const raft::handle_t& h, // explicit instantiations for init_sparse() template void init>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node16* nodes, - const forest_params_t* params); + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node16* nodes, + const forest_params_t* params); template void init(const raft::handle_t& h, forest_t* pf, @@ -614,12 +614,12 @@ template void init(const raft::handle_t& h, const forest_params_t* params); template void init>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const dense_node* nodes, - const forest_params_t* params); 
+ forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); void free(const raft::handle_t& h, forest_t f) { diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 8b34ea0447..cb8fdaeaa0 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -109,7 +109,8 @@ struct best_margin_label : cub::KeyValuePair { }; template -__device__ __forceinline__ vec> to_vec(int c, vec margin) +__device__ __forceinline__ vec> to_vec(int c, + vec margin) { vec> ret; CUDA_PRAGMA_UNROLL From 68e7f652332dd7bafb05aef0753e24603b1b94f3 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Wed, 9 Mar 2022 19:40:37 +0100 Subject: [PATCH 13/45] base_node::output() now compiles. --- cpp/src/fil/internal.cuh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 48b87b81d8..1b1d0c5c5f 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -121,13 +121,17 @@ struct alignas(std::is_same() ? 8 : 16) base_node { template __host__ __device__ o_t output() const { - if constexpr (std::is_same()) { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v>, "invalid o_t type parameter in node.output()"); + if constexpr (std::is_same_v) { return val.idx; - } else if constexpr (std::is_same()) { + } else if constexpr (std::is_same_v) { return val.f; - } else { + } else if constexpr (std::is_same_v>) { return val; } + // control flow should not reach here + return o_t(); } __host__ __device__ int set() const { return val.idx; } __host__ __device__ F thresh() const { return val.f; } From b31069d5b94b451b0afc76f894c7a78fb2961e64 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 01:45:59 +0100 Subject: [PATCH 14/45] Fixed style. --- cpp/src/fil/internal.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 1b1d0c5c5f..9bc5f8a333 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -121,8 +121,9 @@ struct alignas(std::is_same() ? 8 : 16) base_node { template __host__ __device__ o_t output() const { - static_assert(std::is_same_v || std::is_same_v || - std::is_same_v>, "invalid o_t type parameter in node.output()"); + static_assert( + std::is_same_v || std::is_same_v || std::is_same_v>, + "invalid o_t type parameter in node.output()"); if constexpr (std::is_same_v) { return val.idx; } else if constexpr (std::is_same_v) { From c250653e1faefd4e5987b49799d8e952e06e2bab Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 02:35:23 +0100 Subject: [PATCH 15/45] F -> real_t. 
--- cpp/src/fil/common.cuh | 39 +++++++++-------- cpp/src/fil/fil.cu | 32 +++++++------- cpp/src/fil/infer.cu | 12 +++--- cpp/src/fil/internal.cuh | 90 +++++++++++++++++++++------------------- 4 files changed, 90 insertions(+), 83 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 6c89e1e01f..2696d9dc28 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -38,40 +38,40 @@ __host__ __device__ __forceinline__ int forest_num_nodes(int num_trees, int dept return num_trees * tree_num_nodes(depth); } -template +template struct storage_base { categorical_sets sets_; - F* vector_leaf_; + real_t* vector_leaf_; bool cats_present() const { return sets_.cats_present(); } }; /** represents a dense tree */ -template -struct tree> : tree_base { - __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) +template +struct tree> : tree_base { + __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { } - __host__ __device__ const dense_node& operator[](int i) const + __host__ __device__ const dense_node& operator[](int i) const { return nodes_[i * node_pitch_]; } - dense_node* nodes_ = nullptr; - int node_pitch_ = 0; + dense_node* nodes_ = nullptr; + int node_pitch_ = 0; }; /** partial specialization of storage. Stores the forest on GPU as a collection of dense nodes */ -template -struct storage> : storage_base { - using F = F_; - using node_t = dense_node; +template +struct storage> : storage_base { + using real_t = real_t_; + using node_t = dense_node; __host__ __device__ storage(categorical_sets cat_sets, - F* vector_leaf, + real_t* vector_leaf, node_t* nodes, int num_trees, int tree_stride, int node_pitch) - : storage_base{cat_sets, vector_leaf}, + : storage_base{cat_sets, vector_leaf}, nodes_(nodes), num_trees_(num_trees), tree_stride_(tree_stride), @@ -103,15 +103,18 @@ struct tree : tree_base { /** storage stores the forest on GPU as a collection of sparse nodes */ template -struct storage : storage_base { +struct storage : storage_base { using node_t = node_t_; - using F = typename node_t::F; + using real_t = typename node_t::real_t; int* trees_ = nullptr; node_t* nodes_ = nullptr; int num_trees_ = 0; __host__ __device__ - storage(categorical_sets cat_sets, F* vector_leaf, int* trees, node_t* nodes, int num_trees) - : storage_base{cat_sets, vector_leaf}, trees_(trees), nodes_(nodes), num_trees_(num_trees) + storage(categorical_sets cat_sets, real_t* vector_leaf, int* trees, node_t* nodes, int num_trees) + : storage_base{cat_sets, vector_leaf}, + trees_(trees), + nodes_(nodes), + num_trees_(num_trees) { } __host__ __device__ int num_trees() const { return num_trees_; } diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 90e2aa18c3..2738afc9a6 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -128,10 +128,10 @@ struct forest { fixed_block_count_ = blocks_per_sm * sm_count; } - template + template void init_common(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const forest_params_t* params) { depth_ = params->depth; @@ -146,7 +146,7 @@ struct forest { proba_ssp_.num_cols = params->num_cols; proba_ssp_.num_classes = params->num_classes; proba_ssp_.cats_present = cat_sets.cats_present(); - proba_ssp_.sizeof_fp_vars = sizeof(F); + proba_ssp_.sizeof_fp_vars = sizeof(real_t); class_ssp_ = proba_ssp_; int device = h.get_device(); @@ -157,11 +157,11 
@@ struct forest { // vector leaf if (!vector_leaf.empty()) { - vector_leaf_.resize(vector_leaf.size() * sizeof(F), stream); + vector_leaf_.resize(vector_leaf.size() * sizeof(real_t), stream); RAFT_CUDA_TRY(cudaMemcpyAsync(vector_leaf_.data(), vector_leaf.data(), - vector_leaf.size() * sizeof(F), + vector_leaf.size() * sizeof(real_t), cudaMemcpyHostToDevice, stream)); } @@ -342,7 +342,7 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - if constexpr (std::is_same()) { + if constexpr (std::is_same()) { auto kernel = infer_k { } }; -template -struct dense_forest> : forest { - using node_t = dense_node; +template +struct dense_forest> : forest { + using node_t = dense_node; dense_forest(const raft::handle_t& h) : forest(h), nodes_(0, h.get_stream()) {} void transform_trees(const node_t* nodes) @@ -391,7 +391,7 @@ struct dense_forest> : forest { /// sparse_forest::init() void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const node_t* nodes, const forest_params_t* params) @@ -425,12 +425,12 @@ struct dense_forest> : forest { virtual void infer(predict_params params, cudaStream_t stream) override { storage forest(cat_sets_.accessor(), - reinterpret_cast(vector_leaf_.data()), + reinterpret_cast(vector_leaf_.data()), nodes_.data(), num_trees_, algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, algo_ == algo_t::NAIVE ? 1 : num_trees_); - if constexpr (std::is_same()) // to remove in next PR + if constexpr (std::is_same()) // to remove in next PR fil::infer(forest, params, stream); } @@ -453,7 +453,7 @@ struct sparse_forest : forest { void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const node_t* nodes, const forest_params_t* params) @@ -481,11 +481,11 @@ struct sparse_forest : forest { virtual void infer(predict_params params, cudaStream_t stream) override { storage forest(cat_sets_.accessor(), - reinterpret_cast(vector_leaf_.data()), + reinterpret_cast(vector_leaf_.data()), trees_.data(), nodes_.data(), num_trees_); - if constexpr (std::is_same()) // to remove in next PR + if constexpr (std::is_same()) // to remove in next PR fil::infer(forest, params, stream); } @@ -591,7 +591,7 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - if constexpr (std::is_same()) // to remove in next PR + if constexpr (std::is_same()) // to remove in next PR f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index cb8fdaeaa0..f03543ba11 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -96,14 +96,14 @@ struct vec { } }; -template -struct best_margin_label : cub::KeyValuePair { - __host__ __device__ best_margin_label(cub::KeyValuePair pair) - : cub::KeyValuePair(pair) +template +struct best_margin_label : cub::KeyValuePair { + __host__ __device__ best_margin_label(cub::KeyValuePair pair) + : cub::KeyValuePair(pair) { } - __host__ __device__ best_margin_label(int c = 0, F f = -INFINITY) - : cub::KeyValuePair({c, f}) + __host__ __device__ best_margin_label(int c = 0, real_t f = -INFINITY) + : cub::KeyValuePair({c, f}) { } }; diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 9bc5f8a333..8d240bc24a 100644 --- a/cpp/src/fil/internal.cuh 
+++ b/cpp/src/fil/internal.cuh @@ -90,22 +90,22 @@ enum output_t { }; /** val_t is the payload within a FIL leaf */ -template +template union val_t { /** floating-point threshold value for parent node or output value (e.g. class probability or regression summand) for leaf node */ - F f = NAN; + real_t f = NAN; /** class label, leaf vector index or categorical node set offset */ int idx; }; /** base_node contains common implementation details for dense and sparse nodes */ -template -struct alignas(std::is_same() ? 8 : 16) base_node { - using F = F_; // floating-point type +template +struct alignas(std::is_same() ? 8 : 16) base_node { + using real_t = real_t_; // floating-point type /** val, for parent nodes, is a threshold or category list offset. For leaf nodes, it is the tree prediction (see see leaf_output_t::T) */ - val_t val; + val_t val; /** bits encode various information about the node, with the exact nature of this information depending on the node type; it includes e.g. whether the node is a leaf or inner node, and for inner nodes, additional information, @@ -122,28 +122,32 @@ struct alignas(std::is_same() ? 8 : 16) base_node { __host__ __device__ o_t output() const { static_assert( - std::is_same_v || std::is_same_v || std::is_same_v>, + std::is_same_v || std::is_same_v || std::is_same_v>, "invalid o_t type parameter in node.output()"); if constexpr (std::is_same_v) { return val.idx; - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { return val.f; - } else if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { return val; } // control flow should not reach here return o_t(); } __host__ __device__ int set() const { return val.idx; } - __host__ __device__ F thresh() const { return val.f; } - __host__ __device__ val_t split() const { return val; } + __host__ __device__ real_t thresh() const { return val.f; } + __host__ __device__ val_t split() const { return val; } __host__ __device__ int fid() const { return bits & FID_MASK; } __host__ __device__ bool def_left() const { return bits & DEF_LEFT_MASK; } __host__ __device__ bool is_leaf() const { return bits & IS_LEAF_MASK; } __host__ __device__ bool is_categorical() const { return bits & IS_CATEGORICAL_MASK; } __host__ __device__ base_node() : val{}, bits(0) {} - base_node( - val_t output, val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical) + base_node(val_t output, + val_t split, + int fid, + bool def_left, + bool is_leaf, + bool is_categorical) { RAFT_EXPECTS((fid & FID_MASK) == fid, "internal error: feature ID doesn't fit into base_node"); bits = (fid & FID_MASK) | (def_left ? DEF_LEFT_MASK : 0) | (is_leaf ? IS_LEAF_MASK : 0) | @@ -156,18 +160,18 @@ struct alignas(std::is_same() ? 
8 : 16) base_node { }; /** dense_node is a single node of a dense forest */ -template -struct dense_node : base_node { +template +struct dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite - dense_node(val_t output, - val_t split, + dense_node(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical, int left_index = -1) - : base_node(output, split, fid, def_left, is_leaf, is_categorical) + : base_node(output, split, fid, def_left, is_leaf, is_categorical) { } /** index of the left child, where curr is the index of the current node */ @@ -175,18 +179,18 @@ struct dense_node : base_node { }; /** sparse_node16 is a 16-byte node in a sparse forest */ -template -struct alignas(16) sparse_node16 : base_node { +template +struct alignas(16) sparse_node16 : base_node { int left_idx; __host__ __device__ sparse_node16() : left_idx(0) {} - sparse_node16(val_t output, - val_t split, + sparse_node16(val_t output, + val_t split, int fid, bool def_left, bool is_leaf, bool is_categorical, int left_index) - : base_node(output, split, fid, def_left, is_leaf, is_categorical), left_idx(left_index) + : base_node(output, split, fid, def_left, is_leaf, is_categorical), left_idx(left_index) { } __host__ __device__ int left_index() const { return left_idx; } @@ -233,20 +237,20 @@ struct sparse_forest; template struct node_traits { - using F = typename node_t::F; + using real_t = typename node_t::real_t; using storage = ML::fil::storage; using forest = sparse_forest; static const bool IS_DENSE = false; static const storage_type_t storage_type_enum = - std::is_same, node_t>() ? SPARSE : SPARSE8; + std::is_same, node_t>() ? SPARSE : SPARSE8; template static void check(const treelite::ModelImpl& model); }; -template -struct node_traits> { - using storage = storage>; - using forest = dense_forest>; +template +struct node_traits> { + using storage = storage>; + using forest = dense_forest>; static const bool IS_DENSE = true; static const storage_type_t storage_type_enum = DENSE; template @@ -294,27 +298,27 @@ enum leaf_algo_t { template struct tree; -template +template struct leaf_output_t { }; -template -struct leaf_output_t { - typedef F T; +template +struct leaf_output_t { + typedef real_t T; }; -template -struct leaf_output_t { +template +struct leaf_output_t { typedef int T; }; -template -struct leaf_output_t { - typedef F T; +template +struct leaf_output_t { + typedef real_t T; }; -template -struct leaf_output_t { - typedef F T; +template +struct leaf_output_t { + typedef real_t T; }; -template -struct leaf_output_t { +template +struct leaf_output_t { typedef int T; }; From 2f658b5ff09241ff0d13a6a06800dec7f6c4a374 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 20:27:34 +0100 Subject: [PATCH 16/45] Small fixes. 
--- cpp/src/fil/common.cuh | 8 ++++---- cpp/src/fil/fil.cu | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 2696d9dc28..99c0e7afe8 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -157,8 +157,8 @@ struct shmem_size_params { int block_dim_x = 0; /// shm_sz is the associated shared memory footprint int shm_sz = INT_MAX; - /// sizeof_fp_vars is the size in bytes of all floating-point variables during inference - std::size_t sizeof_fp_vars = 4; + /// sizeof_real is the size in bytes of all floating-point variables during inference + std::size_t sizeof_real = 4; __host__ __device__ int sdata_stride() { @@ -166,7 +166,7 @@ struct shmem_size_params { } __host__ __device__ int cols_shmem_size() { - return cols_in_shmem ? sizeof_fp_vars * sdata_stride() * n_items << log2_threads_per_tree : 0; + return cols_in_shmem ? sizeof_real * sdata_stride() * n_items << log2_threads_per_tree : 0; } template size_t get_smem_footprint(); @@ -180,7 +180,7 @@ struct predict_params : shmem_size_params { // number of outputs for the forest per each data row int num_outputs; - // Data parameters. + // Data parameters; preds and data are pointers to either float or double. void* preds; const void* data; // number of data rows (instances) to predict on diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 2738afc9a6..e5ef05eb33 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -30,6 +30,7 @@ creation and prediction (the main inference kernel is defined in infer.cu). */ #include // for expf #include // for size_t +#include // for uint8_t namespace ML { namespace fil { @@ -87,7 +88,7 @@ struct forest { we would have otherwise silently overflowed the index calculation due to short division. It would have failed cpp tests, but we might forget about this source of bugs, if not for the failing assert. */ - ASSERT(max_shm_ < int(proba_ssp_.sizeof_fp_vars) * std::numeric_limits::max(), + ASSERT(max_shm_ < int(proba_ssp_.sizeof_real) * std::numeric_limits::max(), "internal error: please use a larger type inside" " infer_k for column count"); } @@ -146,7 +147,7 @@ struct forest { proba_ssp_.num_cols = params->num_cols; proba_ssp_.num_classes = params->num_classes; proba_ssp_.cats_present = cat_sets.cats_present(); - proba_ssp_.sizeof_fp_vars = sizeof(real_t); + proba_ssp_.sizeof_real = sizeof(real_t); class_ssp_ = proba_ssp_; int device = h.get_device(); @@ -330,7 +331,7 @@ struct forest { int fixed_block_count_ = 0; int max_shm_ = 0; // Optionally used - rmm::device_uvector vector_leaf_; + rmm::device_uvector vector_leaf_; cat_sets_device_owner cat_sets_; }; From 6fb105b90922836547431fa2e2563f0c17ebf6cd Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 22:49:16 +0100 Subject: [PATCH 17/45] Updated alignment. 
--- cpp/src/fil/infer.cu | 2 +- cpp/src/fil/internal.cuh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index f03543ba11..25494ec10d 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __forceinline__ +#define INLINE_CONFIG __noinline__ namespace ML { namespace fil { diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 8d240bc24a..e90a5029bb 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -101,7 +101,7 @@ union val_t { /** base_node contains common implementation details for dense and sparse nodes */ template -struct alignas(std::is_same() ? 8 : 16) base_node { +struct alignas(2 * sizeof(real_t_)) base_node { using real_t = real_t_; // floating-point type /** val, for parent nodes, is a threshold or category list offset. For leaf nodes, it is the tree prediction (see see leaf_output_t::T) */ @@ -161,7 +161,7 @@ struct alignas(std::is_same() ? 8 : 16) base_node { /** dense_node is a single node of a dense forest */ template -struct dense_node : base_node { +struct alignas(2 * sizeof(real_t)) dense_node : base_node { dense_node() = default; /// ignoring left_index, this is useful to unify import from treelite dense_node(val_t output, From f1a10bee38f9d1caa95b1600845db9620912ea2c Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 23:03:01 +0100 Subject: [PATCH 18/45] static_assert(real_t == float) in a number of places. --- cpp/src/fil/fil.cu | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index e5ef05eb33..f1da7e79a4 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -343,17 +343,17 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - if constexpr (std::is_same()) { - auto kernel = infer_k; - // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because - // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick. - RAFT_CUDA_TRY( - cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm)); - } + static_assert(std::is_same_v, + "real_t must be float; to be removed in the following pull requests"); + auto kernel = infer_k; + // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because + // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick. + RAFT_CUDA_TRY( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm)); } }; @@ -431,8 +431,9 @@ struct dense_forest> : forest { num_trees_, algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, algo_ == algo_t::NAIVE ? 
1 : num_trees_); - if constexpr (std::is_same()) // to remove in next PR - fil::infer(forest, params, stream); + static_assert(std::is_same_v, + "real_t must be float; to be removed in the following pull requests"); + fil::infer(forest, params, stream); } virtual void free(const raft::handle_t& h) override @@ -486,8 +487,9 @@ struct sparse_forest : forest { trees_.data(), nodes_.data(), num_trees_); - if constexpr (std::is_same()) // to remove in next PR - fil::infer(forest, params, stream); + static_assert(std::is_same_v, + "real_t must be float; to be removed in the following pull requests"); + fil::infer(forest, params, stream); } void free(const raft::handle_t& h) override @@ -592,8 +594,9 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - if constexpr (std::is_same()) // to remove in next PR - f->init(h, cat_sets, vector_leaf, trees, nodes, params); + static_assert(std::is_same_v, + "real_t must be float; to be removed in the following pull requests"); + f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } From ce3624eb6c2f0a8e311f468f63617ea79eda5530 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 10 Mar 2022 23:10:18 +0100 Subject: [PATCH 19/45] noinline -> forceinline. --- cpp/src/fil/infer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 25494ec10d..f03543ba11 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __noinline__ +#define INLINE_CONFIG __forceinline__ namespace ML { namespace fil { From 63fadd1d67a9da2b7a52cd7f1e2263fc41ffc3d4 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Fri, 11 Mar 2022 23:21:21 +0100 Subject: [PATCH 20/45] Updated comment. --- cpp/src/fil/fil.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index f1da7e79a4..705cd96702 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -330,7 +330,8 @@ struct forest { shmem_size_params class_ssp_, proba_ssp_; int fixed_block_count_ = 0; int max_shm_ = 0; - // Optionally used + // vector_leaf_ is only used if {class,proba}_ssp_.leaf_algo is VECTOR_LEAF, + // otherwise it is empty rmm::device_uvector vector_leaf_; cat_sets_device_owner cat_sets_; }; From fda76692dca9008e3aee6f768253d3efc24fba76 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Sat, 12 Mar 2022 02:51:12 +0100 Subject: [PATCH 21/45] Fixed many compiler errors. --- cpp/src/fil/common.cuh | 4 +- cpp/src/fil/infer.cu | 199 ++++++++++++++++++++++------------------- 2 files changed, 111 insertions(+), 92 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index c03008186e..a9f1933458 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -94,7 +94,7 @@ struct storage> : storage_base { /** sparse tree */ template struct tree : tree_base { - using F = typename node_t::F; + using real_t = typename node_t::real_t; __host__ __device__ tree(categorical_sets cat_sets, node_t* nodes) : tree_base{cat_sets}, nodes_(nodes) { @@ -170,7 +170,7 @@ struct shmem_size_params { { return cols_in_shmem ? 
sizeof_real * sdata_stride() * n_items << log2_threads_per_tree : 0; } - template + template size_t get_smem_footprint(); }; diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 8ae937b444..4eaaede85b 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -157,7 +157,7 @@ __device__ __forceinline__ vec tree_leaf_output(tree_type t template __device__ __forceinline__ vec infer_one_tree( - tree_type tree, const typename tree_type::F* input, int cols, int n_rows) + tree_type tree, const typename tree_type::real_t* input, int cols, int n_rows) { // find the leaf nodes for each row int curr[NITEMS]; @@ -185,10 +185,8 @@ __device__ __forceinline__ vec infer_one_tree( } template -__device__ __forceinline__ vec<1, output_type> infer_one_tree(tree_type tree, - const typename tree_type::F* input, - int cols, - int rows) +__device__ __forceinline__ vec<1, output_type> infer_one_tree( + tree_type tree, const typename tree_type::real_t* input, int cols, int rows) { int curr = 0; for (;;) { @@ -219,12 +217,12 @@ host code. See https://rapids.ai/start.html as well as cmake defaults. */ // values below are defaults as of this change. -template +template size_t block_reduce_footprint_host() { return sizeof( typename cub:: - BlockReduce, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>:: + BlockReduce, FIL_TPB, cub::BLOCK_REDUCE_WARP_REDUCTIONS, 1, 1, 600>:: TempStorage); } @@ -249,10 +247,10 @@ __device__ __forceinline__ T block_reduce(T value, BinaryOp op, void* storage) } template // = FLOAT_UNARY_BINARY struct tree_aggregator_t { - vec acc; + vec acc; void* tmp_storage; /** shared memory footprint of the accumulator during @@ -264,8 +262,8 @@ struct tree_aggregator_t { int log2_threads_per_tree, bool predict_proba) { - return log2_threads_per_tree != 0 ? FIL_TPB * NITEMS * sizeof(F) - : block_reduce_footprint_host(); + return log2_threads_per_tree != 0 ? 
FIL_TPB * NITEMS * sizeof(real_t) : block_reduce_footprint_host<real_t, NITEMS>(); } /** shared memory footprint of the accumulator during @@ -279,19 +277,19 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - F* vector_leaf) + real_t* vector_leaf) : tmp_storage(finalize_workspace) { } - __device__ __forceinline__ void accumulate(vec<NITEMS, F> single_tree_prediction, + __device__ __forceinline__ void accumulate(vec<NITEMS, real_t> single_tree_prediction, int tree, int thread_num_rows) { acc += single_tree_prediction; } - __device__ INLINE_CONFIG void finalize(F* block_out, + __device__ INLINE_CONFIG void finalize(real_t* block_out, int block_num_rows, int output_stride, output_t transform, @@ -304,7 +302,7 @@ struct tree_aggregator_t { if (log2_threads_per_tree == 0) { acc = block_reduce(acc, vectorized(cub::Sum()), tmp_storage); } else { - auto per_thread = (vec<NITEMS, F>*)tmp_storage; + auto per_thread = (vec<NITEMS, real_t>*)tmp_storage; per_thread[threadIdx.x] = acc; __syncthreads(); // We have two pertinent cases for splitting FIL_TPB == 256 values: @@ -352,7 +350,7 @@ __device__ __forceinline__ auto allreduce_shmem(Iterator begin, // tmp_storage may overlap shared memory addressed by [begin, end) template <int NITEMS, typename Iterator, typename real_t> __device__ __forceinline__ void write_best_class( - Iterator begin, Iterator end, void* tmp_storage, F* out, int num_rows) + Iterator begin, Iterator end, void* tmp_storage, real_t* out, int num_rows) { // reduce per-class candidate margins to one best class candidate // per thread (for each of the NITEMS rows) @@ -371,8 +369,25 @@ __device__ __forceinline__ void write_best_class( }
/// needed for softmax -__device__ double shifted_exp(double margin, double max) { return exp(margin - max); } -__device__ float shifted_exp(float margin, float max) { return expf(margin - max); } +template <typename real_t> +__device__ real_t shifted_exp(real_t margin, real_t max) +{ + // this is always false, since specializations are used for float and double + static_assert(std::is_same_v<real_t, float> || std::is_same_v<real_t, double>, + "only specializations of shifted_exp can be used"); +} + +template <> +__device__ double shifted_exp(double margin, double max) +{ + return exp(margin - max); +} + +template <> +__device__ float shifted_exp(float margin, float max) +{ + return expf(margin - max); +}
// *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory addressed by [begin, end) template <typename Iterator> __device__ __forceinline__ void block_softmax(Iterator begin, Iterator end, void* tmp_storage) { // subtract max before exponentiating for numerical stability - typedef typename std::iterator_traits<Iterator>::value_type value_type; - value_type max = allreduce_shmem(begin, end, vectorized(cub::Max()), tmp_storage); + using real_t = typename std::iterator_traits<Iterator>::value_type; + real_t max = allreduce_shmem(begin, end, vectorized(cub::Max()), tmp_storage); for (Iterator it = begin + threadIdx.x; it < end; it += blockDim.x) - *it = vectorized(shifted_exp)(*it, max); + *it = vectorized(shifted_exp<real_t>)(*it, max); // sum of exponents - value_type soe = allreduce_shmem(begin, end, vectorized(cub::Sum()), tmp_storage); + real_t soe = allreduce_shmem(begin, end, vectorized(cub::Sum()), tmp_storage); // softmax phase 2: normalization for (Iterator it = begin + threadIdx.x; it < end; it += blockDim.x) *it /= soe; } // *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory
addressed by [begin, end) -template +template __device__ __forceinline__ void normalize_softmax_and_write(Iterator begin, Iterator end, output_t transform, int trees_per_class, void* tmp_storage, - F* out, + real_t* out, int num_rows) { if ((transform & output_t::AVG) != 0) { @@ -418,13 +433,13 @@ __device__ __forceinline__ void normalize_softmax_and_write(Iterator begin, // *begin and *end shall be struct vec // tmp_storage may NOT overlap shared memory addressed by [begin, end) // in case num_outputs > 1 -template +template __device__ __forceinline__ void class_margins_to_global_memory(Iterator begin, Iterator end, output_t transform, int trees_per_class, void* tmp_storage, - F* out, + real_t* out, int num_rows, int num_outputs) { @@ -437,11 +452,11 @@ __device__ __forceinline__ void class_margins_to_global_memory(Iterator begin, } } -template -struct tree_aggregator_t { - vec acc; +template +struct tree_aggregator_t { + vec acc; int num_classes; - vec* per_thread; + vec* per_thread; void* tmp_storage; static size_t smem_finalize_footprint(size_t data_row_size, @@ -449,9 +464,9 @@ struct tree_aggregator_t { int log2_threads_per_tree, bool predict_proba) { - size_t phase1 = (FIL_TPB - FIL_TPB % num_classes) * sizeof(vec); - size_t phase2 = predict_proba ? block_reduce_footprint_host() - : block_reduce_best_class_footprint_host(); + size_t phase1 = (FIL_TPB - FIL_TPB % num_classes) * sizeof(vec); + size_t phase2 = predict_proba ? block_reduce_footprint_host() + : block_reduce_best_class_footprint_host(); return predict_proba ? phase1 + phase2 : std::max(phase1, phase2); } @@ -460,21 +475,21 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - F* vector_leaf) + real_t* vector_leaf) : num_classes(params.num_classes), - per_thread((vec*)finalize_workspace), + per_thread((vec*)finalize_workspace), tmp_storage(params.predict_proba ? per_thread + num_classes : finalize_workspace) { } - __device__ __forceinline__ void accumulate(vec single_tree_prediction, + __device__ __forceinline__ void accumulate(vec single_tree_prediction, int tree, int thread_num_rows) { acc += single_tree_prediction; } - __device__ INLINE_CONFIG void finalize(F* out, + __device__ INLINE_CONFIG void finalize(real_t* out, int num_rows, int num_outputs, output_t transform, @@ -500,11 +515,11 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { - vec acc; +template +struct tree_aggregator_t { + vec acc; /// at first, per class margin, then, possibly, different softmax partials - vec* per_class_margin; + vec* per_class_margin; void* tmp_storage; int num_classes; @@ -514,30 +529,30 @@ struct tree_aggregator_t { bool predict_proba) { size_t phase1 = data_row_size + smem_accumulate_footprint(num_classes); - size_t phase2 = predict_proba ? block_reduce_footprint_host() - : block_reduce_best_class_footprint_host(); + size_t phase2 = predict_proba ? block_reduce_footprint_host() + : block_reduce_best_class_footprint_host(); return predict_proba ? 
phase1 + phase2 : std::max(phase1, phase2);
   }
   static __host__ __device__ size_t smem_accumulate_footprint(int num_classes)
   {
-    return num_classes * sizeof(vec);
+    return num_classes * sizeof(vec);
   }
   __device__ __forceinline__ tree_aggregator_t(predict_params params,
                                                void* accumulate_workspace,
                                                void* finalize_workspace,
-                                               F* vector_leaf)
-    : per_class_margin((vec*)accumulate_workspace),
+                                               real_t* vector_leaf)
+    : per_class_margin((vec*)accumulate_workspace),
       tmp_storage(params.predict_proba ? per_class_margin + num_classes : finalize_workspace),
       num_classes(params.num_classes)
   {
     for (int c = threadIdx.x; c < num_classes; c += blockDim.x)
-      per_class_margin[c] = vec(0);
+      per_class_margin[c] = vec(0);
     // __syncthreads() is called in infer_k
   }
-  __device__ __forceinline__ void accumulate(vec single_tree_prediction,
+  __device__ __forceinline__ void accumulate(vec single_tree_prediction,
                                              int tree,
                                              int thread_num_rows)
   {
@@ -546,7 +561,7 @@ struct tree_aggregator_t {
     __syncthreads();
   }
-  __device__ INLINE_CONFIG void finalize(F* out,
+  __device__ INLINE_CONFIG void finalize(real_t* out,
                                          int num_rows,
                                          int num_outputs,
                                          output_t transform,
@@ -564,17 +579,17 @@ struct tree_aggregator_t {
   }
 };

-template
-struct tree_aggregator_t {
+template
+struct tree_aggregator_t {
   // per_class_margin is a row-major matrix
   // of size num_threads_per_class * num_classes
   // used to accumulate class values
-  vec* per_class_margin;
+  vec* per_class_margin;
   vec* vector_leaf_indices;
   int* thread_num_rows;
   int num_classes;
   int num_threads_per_class;
-  F* vector_leaf;
+  real_t* vector_leaf;
   void* tmp_storage;

   static size_t smem_finalize_footprint(size_t data_row_size,
@@ -583,20 +598,20 @@ struct tree_aggregator_t {
                                         bool predict_proba)
   {
     size_t phase1 = data_row_size + smem_accumulate_footprint(num_classes);
-    size_t phase2 = predict_proba ? block_reduce_footprint_host()
-                                  : block_reduce_best_class_footprint_host();
+    size_t phase2 = predict_proba ? block_reduce_footprint_host()
+                                  : block_reduce_best_class_footprint_host();
     return predict_proba ?
phase1 + phase2 : std::max(phase1, phase2); } static size_t smem_accumulate_footprint(int num_classes) { - return sizeof(vec) * num_classes * max(1, FIL_TPB / num_classes) + + return sizeof(vec) * num_classes * max(1, FIL_TPB / num_classes) + sizeof(vec) * FIL_TPB + sizeof(int) * FIL_TPB; } __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - F* vector_leaf) + real_t* vector_leaf) : num_classes(params.num_classes), num_threads_per_class(max(1, blockDim.x / params.num_classes)), vector_leaf(vector_leaf), @@ -604,15 +619,15 @@ struct tree_aggregator_t { { // Assign workspace char* ptr = (char*)accumulate_workspace; - per_class_margin = (vec*)ptr; - ptr += sizeof(vec) * num_classes * num_threads_per_class; + per_class_margin = (vec*)ptr; + ptr += sizeof(vec) * num_classes * num_threads_per_class; vector_leaf_indices = (vec*)ptr; ptr += sizeof(vec) * blockDim.x; thread_num_rows = (int*)ptr; // Initialise shared memory for (int i = threadIdx.x; i < num_classes * num_threads_per_class; i += blockDim.x) { - per_class_margin[i] = vec(); + per_class_margin[i] = vec(); } vector_leaf_indices[threadIdx.x] = vec(); thread_num_rows[threadIdx.x] = 0; @@ -639,13 +654,13 @@ struct tree_aggregator_t { // we have num_classes threads for each j for (int j = i / num_classes; j < blockDim.x; j += num_threads_per_class) { for (int item = 0; item < thread_num_rows[j]; ++item) { - F pred = vector_leaf[vector_leaf_indices[j][item] * num_classes + c]; + real_t pred = vector_leaf[vector_leaf_indices[j][item] * num_classes + c]; per_class_margin[i][item] += pred; } } } } - __device__ INLINE_CONFIG void finalize(F* out, + __device__ INLINE_CONFIG void finalize(real_t* out, int num_rows, int num_outputs, output_t transform, @@ -670,8 +685,8 @@ struct tree_aggregator_t { } }; -template -struct tree_aggregator_t { +template +struct tree_aggregator_t { // could switch to uint16_t to save shared memory // provided raft::myAtomicAdd(short*) simulated with appropriate shifts int* votes; @@ -693,7 +708,7 @@ struct tree_aggregator_t { __device__ __forceinline__ tree_aggregator_t(predict_params params, void* accumulate_workspace, void* finalize_workspace, - F* vector_leaf) + real_t* vector_leaf) : num_classes(params.num_classes), votes((int*)accumulate_workspace) { for (int c = threadIdx.x; c < num_classes; c += FIL_TPB * NITEMS) @@ -714,7 +729,7 @@ struct tree_aggregator_t { } // class probabilities or regression. 
for regression, num_classes // is just the number of outputs for each data instance - __device__ __forceinline__ void finalize_multiple_outputs(F* out, int num_rows) + __device__ __forceinline__ void finalize_multiple_outputs(real_t* out, int num_rows) { __syncthreads(); for (int c = threadIdx.x; c < num_classes; c += blockDim.x) { @@ -725,7 +740,7 @@ struct tree_aggregator_t { } // using this when predicting a single class label, as opposed to sparse class vector // or class probabilities or regression - __device__ __forceinline__ void finalize_class_label(F* out, int num_rows) + __device__ __forceinline__ void finalize_class_label(real_t* out, int num_rows) { __syncthreads(); // make sure all votes[] are final int item = threadIdx.x; @@ -742,7 +757,7 @@ struct tree_aggregator_t { out[row] = best_class; } } - __device__ INLINE_CONFIG void finalize(F* out, + __device__ INLINE_CONFIG void finalize(real_t* out, int num_rows, int num_outputs, output_t transform, @@ -758,9 +773,12 @@ struct tree_aggregator_t { } }; -template -__device__ INLINE_CONFIG void load_data( - F* sdata, const F* block_input, predict_params params, int rows_per_block, int block_num_rows) +template +__device__ INLINE_CONFIG void load_data(real_t* sdata, + const real_t* block_input, + predict_params params, + int rows_per_block, + int block_num_rows) { int num_cols = params.num_cols; int sdata_stride = params.sdata_stride(); @@ -784,7 +802,7 @@ __device__ INLINE_CONFIG void load_data( } template (params.data) + block_row0 * num_cols; + const real_t* block_input = + reinterpret_cast(params.data) + block_row0 * num_cols; if constexpr (cols_in_shmem) load_data(sdata, block_input, params, rows_per_block, block_num_rows); - tree_aggregator_t acc( + tree_aggregator_t acc( params, (char*)sdata + params.cols_shmem_size(), sdata, forest.vector_leaf_); __syncthreads(); // for both row cache init and acc init @@ -821,7 +840,7 @@ __global__ void infer_k(storage_type forest, predict_params params) and is made exact below. Same with thread_num_rows > 0 */ - typedef typename leaf_output_t::T pred_t; + typedef typename leaf_output_t::T pred_t; vec prediction; if (tree < forest.num_trees() && thread_num_rows != 0) { prediction = infer_one_tree( @@ -834,7 +853,7 @@ __global__ void infer_k(storage_type forest, predict_params params) // Dummy threads can be marked as having 0 rows acc.accumulate(prediction, tree, tree < forest.num_trees() ? 
thread_num_rows : 0); } - acc.finalize(reinterpret_cast(params.preds) + params.num_outputs * block_row0, + acc.finalize(reinterpret_cast(params.preds) + params.num_outputs * block_row0, block_num_rows, params.num_outputs, params.transform, @@ -844,13 +863,13 @@ __global__ void infer_k(storage_type forest, predict_params params) } } -template +template size_t shmem_size_params::get_smem_footprint() { - size_t finalize_footprint = tree_aggregator_t::smem_finalize_footprint( + size_t finalize_footprint = tree_aggregator_t::smem_finalize_footprint( cols_shmem_size(), num_classes, log2_threads_per_tree, predict_proba); size_t accumulate_footprint = - tree_aggregator_t::smem_accumulate_footprint(num_classes) + + tree_aggregator_t::smem_accumulate_footprint(num_classes) + cols_shmem_size(); return std::max(accumulate_footprint, finalize_footprint); } @@ -858,7 +877,7 @@ size_t shmem_size_params::get_smem_footprint() template int compute_smem_footprint::run(predict_params ssp) { - return ssp.template get_smem_footprint(); + return ssp.template get_smem_footprint(); } // make sure to instantiate all possible get_smem_footprint instantiations @@ -894,16 +913,16 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } -struct instantiate_infer { - template - void operator()(fil_node_t) - { - infer>( - storage forest, predict_params params, cudaStream_t stream); - } -}; +// struct instantiate_infer { +// template +// void operator()(fil_node_t) +// { +// infer>( +// storage forest, predict_params params, cudaStream_t stream); +// } +// }; -template void instantiate_for_all_node_types(instantiate_infer); +// template void instantiate_for_all_node_types(instantiate_infer); } // namespace fil } // namespace ML From 40f7a238919482239f37c457d0adaa339b979263 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 14 Mar 2022 19:23:41 +0100 Subject: [PATCH 22/45] Multiple changes. --- cpp/src/fil/common.cuh | 1 + cpp/src/fil/fil.cu | 32 +++++++++++++++++--------------- cpp/src/fil/infer.cu | 10 ++++++++++ cpp/src/fil/internal.cuh | 5 +++-- cpp/src/fil/treelite_import.cu | 6 +++--- 5 files changed, 34 insertions(+), 20 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index a9f1933458..e7d7b2f11e 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -127,6 +127,7 @@ struct storage : storage_base { } }; +typedef storage> dense_storage; typedef storage> sparse_storage16; typedef storage sparse_storage8; diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index e6ff17737d..239938a3de 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -43,20 +43,20 @@ __host__ __device__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } averaging (multiplying by inv_num_trees), adding global_bias (always done), sigmoid and applying threshold. 
in case of complement_proba, fills in the complement probability */ -template -__global__ void transform_k(F* preds, +template +__global__ void transform_k(real_t* preds, size_t n, output_t output, - F inv_num_trees, - F threshold, - F global_bias, + real_t inv_num_trees, + real_t threshold, + real_t global_bias, bool complement_proba) { size_t i = threadIdx.x + size_t(blockIdx.x) * blockDim.x; if (i >= n) return; if (complement_proba && i % 2 != 0) return; - F result = preds[i]; + real_t result = preds[i]; if ((output & output_t::AVG) != 0) result *= inv_num_trees; result += global_bias; if ((output & output_t::SIGMOID) != 0) result = sigmoid(result); @@ -175,9 +175,9 @@ struct forest { virtual void infer(predict_params params, cudaStream_t stream) = 0; - template + template void predict( - const raft::handle_t& h, F* preds, const F* data, size_t num_rows, bool predict_proba) + const raft::handle_t& h, real_t* preds, const real_t* data, size_t num_rows, bool predict_proba) { // Initialize prediction parameters. predict_params params(predict_proba ? proba_ssp_ : class_ssp_); @@ -256,7 +256,7 @@ struct forest { // Simulating treelite order, which cancels out bias. // If non-proba prediction used, it still will not matter // for the same reason softmax will not. - F global_bias = (ot & output_t::SOFTMAX) != 0 ? F(0.0) : global_bias_; + real_t global_bias = (ot & output_t::SOFTMAX) != 0 ? real_t(0.0) : global_bias_; bool complement_proba = false, do_transform; if (predict_proba) { @@ -273,17 +273,19 @@ struct forest { // for GROVE_PER_CLASS, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; - do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != F(0.0); + do_transform = + (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0.0); break; case leaf_algo_t::CATEGORICAL_LEAF: params.num_outputs = params.num_classes; - do_transform = ot != output_t::RAW || global_bias_ != F(0.0); + do_transform = ot != output_t::RAW || global_bias_ != real_t(0.0); break; case leaf_algo_t::VECTOR_LEAF: // for VECTOR_LEAF, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; - do_transform = (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != F(0.0); + do_transform = + (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0.0); break; default: ASSERT(false, "internal error: predict: invalid leaf_algo %d", params.leaf_algo); } @@ -308,9 +310,9 @@ struct forest { preds, num_values_to_transform, ot, - num_trees_ > 0 ? (F(1.0) / num_trees_) : F(1.0), - F(threshold_), - F(global_bias), + num_trees_ > 0 ? 
(real_t(1.0) / num_trees_) : real_t(1.0), + real_t(threshold_), + real_t(global_bias), complement_proba); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 4eaaede85b..142e739cfb 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -913,6 +913,16 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } +template void infer(dense_storage forest, + predict_params params, + cudaStream_t stream); +template void infer(sparse_storage16 forest, + predict_params params, + cudaStream_t stream); +template void infer(sparse_storage8 forest, + predict_params params, + cudaStream_t stream); + // struct instantiate_infer { // template // void operator()(fil_node_t) diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index fbfd687248..801b8c6564 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -393,7 +393,7 @@ struct categorical_sets { // set count is due to tree_idx + node_within_tree_idx are both ints, hence uint32_t result template __host__ __device__ __forceinline__ int category_matches(node_t node, - typename node_t::F category) const + typename node_t::real_t category) const { // standard boolean packing. This layout has better ILP // node.set() is global across feature IDs and is an offset (as opposed @@ -409,7 +409,8 @@ struct categorical_sets { FIL will reject a model where an integer within [0, fid_num_cats] cannot be represented precisely as a 32-bit float. */ - return static_cast(category) < fid_num_cats[node.fid()] && category >= 0.0f && + return static_cast(category) < fid_num_cats[node.fid()] && + category >= 0.0f && fetch_bit(bits + node.set(), static_cast(static_cast(category))); } static int sizeof_mask_from_num_cats(int num_cats) diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index ae941452e6..7fc90eab3b 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -655,11 +655,11 @@ void convert(const raft::handle_t& handle, tl2fil.init_forest(handle, pforest); } -template +template constexpr bool type_supported() { // not using std::is_floating_point because we did not instantiate fp16-based nodes/trees/forests - return std::is_same() || std::is_same(); + return std::is_same() || std::is_same(); } template @@ -681,7 +681,7 @@ void from_treelite(const raft::handle_t& handle, "This may lead to predictions with reduced accuracy."); } // same as std::common_type: float+double=double, float+int64_t=float - typedef decltype(threshold_t{} + leaf_t{}) F; + typedef decltype(threshold_t{} + leaf_t{}) real_t; storage_type_t storage_type = tl_params->storage_type; // build dense trees by default From 12cc051913860aba9a182dd06813aa07912a9ae0 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 14 Mar 2022 20:13:23 +0100 Subject: [PATCH 23/45] Fixed compilation errors; now it compiles. 
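The fix replaces the shifted_exp function template, whose primary template had no return
statement and which had to be explicitly instantiated before it could be passed to
vectorized(), with a functor whose overloaded operator() lets overload resolution pick the
float or double version per element. Below is a minimal host-only sketch of why the functor
composes cleanly with vectorized(); the vec and Vectorized types here are simplified
stand-ins for the ones in infer.cu, not the actual FIL definitions.

    #include <cmath>
    #include <cstdio>

    template <int N, typename T>
    struct vec {
      T data[N];
    };

    template <typename BinaryOp>
    struct Vectorized {
      BinaryOp op;
      Vectorized(BinaryOp op_) : op(op_) {}
      // apply the scalar op lane by lane
      template <int N, typename T>
      vec<N, T> operator()(vec<N, T> a, vec<N, T> b) const
      {
        vec<N, T> c;
        for (int i = 0; i < N; ++i)
          c.data[i] = op(a.data[i], b.data[i]);
        return c;
      }
    };

    template <typename BinaryOp>
    Vectorized<BinaryOp> vectorized(BinaryOp op)
    {
      return Vectorized<BinaryOp>(op);
    }

    // a functor is an ordinary object carrying both overloads, so
    // vectorized(shifted_exp()) deduces BinaryOp without naming a type
    struct shifted_exp {
      double operator()(double margin, double max) const { return exp(margin - max); }
      float operator()(float margin, float max) const { return expf(margin - max); }
    };

    int main()
    {
      vec<2, float> margin{{0.0f, 1.0f}}, max{{1.0f, 1.0f}};
      vec<2, float> e = vectorized(shifted_exp())(margin, max);
      printf("%f %f\n", e.data[0], e.data[1]);  // exp(-1), exp(0)
    }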
---
 cpp/src/fil/infer.cu | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 142e739cfb..101a9ef826 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -48,7 +48,7 @@ struct vec;
 template
 struct Vectorized {
   BinaryOp op;
-  __device__ Vectorized(BinaryOp op_) : op(op_) {}
+  __host__ __device__ Vectorized(BinaryOp op_) : op(op_) {}
   template
   constexpr __host__ __device__ __forceinline__ vec operator()(vec a, vec b) const
@@ -63,7 +63,7 @@
 template
 constexpr __host__ __device__ Vectorized vectorized(BinaryOp op)
 {
-  return op;
+  return Vectorized(op);
 }

 template
@@ -369,25 +369,11 @@ __device__ __forceinline__ void write_best_class(
 }

 /// needed for softmax
-template
-__device__ real_t shifted_exp(real_t margin, real_t max)
-{
-  // this is always false, since specializations are used for float and double
-  static_assert(std::is_same_v || std::is_same_v,
-                "only specializations of shifted_exp can be used");
-}
-
-template <>
-__device__ double shifted_exp(double margin, double max)
-{
-  return exp(margin - max);
-}
+struct shifted_exp {
+  __device__ double operator()(double margin, double max) const { return exp(margin - max); }

-template <>
-__device__ float shifted_exp(float margin, float max)
-{
-  return expf(margin - max);
-}
+  __device__ float operator()(float margin, float max) const { return expf(margin - max); }
+};

 // *begin and *end shall be struct vec
 // tmp_storage may NOT overlap shared memory addressed by [begin, end)
@@ -395,12 +381,12 @@ template
 __device__ __forceinline__ void block_softmax(Iterator begin, Iterator end, void* tmp_storage)
 {
   // subtract max before exponentiating for numerical stability
-  using real_t = typename std::iterator_traits::value_type;
-  real_t max = allreduce_shmem(begin, end, vectorized(cub::Max()), tmp_storage);
+  using value_type = typename std::iterator_traits::value_type;
+  value_type max = allreduce_shmem(begin, end, vectorized(cub::Max()), tmp_storage);
   for (Iterator it = begin + threadIdx.x; it < end; it += blockDim.x)
-    *it = vectorized(shifted_exp)(*it, max);
+    *it = vectorized(shifted_exp())(*it, max);
   // sum of exponents
-  real_t soe = allreduce_shmem(begin, end, vectorized(cub::Sum()), tmp_storage);
+  value_type soe = allreduce_shmem(begin, end, vectorized(cub::Sum()), tmp_storage);
   // softmax phase 2: normalization
   for (Iterator it = begin + threadIdx.x; it < end; it += blockDim.x)
     *it /= soe;

From 2bc6b6efdb5b281cb33f1637e320ff2596cf9560 Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Wed, 16 Mar 2022 23:52:56 +0100
Subject: [PATCH 24/45] float -> void in predict().
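Taking void* in the public predict() keeps a single non-template entry point in the C-style
API while the precision-specific work happens behind the existing virtual call. A distilled
sketch of the type-erasure pattern follows; forest_base, typed_forest and predict_impl are
placeholder names for illustration, not the real cuML classes.

    #include <cstddef>

    // type-independent base: the public API only ever sees this
    struct forest_base {
      virtual void predict(void* preds, const void* data, std::size_t num_rows) = 0;
      virtual ~forest_base() {}
    };

    // precision-dependent part recovers the concrete type behind void*
    template <typename real_t>
    struct typed_forest : forest_base {
      void predict(void* preds, const void* data, std::size_t num_rows) override
      {
        predict_impl(static_cast<real_t*>(preds),
                     static_cast<const real_t*>(data),
                     num_rows);
      }
      void predict_impl(real_t* preds, const real_t* data, std::size_t num_rows)
      {
        // placeholder for the real inference path
        for (std::size_t i = 0; i < num_rows; ++i)
          preds[i] = data[i];
      }
    };

The void* boundary gives up compile-time type checking, so callers must pass buffers of the
same precision the forest was built with.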
--- cpp/include/cuml/fil/fil.h | 8 +++++--- cpp/src/fil/fil.cu | 21 +++++++-------------- cpp/src/fil/infer.cu | 2 +- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index dfa66ad1a8..98be3b5593 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -128,17 +128,19 @@ void free(const raft::handle_t& h, forest_t f); * @param h cuML handle used by this function * @param f forest used for predictions * @param preds array in GPU memory to store predictions into - size == predict_proba ? (2*num_rows) : num_rows + * size = predict_proba ? (2*num_rows) : num_rows + * type = the type used for the forest representation (float or double) * @param data array of size n * cols (cols is the number of columns * for the forest f) from which to predict + * type = the type used for the forest representation (float or double) * @param num_rows number of data rows * @param predict_proba for classifier models, this forces to output both class probabilities * instead of binary class prediction. format matches scikit-learn API */ void predict(const raft::handle_t& h, forest_t f, - float* preds, - const float* data, + void* preds, + const void* data, size_t num_rows, bool predict_proba = false); diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 239938a3de..3131f5227b 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -640,23 +640,16 @@ void free(const raft::handle_t& h, forest_t f) /// part of C API - overload instead of template void predict(const raft::handle_t& h, forest_t f, - double* preds, - const double* data, + void* preds, + const void* data, size_t num_rows, bool predict_proba) { - f->predict(h, preds, data, num_rows, predict_proba); -} - -/// part of C API - overload instead of template -void predict(const raft::handle_t& h, - forest_t f, - float* preds, - const float* data, - size_t num_rows, - bool predict_proba) -{ - f->predict(h, preds, data, num_rows, predict_proba); + f->predict(h, + reinterpret_cast(preds), + reinterpret_cast(data), + num_rows, + predict_proba); } } // namespace fil diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 101a9ef826..de64a2b262 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __forceinline__ +#define INLINE_CONFIG __noinline__ namespace ML { namespace fil { From 138e1dd38d588e86a06a12b06aae23274c2da73f Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 17 Mar 2022 00:24:34 +0100 Subject: [PATCH 25/45] Some templating. --- cpp/src/fil/fil.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 3131f5227b..8d6068f43c 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -61,11 +61,11 @@ __global__ void transform_k(real_t* preds, result += global_bias; if ((output & output_t::SIGMOID) != 0) result = sigmoid(result); // will not be done on CATEGORICAL_LEAF because the whole kernel will not run - if ((output & output_t::CLASS) != 0) { result = result > threshold ? 1.0f : 0.0f; } + if ((output & output_t::CLASS) != 0) { result = result > threshold ? 
real_t(1) : real_t(0); } // sklearn outputs numpy array in 'C' order, with the number of classes being last dimension // that is also the default order, so we should use the same one if (complement_proba) { - preds[i] = 1.0f - result; + preds[i] = real_t(1) - result; preds[i + 1] = result; } else preds[i] = result; From 532686fe0c49b802523035759108ab674cd984b5 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 17 Mar 2022 01:38:25 +0100 Subject: [PATCH 26/45] template_forest for type-dependent forest members. --- cpp/src/fil/fil.cu | 199 ++++++++++++++++++++++----------------- cpp/src/fil/internal.cuh | 4 +- 2 files changed, 117 insertions(+), 86 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 8d6068f43c..100049a082 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -76,8 +76,11 @@ __global__ void transform_k(real_t* preds, // but rather one symbol for the whole template specialization, as below. extern template int dispatch_on_fil_template_params(compute_smem_footprint, predict_params); +// forest is the base type for all forests and contains data and methods common +// to both dense and sparse forests and independent of the floating-point type +// used for model and data struct forest { - forest(const raft::handle_t& h) : vector_leaf_(0, h.get_stream()), cat_sets_(h.get_stream()) {} + forest(const raft::handle_t& h) : cat_sets_(h.get_stream()) {} void init_shmem_size(int device) { @@ -131,7 +134,34 @@ struct forest { fixed_block_count_ = blocks_per_sm * sm_count; } - template + virtual void predict(const raft::handle_t& h, + void* preds, + const void* data, + size_t num_rows, + bool predict_proba) = 0; + + virtual void free(const raft::handle_t& h) { cat_sets_.release(); } + + virtual ~forest() {} + + int num_trees_ = 0; + int depth_ = 0; + algo_t algo_ = algo_t::NAIVE; + output_t output_ = output_t::RAW; + double threshold_ = 0.5; + double global_bias_ = 0; + shmem_size_params class_ssp_, proba_ssp_; + int fixed_block_count_ = 0; + int max_shm_ = 0; + cat_sets_device_owner cat_sets_; +}; + +// template_forest contains data and methods common for both dense and sparse forests +// but dependent on the floating-point type used for model and data +template +struct template_forest : public forest { + template_forest(const raft::handle_t& h) : forest(h), vector_leaf_(0, h.get_stream()) {} + void init_common(const raft::handle_t& h, const categorical_sets& cat_sets, const std::vector& vector_leaf, @@ -175,8 +205,20 @@ struct forest { virtual void infer(predict_params params, cudaStream_t stream) = 0; - template - void predict( + virtual void predict(const raft::handle_t& h, + void* preds, + const void* data, + size_t num_rows, + bool predict_proba) override + { + template_predict(h, + reinterpret_cast(preds), + reinterpret_cast(data), + num_rows, + predict_proba); + } + + void template_predict( const raft::handle_t& h, real_t* preds, const real_t* data, size_t num_rows, bool predict_proba) { // Initialize prediction parameters. 
@@ -320,25 +362,13 @@ struct forest { virtual void free(const raft::handle_t& h) { - cat_sets_.release(); vector_leaf_.release(); + forest::free(h); } - virtual ~forest() {} - - int num_trees_ = 0; - int depth_ = 0; - algo_t algo_ = algo_t::NAIVE; - output_t output_ = output_t::RAW; - double threshold_ = 0.5; - double global_bias_ = 0; - shmem_size_params class_ssp_, proba_ssp_; - int fixed_block_count_ = 0; - int max_shm_ = 0; // vector_leaf_ is only used if {class,proba}_ssp_.leaf_algo is VECTOR_LEAF, // otherwise it is empty - rmm::device_uvector vector_leaf_; - cat_sets_device_owner cat_sets_; + rmm::device_uvector vector_leaf_; }; template @@ -364,9 +394,9 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { }; template -struct dense_forest> : forest { +struct dense_forest> : template_forest { using node_t = dense_node; - dense_forest(const raft::handle_t& h) : forest(h), nodes_(0, h.get_stream()) {} + dense_forest(const raft::handle_t& h) : template_forest(h), nodes_(0, h.get_stream()) {} void transform_trees(const node_t* nodes) { @@ -379,14 +409,14 @@ struct dense_forest> : forest { roots of all trees (node 2), and so on. */ int global_node = 0; - for (int tree = 0; tree < num_trees_; ++tree) { + for (int tree = 0; tree < this->num_trees_; ++tree) { int tree_node = 0; // the counters `level` and `branch` are not used for computing node // indices, they are only here to highlight the node ordering within // each tree - for (int level = 0; level <= depth_; ++level) { + for (int level = 0; level <= this->depth_; ++level) { for (int branch = 0; branch < 1 << level; ++branch) { - h_nodes_[tree_node * num_trees_ + tree] = nodes[global_node]; + h_nodes_[tree_node * this->num_trees_ + tree] = nodes[global_node]; ++tree_node; ++global_node; } @@ -403,13 +433,13 @@ struct dense_forest> : forest { const node_t* nodes, const forest_params_t* params) { - init_common(h, cat_sets, vector_leaf, params); - if (algo_ == algo_t::NAIVE) algo_ = algo_t::BATCH_TREE_REORG; + this->init_common(h, cat_sets, vector_leaf, params); + if (this->algo_ == algo_t::NAIVE) this->algo_ = algo_t::BATCH_TREE_REORG; - int num_nodes = forest_num_nodes(num_trees_, depth_); + int num_nodes = forest_num_nodes(this->num_trees_, this->depth_); nodes_.resize(num_nodes, h.get_stream()); h_nodes_.resize(num_nodes); - if (algo_ == algo_t::NAIVE) { + if (this->algo_ == algo_t::NAIVE) { std::copy(nodes, nodes + num_nodes, h_nodes_.begin()); } else { transform_trees(nodes); @@ -421,8 +451,8 @@ struct dense_forest> : forest { h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), - static_cast(class_ssp_)); + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(this->max_shm_), + static_cast(this->class_ssp_)); // copy must be finished before freeing the host data h.sync_stream(); h_nodes_.clear(); @@ -431,12 +461,12 @@ struct dense_forest> : forest { virtual void infer(predict_params params, cudaStream_t stream) override { - storage forest(cat_sets_.accessor(), - reinterpret_cast(vector_leaf_.data()), + storage forest(this->cat_sets_.accessor(), + reinterpret_cast(this->vector_leaf_.data()), nodes_.data(), - num_trees_, - algo_ == algo_t::NAIVE ? tree_num_nodes(depth_) : 1, - algo_ == algo_t::NAIVE ? 1 : num_trees_); + this->num_trees_, + this->algo_ == algo_t::NAIVE ? tree_num_nodes(this->depth_) : 1, + this->algo_ == algo_t::NAIVE ? 
1 : this->num_trees_); static_assert(std::is_same_v, "real_t must be float; to be removed in the following pull requests"); fil::infer(forest, params, stream); @@ -445,7 +475,7 @@ struct dense_forest> : forest { virtual void free(const raft::handle_t& h) override { nodes_.release(); - forest::free(h); + template_forest::free(h); } rmm::device_uvector nodes_; @@ -453,28 +483,35 @@ struct dense_forest> : forest { }; template -struct sparse_forest : forest { +struct sparse_forest : template_forest { + using real_t = typename node_t::real_t; + sparse_forest(const raft::handle_t& h) - : forest(h), trees_(0, h.get_stream()), nodes_(0, h.get_stream()) + : template_forest(h), + trees_(0, h.get_stream()), + nodes_(0, h.get_stream()) { } void init(const raft::handle_t& h, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const node_t* nodes, const forest_params_t* params) { - init_common(h, cat_sets, vector_leaf, params); - if (algo_ == algo_t::ALGO_AUTO) algo_ = algo_t::NAIVE; - depth_ = 0; // a placeholder value - num_nodes_ = params->num_nodes; + this->init_common(h, cat_sets, vector_leaf, params); + if (this->algo_ == algo_t::ALGO_AUTO) this->algo_ = algo_t::NAIVE; + this->depth_ = 0; // a placeholder value + num_nodes_ = params->num_nodes; // trees - trees_.resize(num_trees_, h.get_stream()); - RAFT_CUDA_TRY(cudaMemcpyAsync( - trees_.data(), trees, sizeof(int) * num_trees_, cudaMemcpyHostToDevice, h.get_stream())); + trees_.resize(this->num_trees_, h.get_stream()); + RAFT_CUDA_TRY(cudaMemcpyAsync(trees_.data(), + trees, + sizeof(int) * this->num_trees_, + cudaMemcpyHostToDevice, + h.get_stream())); // nodes nodes_.resize(num_nodes_, h.get_stream()); @@ -482,25 +519,25 @@ struct sparse_forest : forest { nodes_.data(), nodes, sizeof(node_t) * num_nodes_, cudaMemcpyHostToDevice, h.get_stream())); // predict_proba is a runtime parameter, and opt-in is unconditional - dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(max_shm_), - static_cast(class_ssp_)); + dispatch_on_fil_template_params(opt_into_arch_dependent_shmem>(this->max_shm_), + static_cast(this->class_ssp_)); } virtual void infer(predict_params params, cudaStream_t stream) override { - storage forest(cat_sets_.accessor(), - reinterpret_cast(vector_leaf_.data()), + storage forest(this->cat_sets_.accessor(), + reinterpret_cast(this->vector_leaf_.data()), trees_.data(), nodes_.data(), - num_trees_); - static_assert(std::is_same_v, + this->num_trees_); + static_assert(std::is_same_v, "real_t must be float; to be removed in the following pull requests"); fil::infer(forest, params, stream); } void free(const raft::handle_t& h) override { - forest::free(h); + template_forest::free(h); trees_.release(); nodes_.release(); } @@ -588,11 +625,11 @@ void check_params(const forest_params_t* params, bool dense) /** initializes a forest of any type * When fil_node_t == dense_node, const int* trees is ignored */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params) @@ -600,36 +637,34 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - static_assert(std::is_same_v, + static_assert(std::is_same_v, "real_t must be float; to be removed in the following pull requests"); 
f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } -// explicit instantiations for init_sparse() -template void init>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node16* nodes, - const forest_params_t* params); - -template void init(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const sparse_node8* nodes, - const forest_params_t* params); - -template void init>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const dense_node* nodes, - const forest_params_t* params); +// explicit instantiations for init() +template void init, float>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node16* nodes, + const forest_params_t* params); +template void init(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node8* nodes, + const forest_params_t* params); +template void init, float>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); void free(const raft::handle_t& h, forest_t f) { @@ -645,11 +680,7 @@ void predict(const raft::handle_t& h, size_t num_rows, bool predict_proba) { - f->predict(h, - reinterpret_cast(preds), - reinterpret_cast(data), - num_rows, - predict_proba); + f->predict(h, preds, data, num_rows, predict_proba); } } // namespace fil diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 801b8c6564..5223a141b3 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -561,11 +561,11 @@ struct cat_sets_device_owner { * @param params pointer to parameters used to initialize the forest * @param vector_leaf optional vector leaves */ -template +template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, - const std::vector& vector_leaf, + const std::vector& vector_leaf, const int* trees, const fil_node_t* nodes, const forest_params_t* params); From ce689f7c2e0d97ae1a0b5ad43a20201f654a32b4 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 17 Mar 2022 21:59:44 +0100 Subject: [PATCH 27/45] Instantiate forests with double. 
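Explicit instantiations compile the templated init()/infer() definitions once per supported
precision inside the library, so client translation units link against the float and double
symbols without seeing or recompiling the kernels. An illustrative sketch of the pattern with
placeholder names (infer_all is not a real cuML symbol):

    // header: declaration only, no template definition leaks out
    template <typename real_t>
    void infer_all(const real_t* data, real_t* preds, int n);

    // one .cu file: the definition plus one explicit instantiation per
    // supported precision; these are the only symbols the library exports
    template <typename real_t>
    void infer_all(const real_t* data, real_t* preds, int n)
    {
      for (int i = 0; i < n; ++i)
        preds[i] = data[i];  // placeholder body
    }

    template void infer_all<float>(const float*, float*, int);
    template void infer_all<double>(const double*, double*, int);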
--- cpp/src/fil/common.cuh | 8 +++++--- cpp/src/fil/fil.cu | 36 +++++++++++++++++++++--------------- cpp/src/fil/infer.cu | 29 ++++++++++++----------------- cpp/src/fil/internal.cuh | 4 ++-- 4 files changed, 40 insertions(+), 37 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index e7d7b2f11e..71cb1ad08d 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -127,9 +127,11 @@ struct storage : storage_base { } }; -typedef storage> dense_storage; -typedef storage> sparse_storage16; -typedef storage sparse_storage8; +using dense_storage_f32 = storage>; +using dense_storage_f64 = storage>; +using sparse_storage16_f32 = storage>; +using sparse_storage16_f64 = storage>; +using sparse_storage8 = storage; /// all model parameters mostly required to compute shared memory footprint, /// also the footprint itself diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 100049a082..49e98f9e8b 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -379,8 +379,6 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); auto kernel = infer_k> : template_forest { this->num_trees_, this->algo_ == algo_t::NAIVE ? tree_num_nodes(this->depth_) : 1, this->algo_ == algo_t::NAIVE ? 1 : this->num_trees_); - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); fil::infer(forest, params, stream); } @@ -530,8 +526,6 @@ struct sparse_forest : template_forest { trees_.data(), nodes_.data(), this->num_trees_); - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); fil::infer(forest, params, stream); } @@ -637,13 +631,25 @@ void init(const raft::handle_t& h, check_params(params, node_traits::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } // explicit instantiations for init() +template void init, float>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); +template void init, double>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const dense_node* nodes, + const forest_params_t* params); template void init, float>(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, @@ -651,6 +657,13 @@ template void init, float>(const raft::handle_t& h, const int* trees, const sparse_node16* nodes, const forest_params_t* params); +template void init, double>(const raft::handle_t& h, + forest_t* pf, + const categorical_sets& cat_sets, + const std::vector& vector_leaf, + const int* trees, + const sparse_node16* nodes, + const forest_params_t* params); template void init(const raft::handle_t& h, forest_t* pf, const categorical_sets& cat_sets, @@ -658,13 +671,6 @@ template void init(const raft::handle_t& h, const int* trees, const sparse_node8* nodes, const forest_params_t* params); -template void init, float>(const raft::handle_t& h, - forest_t* pf, - const categorical_sets& cat_sets, - const std::vector& vector_leaf, - const int* trees, - const dense_node* nodes, - const forest_params_t* 
params); void free(const raft::handle_t& h, forest_t f) { diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index de64a2b262..233b1bec5d 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -899,26 +899,21 @@ void infer(storage_type forest, predict_params params, cudaStream_t stream) dispatch_on_fil_template_params(infer_k_storage_template(forest, stream), params); } -template void infer(dense_storage forest, - predict_params params, - cudaStream_t stream); -template void infer(sparse_storage16 forest, - predict_params params, - cudaStream_t stream); +template void infer(dense_storage_f32 forest, + predict_params params, + cudaStream_t stream); +template void infer(dense_storage_f64 forest, + predict_params params, + cudaStream_t stream); +template void infer(sparse_storage16_f32 forest, + predict_params params, + cudaStream_t stream); +template void infer(sparse_storage16_f64 forest, + predict_params params, + cudaStream_t stream); template void infer(sparse_storage8 forest, predict_params params, cudaStream_t stream); -// struct instantiate_infer { -// template -// void operator()(fil_node_t) -// { -// infer>( -// storage forest, predict_params params, cudaStream_t stream); -// } -// }; - -// template void instantiate_for_all_node_types(instantiate_infer); - } // namespace fil } // namespace ML diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 5223a141b3..0f39270588 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -409,8 +409,8 @@ struct categorical_sets { FIL will reject a model where an integer within [0, fid_num_cats] cannot be represented precisely as a 32-bit float. */ - return static_cast(category) < fid_num_cats[node.fid()] && - category >= 0.0f && + using real_t = typename node_t::real_t; + return category < static_cast(fid_num_cats[node.fid()]) && category >= real_t(0) && fetch_bit(bits + node.set(), static_cast(static_cast(category))); } static int sizeof_mask_from_num_cats(int num_cats) From 98b997a0ea934db899b12b12881a375563c4801b Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 17 Mar 2022 23:27:07 +0100 Subject: [PATCH 28/45] Small changes. --- cpp/src/fil/fil.cu | 38 +++++++++++++++++++------------------- cpp/src/fil/internal.cuh | 4 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index 49e98f9e8b..43ec9fcad4 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -144,12 +144,10 @@ struct forest { virtual ~forest() {} - int num_trees_ = 0; - int depth_ = 0; - algo_t algo_ = algo_t::NAIVE; - output_t output_ = output_t::RAW; - double threshold_ = 0.5; - double global_bias_ = 0; + int num_trees_ = 0; + int depth_ = 0; + algo_t algo_ = algo_t::NAIVE; + output_t output_ = output_t::RAW; shmem_size_params class_ssp_, proba_ssp_; int fixed_block_count_ = 0; int max_shm_ = 0; @@ -171,8 +169,8 @@ struct template_forest : public forest { num_trees_ = params->num_trees; algo_ = params->algo; output_ = params->output; - threshold_ = params->threshold; - global_bias_ = params->global_bias; + threshold_ = static_cast(params->threshold); + global_bias_ = static_cast(params->global_bias); proba_ssp_.n_items = params->n_items; proba_ssp_.log2_threads_per_tree = log2(params->threads_per_tree); proba_ssp_.leaf_algo = params->leaf_algo; @@ -298,7 +296,7 @@ struct template_forest : public forest { // Simulating treelite order, which cancels out bias. // If non-proba prediction used, it still will not matter // for the same reason softmax will not. 
- real_t global_bias = (ot & output_t::SOFTMAX) != 0 ? real_t(0.0) : global_bias_; + real_t global_bias = (ot & output_t::SOFTMAX) != 0 ? real_t(0) : global_bias_; bool complement_proba = false, do_transform; if (predict_proba) { @@ -316,24 +314,24 @@ struct template_forest : public forest { ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; do_transform = - (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0.0); + (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0); break; case leaf_algo_t::CATEGORICAL_LEAF: params.num_outputs = params.num_classes; - do_transform = ot != output_t::RAW || global_bias_ != real_t(0.0); + do_transform = ot != output_t::RAW || global_bias_ != real_t(0); break; case leaf_algo_t::VECTOR_LEAF: // for VECTOR_LEAF, averaging happens in infer_k ot = output_t(ot & ~output_t::AVG); params.num_outputs = params.num_classes; do_transform = - (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0.0); + (ot != output_t::RAW && ot != output_t::SOFTMAX) || global_bias != real_t(0); break; default: ASSERT(false, "internal error: predict: invalid leaf_algo %d", params.leaf_algo); } } else { if (params.leaf_algo == leaf_algo_t::FLOAT_UNARY_BINARY) { - do_transform = ot != output_t::RAW || global_bias_ != 0.0f; + do_transform = ot != output_t::RAW || global_bias_ != real_t(0); } else { // GROVE_PER_CLASS, CATEGORICAL_LEAF: moot since choosing best class and // all transforms are monotonic. also, would break current code @@ -352,9 +350,9 @@ struct template_forest : public forest { preds, num_values_to_transform, ot, - num_trees_ > 0 ? (real_t(1.0) / num_trees_) : real_t(1.0), - real_t(threshold_), - real_t(global_bias), + num_trees_ > 0 ? (real_t(1) / num_trees_) : real_t(1), + threshold_, + global_bias, complement_proba); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -366,6 +364,8 @@ struct template_forest : public forest { forest::free(h); } + real_t threshold_ = 0.5; + real_t global_bias_ = 0; // vector_leaf_ is only used if {class,proba}_ssp_.leaf_algo is VECTOR_LEAF, // otherwise it is empty rmm::device_uvector vector_leaf_; @@ -460,7 +460,7 @@ struct dense_forest> : template_forest { virtual void infer(predict_params params, cudaStream_t stream) override { storage forest(this->cat_sets_.accessor(), - reinterpret_cast(this->vector_leaf_.data()), + this->vector_leaf_.data(), nodes_.data(), this->num_trees_, this->algo_ == algo_t::NAIVE ? tree_num_nodes(this->depth_) : 1, @@ -522,7 +522,7 @@ struct sparse_forest : template_forest { virtual void infer(predict_params params, cudaStream_t stream) override { storage forest(this->cat_sets_.accessor(), - reinterpret_cast(this->vector_leaf_.data()), + this->vector_leaf_.data(), trees_.data(), nodes_.data(), this->num_trees_); @@ -678,7 +678,7 @@ void free(const raft::handle_t& h, forest_t f) delete f; } -/// part of C API - overload instead of template +/// part of C API - preds and data are either both pointers to float or to double void predict(const raft::handle_t& h, forest_t f, void* preds, diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh index 0f39270588..37b600d201 100644 --- a/cpp/src/fil/internal.cuh +++ b/cpp/src/fil/internal.cuh @@ -242,7 +242,7 @@ struct node_traits { using forest = sparse_forest; static const bool IS_DENSE = false; static const storage_type_t storage_type_enum = - std::is_same, node_t>() ? SPARSE : SPARSE8; + std::is_same, node_t>() ? 
SPARSE : SPARSE8; template static void check(const treelite::ModelImpl& model); }; @@ -431,7 +431,7 @@ struct tree_base { template __host__ __device__ __forceinline__ int child_index(const node_t& node, int node_idx, - float val) const + typename node_t::real_t val) const { bool cond; From 8c84cf7d0663ee8ff764792a2c1cb694eff70e6b Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Fri, 18 Mar 2022 03:14:00 +0100 Subject: [PATCH 29/45] Templatized BaseFilTest. --- cpp/test/sg/fil_test.cu | 195 +++++++++++++++++++++------------------- 1 file changed, 103 insertions(+), 92 deletions(-) diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 74ab0c4d00..893c6bf92a 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -131,7 +131,8 @@ std::ostream& operator<<(std::ostream& os, const FilTestParams& ps) return os; } -__global__ void nan_kernel(float* data, const bool* mask, int len, float nan) +template +__global__ void nan_kernel(real_t* data, const bool* mask, int len, real_t nan) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= len) return; @@ -139,6 +140,7 @@ __global__ void nan_kernel(float* data, const bool* mask, int len, float nan) } float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } +double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); } void hard_clipped_bernoulli( raft::random::Rng rng, float* d, std::size_t n_vals, float prob_of_zero, cudaStream_t stream) @@ -154,23 +156,24 @@ void hard_clipped_bernoulli( }); } +template struct replace_some_floating_with_categorical { float* fid_num_cats_d; int num_cols; - __device__ float operator()(float data, int data_idx) + __device__ real_t operator()(real_t data, int data_idx) { - float fid_num_cats = fid_num_cats_d[data_idx % num_cols]; - if (fid_num_cats == 0.0f) return data; + real_t fid_num_cats = fid_num_cats_d[data_idx % num_cols]; + if (fid_num_cats == real_t(0)) return data; // Transform `data` from (uniform on) [-1.0, 1.0] into [-fid_num_cats-3, fid_num_cats+3]. - float tmp = data * (fid_num_cats + 3.0f); + real_t tmp = data * (fid_num_cats + real_t(3)); // Also test invalid (negative and above fid_num_cats) categories: samples within // [fid_num_cats+2.5, fid_num_cats+3) and opposite will test infinite floats as categorical. - if (tmp + fid_num_cats < -2.5f) return -INFINITY; - if (tmp - fid_num_cats > +2.5f) return +INFINITY; + if (tmp + fid_num_cats < real_t(-2.5f)) return -INFINITY; + if (tmp - fid_num_cats > real_t(+2.5f)) return +INFINITY; // Samples within [fid_num_cats+2, fid_num_cats+2.5) (and their negative counterparts) will // test huge invalid categories. - if (tmp + fid_num_cats < -2.0f) tmp -= MAX_FIL_INT_FLOAT; - if (tmp - fid_num_cats > +2.0f) tmp += MAX_FIL_INT_FLOAT; + if (tmp + fid_num_cats < real_t(-2.0f)) tmp -= MAX_FIL_INT_FLOAT; + if (tmp - fid_num_cats > real_t(+2.0f)) tmp += MAX_FIL_INT_FLOAT; // Samples within [0, fid_num_cats+2) will be valid categories, rounded towards 0 with a cast. // Negative categories are always invalid. 
For correct interpretation, see // cpp/src/fil/internal.cuh `int category_matches(node_t node, float category)` @@ -178,7 +181,8 @@ struct replace_some_floating_with_categorical { } }; -__global__ void floats_to_bit_stream_k(uint8_t* dst, float* src, std::size_t size) +template +__global__ void floats_to_bit_stream_k(uint8_t* dst, real_t* src, std::size_t size) { std::size_t idx = std::size_t(blockIdx.x) * blockDim.x + threadIdx.x; if (idx >= size) return; @@ -221,6 +225,7 @@ void adjust_threshold_to_treelite( } } +template class BaseFilTest : public testing::TestWithParam { public: BaseFilTest() @@ -252,9 +257,9 @@ class BaseFilTest : public testing::TestWithParam { size_t num_nodes = forest_num_nodes(); // helper data - /// weights, used as float* or int* - rmm::device_uvector weights_d(num_nodes, stream); - rmm::device_uvector thresholds_d(num_nodes, stream); + rmm::device_uvector weights_int_d(num_nodes, stream); + rmm::device_uvector weights_real_d(num_nodes, stream); + rmm::device_uvector thresholds_d(num_nodes, stream); rmm::device_uvector def_lefts_d(num_nodes, stream); rmm::device_uvector is_leafs_d(num_nodes, stream); rmm::device_uvector is_categoricals_d(num_nodes, stream); @@ -266,25 +271,25 @@ class BaseFilTest : public testing::TestWithParam { raft::random::Rng r(ps.seed); if (ps.leaf_algo == fil::leaf_algo_t::CATEGORICAL_LEAF) { // [0..num_classes) - r.uniformInt((int*)weights_d.data(), num_nodes, 0, ps.num_classes, stream); + r.uniformInt(weights_int_d.data(), num_nodes, 0, ps.num_classes, stream); } else if (ps.leaf_algo == fil::leaf_algo_t::VECTOR_LEAF) { std::mt19937 gen(3); - std::uniform_real_distribution<> dist(0, 1); + std::uniform_real_distribution dist(0, 1); vector_leaf.resize(num_nodes * ps.num_classes); for (size_t i = 0; i < vector_leaf.size(); i++) { vector_leaf[i] = dist(gen); } // Normalise probabilities to 1 for (size_t i = 0; i < vector_leaf.size(); i += ps.num_classes) { - auto sum = std::accumulate(&vector_leaf[i], &vector_leaf[i + ps.num_classes], 0.0f); + auto sum = std::accumulate(&vector_leaf[i], &vector_leaf[i + ps.num_classes], real_t(0)); for (size_t j = i; j < i + ps.num_classes; j++) { vector_leaf[j] /= sum; } } } else { - r.uniform((float*)weights_d.data(), num_nodes, -1.0f, 1.0f, stream); + r.uniform(weights_real_d.data(), num_nodes, real_t(-1), real_t(1), stream); } - r.uniform(thresholds_d.data(), num_nodes, -1.0f, 1.0f, stream); + r.uniform(thresholds_d.data(), num_nodes, real_t(-1), real_t(1), stream); r.uniformInt(fids_d.data(), num_nodes, 0, ps.num_cols, stream); r.bernoulli(def_lefts_d.data(), num_nodes, 0.5f, stream); r.bernoulli(is_leafs_d.data(), num_nodes, 1.0f - ps.leaf_prob, stream); @@ -292,8 +297,9 @@ class BaseFilTest : public testing::TestWithParam { r, is_categoricals_d.data(), num_nodes, 1.0f - ps.node_categorical_prob, stream); // copy data to host - std::vector thresholds_h(num_nodes), is_categoricals_h(num_nodes); - std::vector weights_h(num_nodes), fids_h(num_nodes), node_cat_set(num_nodes); + std::vector thresholds_h(num_nodes), weights_real_h(num_nodes); + std::vector is_categoricals_h(num_nodes); + std::vector weights_int_h(num_nodes), fids_h(num_nodes), node_cat_set(num_nodes); std::vector fid_num_cats_h(ps.num_cols); std::vector feature_categorical(ps.num_cols); // bool vectors are not guaranteed to be stored byte-per-value @@ -320,7 +326,8 @@ class BaseFilTest : public testing::TestWithParam { cat_sets_h.fid_num_cats[fid] = 0.0f; } } - raft::update_host(weights_h.data(), (int*)weights_d.data(), num_nodes, stream); 
+ raft::update_host(weights_int_h.data(), weights_int_d.data(), num_nodes, stream); + raft::update_host(weights_real_h.data(), weights_real_d.data(), num_nodes, stream); raft::update_host(thresholds_h.data(), thresholds_d.data(), num_nodes, stream); raft::update_host(fids_h.data(), fids_d.data(), num_nodes, stream); raft::update_host(def_lefts_h, def_lefts_d.data(), num_nodes, stream); @@ -346,7 +353,7 @@ class BaseFilTest : public testing::TestWithParam { if (!feature_categorical[fid] || is_leafs_h[node_id]) is_categoricals_h[node_id] = 0.0f; - if (is_categoricals_h[node_id] == 1.0) { + if (is_categoricals_h[node_id] == 1.0f) { // might allocate a categorical set for an unreachable inner node. That's OK. ++cat_sets_h.n_nodes[fid]; node_cat_set[node_id] = bit_pool_size; @@ -378,27 +385,27 @@ class BaseFilTest : public testing::TestWithParam { // initialize nodes nodes.resize(num_nodes); for (size_t i = 0; i < num_nodes; ++i) { - fil::val_t w; + fil::val_t w; switch (ps.leaf_algo) { - case fil::leaf_algo_t::CATEGORICAL_LEAF: w.idx = weights_h[i]; break; + case fil::leaf_algo_t::CATEGORICAL_LEAF: w.idx = weights_int_h[i]; break; case fil::leaf_algo_t::FLOAT_UNARY_BINARY: case fil::leaf_algo_t::GROVE_PER_CLASS: // not relying on fil::val_t internals // merely that we copied floats into weights_h earlier - std::memcpy(&w.f, &weights_h[i], sizeof w.f); + w.f = weights_real_h[i]; break; case fil::leaf_algo_t::VECTOR_LEAF: w.idx = i; break; default: ASSERT(false, "internal error: invalid ps.leaf_algo"); } // make sure nodes are categorical only when their feature ID is categorical bool is_categorical = is_categoricals_h[i] == 1.0f; - val_t split; + val_t split; if (is_categorical) split.idx = node_cat_set[i]; else split.f = thresholds_h[i]; nodes[i] = - fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical); + fil::dense_node(w, split, fids_h[i], def_lefts_h[i], is_leafs_h[i], is_categorical); } // clean up @@ -416,17 +423,18 @@ class BaseFilTest : public testing::TestWithParam { // generate random data raft::random::Rng r(ps.seed); - r.uniform(data_d.data(), num_data, -1.0f, 1.0f, stream); - thrust::transform(thrust::cuda::par.on(stream), - data_d.data(), - data_d.data() + num_data, - thrust::counting_iterator(0), - data_d.data(), - replace_some_floating_with_categorical{fid_num_cats_d.data(), ps.num_cols}); + r.uniform(data_d.data(), num_data, real_t(-1), real_t(1), stream); + thrust::transform( + thrust::cuda::par.on(stream), + data_d.data(), + data_d.data() + num_data, + thrust::counting_iterator(0), + data_d.data(), + replace_some_floating_with_categorical{fid_num_cats_d.data(), ps.num_cols}); r.bernoulli(mask_d.data(), num_data, ps.nan_prob, stream); int tpb = 256; nan_kernel<<>>( - data_d.data(), mask_d.data(), num_data, std::numeric_limits::quiet_NaN()); + data_d.data(), mask_d.data(), num_data, std::numeric_limits::quiet_NaN()); RAFT_CUDA_TRY(cudaPeekAtLastError()); // copy to host @@ -435,48 +443,48 @@ class BaseFilTest : public testing::TestWithParam { handle.sync_stream(); } - void apply_softmax(float* class_scores) + void apply_softmax(real_t* class_scores) { - float max = *std::max_element(class_scores, &class_scores[ps.num_classes]); + real_t max = *std::max_element(class_scores, &class_scores[ps.num_classes]); for (int i = 0; i < ps.num_classes; ++i) - class_scores[i] = expf(class_scores[i] - max); - float sum = std::accumulate(class_scores, &class_scores[ps.num_classes], 0.0f); + class_scores[i] = exp(class_scores[i] - max); + real_t sum = 
std::accumulate(class_scores, &class_scores[ps.num_classes], real_t(0)); for (int i = 0; i < ps.num_classes; ++i) class_scores[i] /= sum; } - void transform(float f, float& proba, float& output) + void transform(real_t f, real_t& proba, real_t& output) { if ((ps.output & fil::output_t::AVG) != 0) { if (ps.leaf_algo == fil::leaf_algo_t::GROVE_PER_CLASS) { f /= ps.num_trees / ps.num_classes; } else { - f *= 1.0f / ps.num_trees; + f *= real_t(1) / ps.num_trees; } } f += ps.global_bias; if ((ps.output & fil::output_t::SIGMOID) != 0) { f = sigmoid(f); } proba = f; - if ((ps.output & fil::output_t::CLASS) != 0) { f = f > ps.threshold ? 1.0f : 0.0f; } + if ((ps.output & fil::output_t::CLASS) != 0) { f = f > ps.threshold ? real_t(1) : real_t(0); } output = f; } - void complement(float* proba) { proba[0] = 1.0f - proba[1]; } + void complement(real_t* proba) { proba[0] = real_t(1) - proba[1]; } void predict_on_cpu() { auto stream = handle.get_stream(); // predict on host - std::vector want_preds_h(ps.num_preds_outputs()); + std::vector want_preds_h(ps.num_preds_outputs()); want_proba_h.resize(ps.num_proba_outputs()); int num_nodes = tree_num_nodes(); - std::vector class_scores(ps.num_classes); + std::vector class_scores(ps.num_classes); // we use tree_base::child_index() on CPU tree_base base{cat_sets_h.accessor()}; switch (ps.leaf_algo) { case fil::leaf_algo_t::FLOAT_UNARY_BINARY: for (int i = 0; i < ps.num_rows; ++i) { - float pred = 0.0f; + real_t pred = 0; for (int j = 0; j < ps.num_trees; ++j) { pred += infer_one_tree(&nodes[j * num_nodes], &data_h[i * ps.num_cols], base).f; } @@ -486,7 +494,7 @@ class BaseFilTest : public testing::TestWithParam { break; case fil::leaf_algo_t::GROVE_PER_CLASS: for (int row = 0; row < ps.num_rows; ++row) { - std::fill(class_scores.begin(), class_scores.end(), 0.0f); + std::fill(class_scores.begin(), class_scores.end(), real_t(0)); for (int tree = 0; tree < ps.num_trees; ++tree) { class_scores[tree % ps.num_classes] += infer_one_tree(&nodes[tree * num_nodes], &data_h[row * ps.num_cols], base).f; @@ -494,7 +502,7 @@ class BaseFilTest : public testing::TestWithParam { want_preds_h[row] = std::max_element(class_scores.begin(), class_scores.end()) - class_scores.begin(); for (int c = 0; c < ps.num_classes; ++c) { - float thresholded_proba; // not used; + real_t thresholded_proba; // not used; transform(class_scores[c], want_proba_h[row * ps.num_classes + c], thresholded_proba); } if ((ps.output & fil::output_t::SOFTMAX) != 0) @@ -511,7 +519,7 @@ class BaseFilTest : public testing::TestWithParam { ++class_votes[class_label]; } for (int c = 0; c < ps.num_classes; ++c) { - float thresholded_proba; // not used; do argmax instead + real_t thresholded_proba; // not used; do argmax instead transform(class_votes[c], want_proba_h[r * ps.num_classes + c], thresholded_proba); } want_preds_h[r] = @@ -521,16 +529,16 @@ class BaseFilTest : public testing::TestWithParam { } case fil::leaf_algo_t::VECTOR_LEAF: for (int r = 0; r < ps.num_rows; ++r) { - std::vector class_probabilities(ps.num_classes); + std::vector class_probabilities(ps.num_classes); for (int j = 0; j < ps.num_trees; ++j) { int vector_index = infer_one_tree(&nodes[j * num_nodes], &data_h[r * ps.num_cols], base).idx; - float sum = 0.0; + real_t sum = 0; for (int k = 0; k < ps.num_classes; k++) { class_probabilities[k] += vector_leaf[vector_index * ps.num_classes + k]; sum += vector_leaf[vector_index * ps.num_classes + k]; } - ASSERT_LE(std::abs(sum - 1.0f), 1e-5); + ASSERT_LE(std::abs(sum - real_t(1)), real_t(1e-5)); 
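          // note (added comment): each leaf's probability vector was normalised to
          // sum to 1 when the forest was generated, so drift beyond rounding error
          // here points at indexing the wrong leaf vector rather than at numerics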
} for (int c = 0; c < ps.num_classes; ++c) { @@ -564,8 +572,9 @@ class BaseFilTest : public testing::TestWithParam { // predict preds_d.resize(ps.num_preds_outputs(), stream); proba_d.resize(ps.num_proba_outputs(), stream); - fil::predict(handle, forest, preds_d.data(), data_d.data(), ps.num_rows); - fil::predict(handle, forest, proba_d.data(), data_d.data(), ps.num_rows, true); + fil::predict(handle, forest, (void*)preds_d.data(), (const void*)data_d.data(), ps.num_rows); + fil::predict( + handle, forest, (void*)proba_d.data(), (const void*)data_d.data(), ps.num_rows, true); handle.sync_stream(); // cleanup @@ -577,29 +586,31 @@ class BaseFilTest : public testing::TestWithParam { ASSERT_TRUE(raft::devArrMatch(want_proba_d.data(), proba_d.data(), ps.num_proba_outputs(), - raft::CompareApprox(ps.tolerance), + raft::CompareApprox(ps.tolerance), stream)); float tolerance = ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY ? ps.tolerance - : std::numeric_limits::epsilon(); + : std::numeric_limits::epsilon(); // in multi-class prediction, floats represent the most likely class // and would be generated by converting an int to float ASSERT_TRUE(raft::devArrMatch(want_preds_d.data(), preds_d.data(), ps.num_rows, - raft::CompareApprox(tolerance), + raft::CompareApprox(tolerance), stream)); } - fil::val_t infer_one_tree(fil::dense_node* root, float* data, const tree_base& tree) + fil::val_t infer_one_tree(fil::dense_node* root, + real_t* data, + const tree_base& tree) { int curr = 0; - fil::val_t output{.f = 0.0f}; + fil::val_t output{.f = 0.0f}; for (;;) { - const fil::dense_node& node = root[curr]; - if (node.is_leaf()) return node.template output>(); - float val = data[node.fid()]; - curr = tree.child_index(node, curr, val); + const fil::dense_node& node = root[curr]; + if (node.is_leaf()) return node.template output>(); + real_t val = data[node.fid()]; + curr = tree.child_index(node, curr, val); } return output; } @@ -614,26 +625,26 @@ class BaseFilTest : public testing::TestWithParam { cudaStream_t stream = 0; // predictions - rmm::device_uvector preds_d; - rmm::device_uvector proba_d; - rmm::device_uvector want_preds_d; - rmm::device_uvector want_proba_d; + rmm::device_uvector preds_d; + rmm::device_uvector proba_d; + rmm::device_uvector want_preds_d; + rmm::device_uvector want_proba_d; // input data - rmm::device_uvector data_d; - std::vector data_h; - std::vector want_proba_h; + rmm::device_uvector data_d; + std::vector data_h; + std::vector want_proba_h; // forest data - std::vector> nodes; - std::vector vector_leaf; + std::vector> nodes; + std::vector vector_leaf; cat_sets_owner cat_sets_h; rmm::device_uvector fids_d = rmm::device_uvector(0, cudaStream_t()); rmm::device_uvector fid_num_cats_d = rmm::device_uvector(0, cudaStream_t()); }; template -class BasePredictFilTest : public BaseFilTest { +class BasePredictFilTest : public BaseFilTest { protected: void dense2sparse_node(const fil::dense_node* dense_root, int i_dense, @@ -673,8 +684,8 @@ class BasePredictFilTest : public BaseFilTest { void dense2sparse() { - for (int tree = 0; tree < ps.num_trees; ++tree) { - dense2sparse_tree(&nodes[tree * tree_num_nodes()]); + for (int tree = 0; tree < this->ps.num_trees; ++tree) { + dense2sparse_tree(&this->nodes[tree * this->tree_num_nodes()]); } } @@ -686,31 +697,31 @@ class BasePredictFilTest : public BaseFilTest { dense2sparse(); init_nodes = sparse_nodes; } else { - init_nodes = nodes; + init_nodes = this->nodes; } ASSERT(init_nodes.size() < std::size_t(INT_MAX), "generated too many 
nodes"); // init FIL model fil::forest_params_t fil_params = { .num_nodes = static_cast(init_nodes.size()), - .depth = ps.depth, - .num_trees = ps.num_trees, - .num_cols = ps.num_cols, - .leaf_algo = ps.leaf_algo, - .algo = ps.algo, - .output = ps.output, - .threshold = ps.threshold, - .global_bias = ps.global_bias, - .num_classes = ps.num_classes, - .blocks_per_sm = ps.blocks_per_sm, - .threads_per_tree = ps.threads_per_tree, - .n_items = ps.n_items, + .depth = this->ps.depth, + .num_trees = this->ps.num_trees, + .num_cols = this->ps.num_cols, + .leaf_algo = this->ps.leaf_algo, + .algo = this->ps.algo, + .output = this->ps.output, + .threshold = this->ps.threshold, + .global_bias = this->ps.global_bias, + .num_classes = this->ps.num_classes, + .blocks_per_sm = this->ps.blocks_per_sm, + .threads_per_tree = this->ps.threads_per_tree, + .n_items = this->ps.n_items, }; - fil::init(handle, + fil::init(this->handle, pforest, - cat_sets_h.accessor(), - vector_leaf, + this->cat_sets_h.accessor(), + this->vector_leaf, trees.data(), init_nodes.data(), &fil_params); @@ -723,7 +734,7 @@ typedef BasePredictFilTest> PredictDenseFilTest; typedef BasePredictFilTest> PredictSparse16FilTest; typedef BasePredictFilTest PredictSparse8FilTest; -class TreeliteFilTest : public BaseFilTest { +class TreeliteFilTest : public BaseFilTest { protected: /** adds nodes[node] of tree starting at index root to builder at index at *pkey, increments *pkey, From 05de38dd89a7cdff0fdaf851365081692ee5ba9b Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Fri, 18 Mar 2022 22:42:30 +0100 Subject: [PATCH 30/45] Templatized child_index tests, added float64-only tests. --- cpp/test/sg/fil_child_index_test.cu | 135 +++++++++++++++++----------- 1 file changed, 81 insertions(+), 54 deletions(-) diff --git a/cpp/test/sg/fil_child_index_test.cu b/cpp/test/sg/fil_child_index_test.cu index 75eca6cba2..3425c51190 100644 --- a/cpp/test/sg/fil_child_index_test.cu +++ b/cpp/test/sg/fil_child_index_test.cu @@ -38,28 +38,39 @@ struct proto_inner_node { bool is_categorical = false; // see base_node::is_categorical int fid = 0; // feature id, see base_node::fid int set = 0; // which bit set represents the matching category list - float thresh = 0.0f; // threshold, see base_node::thresh + double thresh = 0.0; // threshold, see base_node::thresh int left = 1; // left child idx, see sparse_node*::left_index() - val_t split() + template + val_t split() { - val_t split; + val_t split; if (is_categorical) split.idx = set; + else if (std::isnan(thresh)) + split.f = std::numeric_limits::quiet_NaN(); else - split.f = thresh; + split.f = static_cast(thresh); return split; } + operator dense_node() + { + return dense_node({}, split(), fid, def_left, false, is_categorical); + } + operator dense_node() + { + return dense_node({}, split(), fid, def_left, false, is_categorical); + } operator sparse_node16() { - return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); + return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); } - operator sparse_node8() + operator sparse_node16() { - return sparse_node8({}, split(), fid, def_left, false, is_categorical, left); + return sparse_node16({}, split(), fid, def_left, false, is_categorical, left); } - operator dense_node() + operator sparse_node8() { - return dense_node({}, split(), fid, def_left, false, is_categorical); + return sparse_node8({}, split(), fid, def_left, false, is_categorical, left); } }; @@ -108,8 +119,9 @@ struct ChildIndexTestParams { proto_inner_node node; 
  int parent_node_idx = 0;
  cat_sets_owner cso;
-  float input = 0.0f;
-  int correct = INT_MAX;
+  double input  = 0.0;
+  int correct   = INT_MAX;
+  bool skip_f32 = false;  // if true, the test only runs for float64
 };

 std::ostream& operator<<(std::ostream& os, const ChildIndexTestParams& ps)
@@ -136,29 +148,36 @@ std::ostream& operator<<(std::ostream& os, const ChildIndexTestParams& ps)
 template <typename fil_node_t>
 class ChildIndexTest : public testing::TestWithParam<ChildIndexTestParams> {
+  using real_t = typename fil_node_t::real_t;
+
 protected:
  void check()
  {
    ChildIndexTestParams param = GetParam();
+
+   // skip tests that require float64 to work correctly
+   if (std::is_same_v<real_t, float> && param.skip_f32) return;
+
    tree_base tree{param.cso.accessor()};
-   if (!std::is_same<fil_node_t, fil::dense_node<float>>::value) {
+   if (!std::is_same_v<fil_node_t, fil::dense_node<real_t>>) {
      // test that the logic uses node.left instead of parent_node_idx
      param.node.left       = param.parent_node_idx * 2 + 1;
      param.parent_node_idx = INT_MIN;
    }
+   real_t input = isnan(param.input) ? std::numeric_limits<real_t>::quiet_NaN()
+                                     : static_cast<real_t>(param.input);
    // nan -> !def_left, categorical -> if matches, numerical -> input >= threshold
-   int test_idx =
-     tree.child_index<true>((fil_node_t)param.node, param.parent_node_idx, param.input);
-   ASSERT(test_idx == param.correct,
-          "child index test: actual %d != correct %d",
-          test_idx,
-          param.correct);
+   int test_idx = tree.child_index<true>((fil_node_t)param.node, param.parent_node_idx, input);
+   ASSERT_EQ(test_idx, param.correct)
+     << "child index test: actual " << test_idx << " != correct " << param.correct;
  }
};

-typedef ChildIndexTest<fil::dense_node<float>> ChildIndexTestDense;
-typedef ChildIndexTest<fil::sparse_node16<float>> ChildIndexTestSparse16;
-typedef ChildIndexTest<fil::sparse_node8> ChildIndexTestSparse8;
+using ChildIndexTestDenseFloat32    = ChildIndexTest<fil::dense_node<float>>;
+using ChildIndexTestDenseFloat64    = ChildIndexTest<fil::dense_node<double>>;
+using ChildIndexTestSparse16Float32 = ChildIndexTest<fil::sparse_node16<float>>;
+using ChildIndexTestSparse16Float64 = ChildIndexTest<fil::sparse_node16<double>>;
+using ChildIndexTestSparse8         = ChildIndexTest<fil::sparse_node8>;

/* for dense nodes, left (false) == parent * 2 + 1, right (true) == parent * 2 + 2 E.g.
see tree below: @@ -168,48 +187,52 @@ typedef ChildIndexTest ChildIndexTestSparse8; 3 -> 7, 8 4 -> 9, 10 */ -const float INF = std::numeric_limits::infinity(); +const double INF = std::numeric_limits::infinity(); +const double QNAN = std::numeric_limits::quiet_NaN(); std::vector params = { - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = -INF, correct = 1), // val !>= thresh - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = 0.0f, correct = 2), // val >= thresh - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = +INF, correct = 2), // val >= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0), input = -INF, correct = 1), // val !>= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0), input = 0.0, correct = 2), // val >= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0), input = +INF, correct = 2), // val >= thresh + // the following two tests only work for float64 + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0), input = -1e-50, correct = 1, skip_f32 = true), + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 1e-50), input = 0.0, correct = 1, skip_f32 = true), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 1.0f), input = -3.141592f, correct = 1), // val !>= thresh - CHILD_INDEX_TEST_PARAMS( // val >= thresh (e**pi > pi**e) - node = NODE(thresh = 22.459158f), - input = 23.140693f, + node = NODE(thresh = 1.0), input = -3.141592, correct = 1), // val !>= thresh + CHILD_INDEX_TEST_PARAMS( // val >= thresh (e**pi > pi**e) + node = NODE(thresh = 22.459158), + input = 23.140693, correct = 2), CHILD_INDEX_TEST_PARAMS( // val >= thresh for both negative - node = NODE(thresh = -0.37f), - input = -0.36f, - correct = 2), // val >= thresh - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = -INF), input = 0.36f, correct = 2), // val >= thresh - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = NAN, correct = 2), // !def_left - CHILD_INDEX_TEST_PARAMS(node = NODE(def_left = true), input = NAN, correct = 1), // !def_left - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = NAN), input = NAN, correct = 2), // !def_left + node = NODE(thresh = -0.37), + input = -0.36, + correct = 2), // val >= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = -INF), input = 0.36, correct = 2), // val >= thresh + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = 0.0f), input = QNAN, correct = 2), // !def_left + CHILD_INDEX_TEST_PARAMS(node = NODE(def_left = true), input = QNAN, correct = 1), // !def_left + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = QNAN), input = QNAN, correct = 2), // !def_left CHILD_INDEX_TEST_PARAMS( - node = NODE(def_left = true, thresh = NAN), input = NAN, correct = 1), // !def_left - CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = NAN), input = 0.0f, correct = 1), // val !>= thresh + node = NODE(def_left = true, thresh = QNAN), input = QNAN, correct = 1), // !def_left + CHILD_INDEX_TEST_PARAMS(node = NODE(thresh = QNAN), input = 0.0, correct = 1), // val !>= thresh CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 1, input = -INF, correct = 3), + node = NODE(thresh = 0.0), parent_node_idx = 1, input = -INF, correct = 3), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 1, input = 0.0f, correct = 4), + node = NODE(thresh = 0.0), parent_node_idx = 1, input = 0.0f, correct = 4), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 2, input = -INF, correct = 5), + node = NODE(thresh = 0.0), parent_node_idx = 2, input = -INF, correct = 5), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 
0.0f), parent_node_idx = 2, input = 0.0f, correct = 6), + node = NODE(thresh = 0.0), parent_node_idx = 2, input = 0.0f, correct = 6), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 3, input = -INF, correct = 7), + node = NODE(thresh = 0.0), parent_node_idx = 3, input = -INF, correct = 7), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 3, input = 0.0f, correct = 8), + node = NODE(thresh = 0.0), parent_node_idx = 3, input = 0.0f, correct = 8), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 4, input = -INF, correct = 9), + node = NODE(thresh = 0.0), parent_node_idx = 4, input = -INF, correct = 9), CHILD_INDEX_TEST_PARAMS( - node = NODE(thresh = 0.0f), parent_node_idx = 4, input = 0.0f, correct = 10), - CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = NAN, correct = 10), // !def_left + node = NODE(thresh = 0.0), parent_node_idx = 4, input = 0.0, correct = 10), + CHILD_INDEX_TEST_PARAMS(parent_node_idx = 4, input = QNAN, correct = 10), // !def_left CHILD_INDEX_TEST_PARAMS( - node = NODE(def_left = true), input = NAN, parent_node_idx = 4, correct = 9), // !def_left + node = NODE(def_left = true), input = QNAN, parent_node_idx = 4, correct = 9), // !def_left // cannot match ( < 0 and realistic fid_num_cats) CHILD_INDEX_TEST_PARAMS(node = NODE(is_categorical = true), cso.bits = {}, @@ -282,21 +305,25 @@ std::vector params = { CHILD_INDEX_TEST_PARAMS(node = NODE(is_categorical = true, def_left = true), cso.bits = {0b0000'0101}, cso.fid_num_cats = {3.0f}, - input = NAN, + input = QNAN, correct = 1), // default right CHILD_INDEX_TEST_PARAMS(node = NODE(is_categorical = true, def_left = false), cso.bits = {0b0000'0101}, cso.fid_num_cats = {3.0f}, - input = NAN, + input = QNAN, correct = 2), }; -TEST_P(ChildIndexTestDense, Predict) { check(); } -TEST_P(ChildIndexTestSparse16, Predict) { check(); } +TEST_P(ChildIndexTestDenseFloat32, Predict) { check(); } +TEST_P(ChildIndexTestDenseFloat64, Predict) { check(); } +TEST_P(ChildIndexTestSparse16Float32, Predict) { check(); } +TEST_P(ChildIndexTestSparse16Float64, Predict) { check(); } TEST_P(ChildIndexTestSparse8, Predict) { check(); } -INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestDense, testing::ValuesIn(params)); -INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse16, testing::ValuesIn(params)); +INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestDenseFloat32, testing::ValuesIn(params)); +INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestDenseFloat64, testing::ValuesIn(params)); +INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse16Float32, testing::ValuesIn(params)); +INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse16Float64, testing::ValuesIn(params)); INSTANTIATE_TEST_CASE_P(FilTests, ChildIndexTestSparse8, testing::ValuesIn(params)); } // namespace ML From 316e99a886bbefd6f650e8fb502c4d11fd9ba70b Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Sat, 19 Mar 2022 01:45:58 +0100 Subject: [PATCH 31/45] float64 versions of multi-sum and FIL predict tests. 
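multi_sum is now templatized on the element type T in addition to the radix R, so the same block-level reduction serves float32 and float64 forests. As a reference for the layout it assumes, here is a host-side sketch (illustrative only, not code from this patch) of the sum being computed: values to be added into the same output slot are strided by n_groups, per the comment in multi_sum.cuh.

    // out[g] accumulates the n_values elements of group g,
    // which sit at in[g + i * n_groups] for i in [0, n_values)
    template <typename T>
    void multi_sum_reference(const T* in, T* out, int n_groups, int n_values)
    {
      for (int g = 0; g < n_groups; ++g) {
        T acc = T();
        for (int i = 0; i < n_values; ++i)
          acc += in[g + i * n_groups];
        out[g] = acc;
      }
    }

This is intended to mirror what serial_multi_sum checks against in multi_sum_test.cu.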
--- cpp/include/cuml/fil/multi_sum.cuh | 2 +- cpp/test/sg/fil_test.cu | 46 ++++++++++++++++++-------- cpp/test/sg/multi_sum_test.cu | 53 +++++++++++++++++++++--------- 3 files changed, 71 insertions(+), 30 deletions(-) diff --git a/cpp/include/cuml/fil/multi_sum.cuh b/cpp/include/cuml/fil/multi_sum.cuh index 818ae225ec..d5d08766b9 100644 --- a/cpp/include/cuml/fil/multi_sum.cuh +++ b/cpp/include/cuml/fil/multi_sum.cuh @@ -34,7 +34,7 @@ @data[] values are ordered such that the stride is 1 for values belonging to the same group and @n_groups for values that are to be added together */ -template +template __device__ T multi_sum(T* data, int n_groups, int n_values) { T acc = threadIdx.x < n_groups * n_values ? data[threadIdx.x] : T(); diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 893c6bf92a..1531c1be74 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -645,17 +645,24 @@ class BaseFilTest : public testing::TestWithParam { template class BasePredictFilTest : public BaseFilTest { + using real_t = typename fil_node_t::real_t; + protected: - void dense2sparse_node(const fil::dense_node* dense_root, + void dense2sparse_node(const fil::dense_node* dense_root, int i_dense, int i_sparse_root, int i_sparse) { - const fil::dense_node& node = dense_root[i_dense]; + const fil::dense_node& node = dense_root[i_dense]; if (node.is_leaf()) { // leaf sparse node - sparse_nodes[i_sparse] = fil_node_t( - node.output>(), {}, node.fid(), node.def_left(), node.is_leaf(), false, 0); + sparse_nodes[i_sparse] = fil_node_t(node.template output>(), + {}, + node.fid(), + node.def_left(), + node.is_leaf(), + false, + 0); return; } // inner sparse node @@ -674,7 +681,7 @@ class BasePredictFilTest : public BaseFilTest { dense2sparse_node(dense_root, 2 * i_dense + 2, i_sparse_root, left_index + 1); } - void dense2sparse_tree(const fil::dense_node* dense_root) + void dense2sparse_tree(const fil::dense_node* dense_root) { int i_sparse_root = sparse_nodes.size(); sparse_nodes.push_back(fil_node_t()); @@ -730,9 +737,11 @@ class BasePredictFilTest : public BaseFilTest { std::vector trees; }; -typedef BasePredictFilTest> PredictDenseFilTest; -typedef BasePredictFilTest> PredictSparse16FilTest; -typedef BasePredictFilTest PredictSparse8FilTest; +using PredictDenseFloat32FilTest = BasePredictFilTest>; +using PredictDenseFloat64FilTest = BasePredictFilTest>; +using PredictSparse16Float32FilTest = BasePredictFilTest>; +using PredictSparse16Float64FilTest = BasePredictFilTest>; +using PredictSparse8FilTest = BasePredictFilTest; class TreeliteFilTest : public BaseFilTest { protected: @@ -1097,9 +1106,15 @@ std::vector predict_dense_inputs = { max_magnitude_of_matching_cat = 5), }; -TEST_P(PredictDenseFilTest, Predict) { compare(); } +TEST_P(PredictDenseFloat32FilTest, Predict) { compare(); } +TEST_P(PredictDenseFloat64FilTest, Predict) { compare(); } -INSTANTIATE_TEST_CASE_P(FilTests, PredictDenseFilTest, testing::ValuesIn(predict_dense_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + PredictDenseFloat32FilTest, + testing::ValuesIn(predict_dense_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + PredictDenseFloat64FilTest, + testing::ValuesIn(predict_dense_inputs)); std::vector predict_sparse_inputs = { FIL_TEST_PARAMS(), @@ -1175,10 +1190,15 @@ std::vector predict_sparse_inputs = { max_magnitude_of_matching_cat = 5), }; -TEST_P(PredictSparse16FilTest, Predict) { compare(); } +TEST_P(PredictSparse16Float32FilTest, Predict) { compare(); } +TEST_P(PredictSparse16Float64FilTest, Predict) { compare(); } 
-// Temporarily disabled, see https://github.com/rapidsai/cuml/issues/3205 -INSTANTIATE_TEST_CASE_P(FilTests, PredictSparse16FilTest, testing::ValuesIn(predict_sparse_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + PredictSparse16Float32FilTest, + testing::ValuesIn(predict_sparse_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + PredictSparse16Float64FilTest, + testing::ValuesIn(predict_sparse_inputs)); TEST_P(PredictSparse8FilTest, Predict) { compare(); } diff --git a/cpp/test/sg/multi_sum_test.cu b/cpp/test/sg/multi_sum_test.cu index 6d960b8a4d..5adb18a1fc 100644 --- a/cpp/test/sg/multi_sum_test.cu +++ b/cpp/test/sg/multi_sum_test.cu @@ -44,7 +44,7 @@ __device__ void serial_multi_sum(const T* in, T* out, int n_groups, int n_values } // the most threads a block can have -const int max_threads = 1024; +const int MAX_THREADS = 1024; struct MultiSumTestParams { int radix; // number of elements summed to 1 at each stage of the sum @@ -52,14 +52,22 @@ struct MultiSumTestParams { int n_values; // number of elements to add in each sum }; +template +struct multi_sum_test_shmem { + T work[MAX_THREADS]; + T correct_result[MAX_THREADS]; +}; + template -__device__ void test_single_radix(T thread_value, MultiSumTestParams p, int* block_error_flag) +__device__ void test_single_radix(multi_sum_test_shmem& s, + T thread_value, + MultiSumTestParams p, + int* block_error_flag) { - __shared__ T work[max_threads], correct_result[max_threads]; - work[threadIdx.x] = thread_value; - serial_multi_sum(work, correct_result, p.n_groups, p.n_values); - T sum = multi_sum(work, p.n_groups, p.n_values); - if (threadIdx.x < p.n_groups && 1e-4 < fabsf(sum - correct_result[threadIdx.x])) { + s.work[threadIdx.x] = thread_value; + serial_multi_sum(s.work, s.correct_result, p.n_groups, p.n_values); + T sum = multi_sum(s.work, p.n_groups, p.n_values); + if (threadIdx.x < p.n_groups && 1e-4 < fabsf(sum - s.correct_result[threadIdx.x])) { atomicAdd(block_error_flag, 1); } } @@ -67,13 +75,14 @@ __device__ void test_single_radix(T thread_value, MultiSumTestParams p, int* blo template __global__ void test_multi_sum_k(T* data, MultiSumTestParams* params, int* error_flags) { + __shared__ multi_sum_test_shmem s; MultiSumTestParams p = params[blockIdx.x]; switch (p.radix) { - case 2: test_single_radix<2>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; - case 3: test_single_radix<3>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; - case 4: test_single_radix<4>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; - case 5: test_single_radix<5>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; - case 6: test_single_radix<6>(data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 2: test_single_radix<2>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 3: test_single_radix<3>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 4: test_single_radix<4>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 5: test_single_radix<5>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; + case 6: test_single_radix<6>(s, data[threadIdx.x], p, &error_flags[blockIdx.x]); break; } } @@ -142,12 +151,12 @@ std::vector block_sizes = []() { std::vector res; for (int i = 2; i < 50; ++i) res.push_back(i); - for (int i = max_threads - 50; i <= max_threads; ++i) + for (int i = MAX_THREADS - 50; i <= MAX_THREADS; ++i) res.push_back(i); return res; }(); -class MultiSumTestFloat : public MultiSumTest { +class MultiSumTestFloat32 : public MultiSumTest { public: void generate_data() { @@ 
-155,8 +164,20 @@ class MultiSumTestFloat : public MultiSumTest { r.uniform(data_d.data().get(), data_d.size(), -1.0f, 1.0f, cudaStreamDefault); } }; -TEST_P(MultiSumTestFloat, Import) { check(); } -INSTANTIATE_TEST_CASE_P(FilTests, MultiSumTestFloat, testing::ValuesIn(block_sizes)); +TEST_P(MultiSumTestFloat32, Import) { check(); } +INSTANTIATE_TEST_CASE_P(FilTests, MultiSumTestFloat32, testing::ValuesIn(block_sizes)); + +class MultiSumTestFloat64 : public MultiSumTest { + public: + void generate_data() + { + raft::random::Rng r(4321); + r.uniform(data_d.data().get(), data_d.size(), -1.0, 1.0, cudaStreamDefault); + } +}; + +TEST_P(MultiSumTestFloat64, Import) { check(); } +INSTANTIATE_TEST_CASE_P(FilTests, MultiSumTestFloat64, testing::ValuesIn(block_sizes)); class MultiSumTestInt : public MultiSumTest { public: From 2ba5eed15da51f0ef43729ca1b9334c6f6a48201 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Sat, 19 Mar 2022 03:23:51 +0100 Subject: [PATCH 32/45] compute_smem_footprint() uses float or double, based on sizeof_real. --- cpp/src/fil/infer.cu | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 233b1bec5d..ccc09edfda 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -863,7 +863,20 @@ size_t shmem_size_params::get_smem_footprint() template int compute_smem_footprint::run(predict_params ssp) { - return ssp.template get_smem_footprint(); + switch (ssp.sizeof_real) { + case 4: + return ssp + .template get_smem_footprint(); + case 8: + return ssp + .template get_smem_footprint(); + default: + ASSERT(ssp.sizeof_real == 4 || ssp.sizeof_real == 8, + "internal error: sizeof_real == %d, but must be 4 or 8", + static_cast(ssp.sizeof_real)); + // unreachable + return 0; + } } // make sure to instantiate all possible get_smem_footprint instantiations From ac92be785e788baf6dd88d8df8f41ac1593f3e9e Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 22 Mar 2022 13:36:19 +0100 Subject: [PATCH 33/45] Removed stray static_asserts. --- cpp/src/fil/fil.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index d1dc8cdc25..43ec9fcad4 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -379,8 +379,6 @@ struct opt_into_arch_dependent_shmem : dispatch_functor { template > void run(predict_params p) { - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); auto kernel = infer_k::IS_DENSE); using forest_type = typename node_traits::forest; forest_type* f = new forest_type(h); - static_assert(std::is_same_v, - "real_t must be float; to be removed in the following pull requests"); f->init(h, cat_sets, vector_leaf, trees, nodes, params); *pf = f; } From 92c44af6f32c76946056fab457a000216625f9ab Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 4 Apr 2022 20:14:59 +0200 Subject: [PATCH 34/45] Finish merge. 
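The conflict in common.cuh is resolved in favour of the branch-22.06 spelling: the member alias is real_type, not real_t_. Stripped of FIL specifics, the pattern that survives the merge is a partial specialization keyed on the node type, with an alias that lets generic code recover the floating-point type (self-contained illustration with stand-in names, not code from this patch):

    #include <type_traits>

    template <typename real_t> struct dense_node {};  // stand-in for FIL's dense node
    template <typename node_t> struct tree;           // primary template, left undefined

    template <typename real_t>
    struct tree<dense_node<real_t>> {  // the specialization the merge keeps
      using real_type = real_t;        // alias read back by generic code
    };

    static_assert(std::is_same_v<tree<dense_node<double>>::real_type, double>);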
--- cpp/src/fil/common.cuh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh index 3f31db199e..c4877216a8 100644 --- a/cpp/src/fil/common.cuh +++ b/cpp/src/fil/common.cuh @@ -46,15 +46,9 @@ struct storage_base { }; /** represents a dense tree */ -<<<<<<< HEAD -template -struct tree> : tree_base { - using real_t = real_t_; -======= template struct tree> : tree_base { using real_type = real_t; ->>>>>>> branch-22.06 __host__ __device__ tree(categorical_sets cat_sets, dense_node* nodes, int node_pitch) : tree_base{cat_sets}, nodes_(nodes), node_pitch_(node_pitch) { From 938e02a846ed1a111b6fa0ba0b951e27c560c19a Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 4 Apr 2022 21:13:00 +0200 Subject: [PATCH 35/45] Fixed compilation errors. --- cpp/test/sg/fil_test.cu | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index d08aaa22a1..a3cc8ed75d 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -162,7 +162,7 @@ struct replace_some_floating_with_categorical { int num_cols; __device__ real_t operator()(real_t data, int data_idx) { - real_t fid_num_cats = fid_num_cats_d[data_idx % num_cols]; + real_t fid_num_cats = static_cast(fid_num_cats_d[data_idx % num_cols]); if (fid_num_cats == real_t(0)) return data; // Transform `data` from (uniform on) [-1.0, 1.0] into [-fid_num_cats-3, fid_num_cats+3]. real_t tmp = data * (fid_num_cats + real_t(3)); @@ -561,20 +561,19 @@ class BaseFilTest : public testing::TestWithParam { handle.sync_stream(); } - virtual void init_forest(fil::forest_t* pforest) = 0; + virtual void init_forest(fil::forest_t* pforest) = 0; void predict_on_gpu() { - auto stream = handle.get_stream(); - fil::forest_t forest = nullptr; + auto stream = handle.get_stream(); + fil::forest_t forest = nullptr; init_forest(&forest); // predict preds_d.resize(ps.num_preds_outputs(), stream); proba_d.resize(ps.num_proba_outputs(), stream); - fil::predict(handle, forest, (void*)preds_d.data(), (const void*)data_d.data(), ps.num_rows); - fil::predict( - handle, forest, (void*)proba_d.data(), (const void*)data_d.data(), ps.num_rows, true); + fil::predict(handle, forest, preds_d.data(), data_d.data(), ps.num_rows); + fil::predict(handle, forest, proba_d.data(), data_d.data(), ps.num_rows, true); handle.sync_stream(); // cleanup @@ -644,8 +643,8 @@ class BaseFilTest : public testing::TestWithParam { }; template -class BasePredictFilTest : public BaseFilTest { - using real_t = typename fil_node_t::real_t; +class BasePredictFilTest : public BaseFilTest { + using real_t = typename fil_node_t::real_type; protected: void dense2sparse_node(const fil::dense_node* dense_root, @@ -696,7 +695,7 @@ class BasePredictFilTest : public BaseFilTest { } } - void init_forest(fil::forest_t* pforest) override + void init_forest(fil::forest_t* pforest) override { constexpr bool IS_DENSE = node_traits::IS_DENSE; std::vector init_nodes; From c665bbfa05ac88f72bee37cabd246ee630b33f24 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Mon, 4 Apr 2022 21:16:24 +0200 Subject: [PATCH 36/45] Fixed endless recursion in forest::free(). 
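The recursion: the base forest::free() released vector_leaf_ and then re-invoked forest::free(h) on itself; an explicitly qualified call is resolved statically, so the base implementation simply called itself forever. The fix has the base release only its own members, while derived classes clean up their extras first and then call the base implementation exactly once, last. A minimal standalone sketch of the corrected shape (names simplified, not code from this patch):

    struct forest {
      virtual void free()
      {
        // release base-class members here; no qualified self-call
      }
      virtual ~forest() = default;
    };

    struct sparse_forest : forest {
      void free() override
      {
        // release derived members first, ...
        forest::free();  // ... then the base implementation, exactly once
      }
    };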
--- cpp/src/fil/fil.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu index afdd2fd467..887a79e7dc 100644 --- a/cpp/src/fil/fil.cu +++ b/cpp/src/fil/fil.cu @@ -324,8 +324,8 @@ struct forest { virtual void free(const raft::handle_t& h) { + cat_sets_.release(); vector_leaf_.release(); - forest::free(h); } virtual ~forest() {} @@ -504,9 +504,9 @@ struct sparse_forest : forest { void free(const raft::handle_t& h) override { - forest::free(h); trees_.release(); nodes_.release(); + forest::free(h); } int num_nodes_ = 0; From 175837a2bcc642edfdee07eb2a24cc00098dc222 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 5 Apr 2022 00:38:01 +0200 Subject: [PATCH 37/45] Removed changes to fil.h. --- cpp/include/cuml/fil/fil.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index 22b7b53059..581fe3eb13 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -135,7 +135,6 @@ void free(const raft::handle_t& h, forest_t f); * size = predict_proba ? (2*num_rows) : num_rows * @param data array of size n * cols (cols is the number of columns * for the forest f) from which to predict - * type = the type used for the forest representation (float or double) * @param num_rows number of data rows * @param predict_proba for classifier models, this forces to output both class probabilities * instead of binary class prediction. format matches scikit-learn API From 886c64959b036e9521f75c4878759e57cdc15977 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 5 Apr 2022 00:53:14 +0200 Subject: [PATCH 38/45] Refactored tests. --- cpp/test/sg/fil_test.cu | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index a3cc8ed75d..4a727efdc0 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -139,8 +139,11 @@ __global__ void nan_kernel(real_t* data, const bool* mask, int len, real_t nan) if (!mask[tid]) data[tid] = nan; } -float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } -double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); } +template +real_t sigmoid(real_t x) +{ + return real_t(1) / (real_t(1) + exp(-x)); +} void hard_clipped_bernoulli( raft::random::Rng rng, float* d, std::size_t n_vals, float prob_of_zero, cudaStream_t stream) @@ -168,12 +171,12 @@ struct replace_some_floating_with_categorical { real_t tmp = data * (fid_num_cats + real_t(3)); // Also test invalid (negative and above fid_num_cats) categories: samples within // [fid_num_cats+2.5, fid_num_cats+3) and opposite will test infinite floats as categorical. - if (tmp + fid_num_cats < real_t(-2.5f)) return -INFINITY; - if (tmp - fid_num_cats > real_t(+2.5f)) return +INFINITY; + if (tmp + fid_num_cats < real_t(-2.5f)) return -std::numeric_limits::infinity(); + if (tmp - fid_num_cats > real_t(+2.5f)) return +std::numeric_limits::infinity(); // Samples within [fid_num_cats+2, fid_num_cats+2.5) (and their negative counterparts) will // test huge invalid categories. - if (tmp + fid_num_cats < real_t(-2.0f)) tmp -= MAX_FIL_INT_FLOAT; - if (tmp - fid_num_cats > real_t(+2.0f)) tmp += MAX_FIL_INT_FLOAT; + if (tmp + fid_num_cats < real_t(-2.0f)) tmp -= real_t(MAX_FIL_INT_FLOAT); + if (tmp - fid_num_cats > real_t(+2.0f)) tmp += real_t(MAX_FIL_INT_FLOAT); // Samples within [0, fid_num_cats+2) will be valid categories, rounded towards 0 with a cast. // Negative categories are always invalid. 
For correct interpretation, see // cpp/src/fil/internal.cuh `int category_matches(node_t node, float category)` From 1426c14b7f08745828e59183f446b0304ec0465f Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Tue, 5 Apr 2022 01:00:40 +0200 Subject: [PATCH 39/45] noinline -> forceinline. --- cpp/src/fil/infer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 7166c600e6..5af31aa76e 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __noinline__ +#define INLINE_CONFIG __forceinline__ namespace ML { namespace fil { From 56ecd5119a91a314400a0d8df79b981c48947088 Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Wed, 6 Apr 2022 23:55:01 +0200 Subject: [PATCH 40/45] Addressed review comments. --- cpp/test/sg/fil_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index 4a727efdc0..cec210280a 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -165,7 +165,7 @@ struct replace_some_floating_with_categorical { int num_cols; __device__ real_t operator()(real_t data, int data_idx) { - real_t fid_num_cats = static_cast(fid_num_cats_d[data_idx % num_cols]); + auto fid_num_cats = static_cast(fid_num_cats_d[data_idx % num_cols]); if (fid_num_cats == real_t(0)) return data; // Transform `data` from (uniform on) [-1.0, 1.0] into [-fid_num_cats-3, fid_num_cats+3]. real_t tmp = data * (fid_num_cats + real_t(3)); From 9098b86d9c1a808493876dddeb0454383de7dd4e Mon Sep 17 00:00:00 2001 From: Andy Adinets Date: Thu, 7 Apr 2022 02:50:21 +0200 Subject: [PATCH 41/45] float64 support in treelite->FIL import. --- cpp/bench/sg/fil.cu | 4 +- cpp/include/cuml/fil/fil.h | 7 +- cpp/src/fil/infer.cu | 2 +- cpp/src/fil/treelite_import.cu | 69 ++++++++------ cpp/test/sg/fil_test.cu | 166 ++++++++++++++++++++------------- cpp/test/sg/rf_test.cu | 15 +-- 6 files changed, 157 insertions(+), 106 deletions(-) diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu index adf283fbaf..67017fd9f5 100644 --- a/cpp/bench/sg/fil.cu +++ b/cpp/bench/sg/fil.cu @@ -91,7 +91,9 @@ class FIL : public RegressionFixture { .threads_per_tree = 1, .n_items = 0, .pforest_shape_str = nullptr}; - ML::fil::from_treelite(*handle, &forest, model, &tl_params); + ML::fil::forest_variant forest_variant; + ML::fil::from_treelite(*handle, &forest_variant, model, &tl_params); + forest = std::get>(forest_variant); // only time prediction this->loopOnState(state, [this]() { diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h index 581fe3eb13..5cc4a9f748 100644 --- a/cpp/include/cuml/fil/fil.h +++ b/cpp/include/cuml/fil/fil.h @@ -20,6 +20,8 @@ #include +#include // for std::get<>, std::variant<> + #include namespace raft { @@ -76,6 +78,9 @@ struct forest; template using forest_t = forest*; +/** forest_variant is used to get a forest represented with either float or double. 
*/ +using forest_variant = std::variant, forest_t>; + /** MAX_N_ITEMS determines the maximum allowed value for tl_params::n_items */ constexpr int MAX_N_ITEMS = 4; @@ -116,7 +121,7 @@ struct treelite_params_t { */ // TODO (canonizer): use std::variant forest_t>* for pforest void from_treelite(const raft::handle_t& handle, - forest_t* pforest, + forest_variant* pforest, ModelHandle model, const treelite_params_t* tl_params); diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu index 5af31aa76e..7166c600e6 100644 --- a/cpp/src/fil/infer.cu +++ b/cpp/src/fil/infer.cu @@ -36,7 +36,7 @@ #endif // __CUDA_ARCH__ #endif // CUDA_PRAGMA_UNROLL -#define INLINE_CONFIG __forceinline__ +#define INLINE_CONFIG __noinline__ namespace ML { namespace fil { diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu index 68634fe26a..2b9e320c95 100644 --- a/cpp/src/fil/treelite_import.cu +++ b/cpp/src/fil/treelite_import.cu @@ -40,6 +40,7 @@ #include // for std::size_t #include // for uint8_t #include // for ios, stringstream +#include // for std::numeric_limits #include // for std::stack #include // for std::string #include // for std::is_same @@ -223,7 +224,8 @@ cat_sets_owner allocate_cat_sets_owner(const tl::ModelImpl& model) return cat_sets; } -void adjust_threshold(float* pthreshold, bool* swap_child_nodes, tl::Operator comparison_op) +template +void adjust_threshold(real_t* pthreshold, bool* swap_child_nodes, tl::Operator comparison_op) { // in treelite (take left node if val [op] threshold), // the meaning of the condition is reversed compared to FIL; @@ -237,12 +239,12 @@ void adjust_threshold(float* pthreshold, bool* swap_child_nodes, tl::Operator co case tl::Operator::kLT: break; case tl::Operator::kLE: // x <= y is equivalent to x < y', where y' is the next representable float - *pthreshold = std::nextafterf(*pthreshold, std::numeric_limits::infinity()); + *pthreshold = std::nextafterf(*pthreshold, std::numeric_limits::infinity()); break; case tl::Operator::kGT: // x > y is equivalent to x >= y', where y' is the next representable float // left and right still need to be swapped - *pthreshold = std::nextafterf(*pthreshold, std::numeric_limits::infinity()); + *pthreshold = std::nextafterf(*pthreshold, std::numeric_limits::infinity()); case tl::Operator::kGE: // swap left and right *swap_child_nodes = !*swap_child_nodes; @@ -279,7 +281,7 @@ void tl2fil_leaf_payload(fil_node_t* fil_node, const tl::Tree& tl_tree, int tl_node_id, const forest_params_t& forest_params, - std::vector* vector_leaf, + std::vector* vector_leaf, size_t* leaf_counter) { auto vec = tl_tree.LeafVector(tl_node_id); @@ -301,7 +303,7 @@ void tl2fil_leaf_payload(fil_node_t* fil_node, } case leaf_algo_t::FLOAT_UNARY_BINARY: case leaf_algo_t::GROVE_PER_CLASS: - fil_node->val.f = static_cast(tl_tree.LeafValue(tl_node_id)); + fil_node->val.f = static_cast(tl_tree.LeafValue(tl_node_id)); ASSERT(!tl_tree.HasLeafVector(tl_node_id), "some but not all treelite leaves have leaf_vector()"); break; @@ -323,14 +325,15 @@ conversion_state tl2fil_inner_node(int fil_left_child, cat_sets_owner* cat_sets, std::size_t* bit_pool_offset) { + using real_t = typename fil_node_t::real_type; int tl_left = tree.LeftChild(tl_node_id), tl_right = tree.RightChild(tl_node_id); - val_t split = {.f = NAN}; // yes there's a default initializer already + val_t split = {.f = std::numeric_limits::quiet_NaN()}; int feature_id = tree.SplitIndex(tl_node_id); bool is_categorical = tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical && 
tree.MatchingCategories(tl_node_id).size() > 0; bool swap_child_nodes = false; if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kNumerical) { - split.f = static_cast(tree.Threshold(tl_node_id)); + split.f = static_cast(tree.Threshold(tl_node_id)); adjust_threshold(&split.f, &swap_child_nodes, tree.ComparisonOp(tl_node_id)); } else if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical) { // for FIL, the list of categories is always for the right child @@ -346,14 +349,14 @@ conversion_state tl2fil_inner_node(int fil_left_child, } } else { // always branch left in FIL. Already accounted for Treelite branching direction above. - split.f = NAN; + split.f = std::numeric_limits::quiet_NaN(); } } else { ASSERT(false, "only numerical and categorical split nodes are supported"); } bool default_left = tree.DefaultLeft(tl_node_id) ^ swap_child_nodes; fil_node_t node( - val_t{}, split, feature_id, default_left, false, is_categorical, fil_left_child); + val_t{}, split, feature_id, default_left, false, is_categorical, fil_left_child); return conversion_state{node, swap_child_nodes}; } @@ -363,7 +366,7 @@ int tree2fil(std::vector& nodes, const tl::Tree& tree, std::size_t tree_idx, const forest_params_t& forest_params, - std::vector* vector_leaf, + std::vector* vector_leaf, std::size_t* leaf_counter, cat_sets_owner* cat_sets) { @@ -443,10 +446,11 @@ std::stringstream depth_hist_and_max(const tl::ModelImpl& model) forest_shape << "Total: branches: " << total_branches << " leaves: " << total_leaves << " nodes: " << total_nodes << endl; forest_shape << "Avg nodes per tree: " << setprecision(2) - << total_nodes / (float)hist[0].n_branch_nodes << endl; + << total_nodes / static_cast(hist[0].n_branch_nodes) << endl; forest_shape.copyfmt(default_state); forest_shape << "Leaf depth: min: " << min_leaf_depth << " avg: " << setprecision(2) << fixed - << leaves_times_depth / (float)total_leaves << " max: " << hist.size() - 1 << endl; + << leaves_times_depth / static_cast(total_leaves) + << " max: " << hist.size() - 1 << endl; forest_shape.copyfmt(default_state); vector hist_bytes(hist.size() * sizeof(hist[0])); @@ -575,9 +579,10 @@ void node_traits::check(const treelite::ModelImpl& template struct tl2fil_t { + using real_t = typename fil_node_t::real_type; std::vector roots_; std::vector nodes_; - std::vector vector_leaf_; + std::vector vector_leaf_; forest_params_t params_; cat_sets_owner cat_sets_; const tl::ModelImpl& model_; @@ -631,7 +636,7 @@ struct tl2fil_t { } /// initializes FIL forest object, to be ready to infer - void init_forest(const raft::handle_t& handle, forest_t* pforest) + void init_forest(const raft::handle_t& handle, forest_t* pforest) { ML::fil::init( handle, pforest, cat_sets_.accessor(), vector_leaf_, roots_.data(), nodes_.data(), ¶ms_); @@ -646,7 +651,7 @@ struct tl2fil_t { template void convert(const raft::handle_t& handle, - forest_t* pforest, + forest_t* pforest, const tl::ModelImpl& model, const treelite_params_t& tl_params) { @@ -664,24 +669,21 @@ constexpr bool type_supported() template void from_treelite(const raft::handle_t& handle, - forest_t* pforest, + forest_variant* pforest_variant, const tl::ModelImpl& model, const treelite_params_t* tl_params) { + // floating-point type used for model representation + using real_t = decltype(threshold_t(0) + leaf_t(0)); + + // get the pointer to the right forest variant + *pforest_variant = (forest_t)nullptr; + forest_t* pforest = &std::get>(*pforest_variant); + // Invariants on threshold and leaf types 
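  // Illustration (added comment, not in the original patch): real_t above
  // follows the usual arithmetic conversions, same as std::common_type:
  //   threshold_t = float,  leaf_t = float   ->  real_t = float
  //   threshold_t = double, leaf_t = double  ->  real_t = double
  //   float + double = double; float + int64_t = float, though integer
  //   leaf outputs are rejected by the ASSERT just below.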
static_assert(type_supported(), "Model must contain float32 or float64 thresholds for splits"); ASSERT(type_supported(), "Models with integer leaf output are not yet supported"); - // Display appropriate warnings when float64 values are being casted into - // float32, as FIL only supports inferencing with float32 for the time being - if (std::is_same::value || std::is_same::value) { - CUML_LOG_WARN( - "Casting all thresholds and leaf values to float32, as FIL currently " - "doesn't support inferencing models with float64 values. " - "This may lead to predictions with reduced accuracy."); - } - // same as std::common_type: float+double=double, float+int64_t=float - using real_t = decltype(threshold_t(0) + leaf_t(0)); storage_type_t storage_type = tl_params->storage_type; // build dense trees by default @@ -702,18 +704,25 @@ void from_treelite(const raft::handle_t& handle, switch (storage_type) { case storage_type_t::DENSE: - convert>(handle, pforest, model, *tl_params); + convert>(handle, pforest, model, *tl_params); break; case storage_type_t::SPARSE: - convert>(handle, pforest, model, *tl_params); + convert>(handle, pforest, model, *tl_params); + break; + case storage_type_t::SPARSE8: + // SPARSE8 is only supported for float32 + if constexpr (std::is_same_v) { + convert(handle, pforest, model, *tl_params); + } else { + ASSERT(false, "SPARSE8 is only supported for float32 treelite models"); + } break; - case storage_type_t::SPARSE8: convert(handle, pforest, model, *tl_params); break; default: ASSERT(false, "tl_params->sparse must be one of AUTO, DENSE or SPARSE"); } } void from_treelite(const raft::handle_t& handle, - forest_t* pforest, + forest_variant* pforest, ModelHandle model, const treelite_params_t* tl_params) { diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu index cec210280a..cb3bcf8495 100644 --- a/cpp/test/sg/fil_test.cu +++ b/cpp/test/sg/fil_test.cu @@ -196,8 +196,9 @@ __global__ void floats_to_bit_stream_k(uint8_t* dst, real_t* src, std::size_t si dst[idx] = byte; } +template void adjust_threshold_to_treelite( - float* pthreshold, int* tl_left, int* tl_right, bool* default_left, tl::Operator comparison_op) + real_t* pthreshold, int* tl_left, int* tl_right, bool* default_left, tl::Operator comparison_op) { // in treelite (take left node if val [op] threshold), // the meaning of the condition is reversed compared to FIL; @@ -213,12 +214,12 @@ void adjust_threshold_to_treelite( case tl::Operator::kLT: break; case tl::Operator::kLE: // x <= y is equivalent to x < y', where y' is the next representable float - *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits::infinity()); + *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits::infinity()); break; case tl::Operator::kGT: // x > y is equivalent to x >= y', where y' is the next representable float // left and right still need to be swapped - *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits::infinity()); + *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits::infinity()); case tl::Operator::kGE: // swap left and right std::swap(*tl_left, *tl_right); @@ -745,7 +746,8 @@ using PredictSparse16Float32FilTest = BasePredictFilTest>; using PredictSparse8FilTest = BasePredictFilTest; -class TreeliteFilTest : public BaseFilTest { +template +class TreeliteFilTest : public BaseFilTest { protected: /** adds nodes[node] of tree starting at index root to builder at index at *pkey, increments *pkey, @@ -754,28 +756,29 @@ class TreeliteFilTest : public BaseFilTest { { int key = 
(*pkey)++; builder->CreateNode(key); - const fil::dense_node& dense_node = nodes[node]; + const fil::dense_node& dense_node = this->nodes[node]; std::vector left_categories; if (dense_node.is_leaf()) { - switch (ps.leaf_algo) { + switch (this->ps.leaf_algo) { case fil::leaf_algo_t::FLOAT_UNARY_BINARY: case fil::leaf_algo_t::GROVE_PER_CLASS: // default is fil::FLOAT_UNARY_BINARY - builder->SetLeafNode(key, tlf::Value::Create(dense_node.output())); + builder->SetLeafNode(key, tlf::Value::Create(dense_node.template output())); break; case fil::leaf_algo_t::CATEGORICAL_LEAF: { - std::vector vec(ps.num_classes); - for (int i = 0; i < ps.num_classes; ++i) { - vec[i] = tlf::Value::Create(i == dense_node.output() ? 1.0f : 0.0f); + std::vector vec(this->ps.num_classes); + for (int i = 0; i < this->ps.num_classes; ++i) { + vec[i] = + tlf::Value::Create(i == dense_node.template output() ? real_t(1) : real_t(0)); } builder->SetLeafVectorNode(key, vec); break; } case fil::leaf_algo_t::VECTOR_LEAF: { - std::vector vec(ps.num_classes); - for (int i = 0; i < ps.num_classes; ++i) { - auto idx = dense_node.output(); - vec[i] = tlf::Value::Create(vector_leaf[idx * ps.num_classes + i]); + std::vector vec(this->ps.num_classes); + for (int i = 0; i < this->ps.num_classes; ++i) { + auto idx = dense_node.template output(); + vec[i] = tlf::Value::Create(this->vector_leaf[idx * this->ps.num_classes + i]); } builder->SetLeafVectorNode(key, vec); break; @@ -787,14 +790,15 @@ class TreeliteFilTest : public BaseFilTest { int left = root + 2 * (node - root) + 1; int right = root + 2 * (node - root) + 2; bool default_left = dense_node.def_left(); - float threshold = dense_node.is_categorical() ? NAN : dense_node.thresh(); + real_t threshold = dense_node.is_categorical() ? std::numeric_limits::quiet_NaN() + : dense_node.thresh(); if (dense_node.is_categorical()) { uint8_t byte = 0; for (int category = 0; - category < static_cast(cat_sets_h.fid_num_cats[dense_node.fid()]); + category < static_cast(this->cat_sets_h.fid_num_cats[dense_node.fid()]); ++category) { if (category % BITS_PER_BYTE == 0) { - byte = cat_sets_h.bits[dense_node.set() + category / BITS_PER_BYTE]; + byte = this->cat_sets_h.bits[dense_node.set() + category / BITS_PER_BYTE]; } if ((byte & (1 << (category % BITS_PER_BYTE))) != 0) { left_categories.push_back(category); @@ -815,10 +819,10 @@ class TreeliteFilTest : public BaseFilTest { builder->SetCategoricalTestNode( key, dense_node.fid(), left_categories, default_left, left_key, right_key); } else { - adjust_threshold_to_treelite(&threshold, &left_key, &right_key, &default_left, ps.op); + adjust_threshold_to_treelite(&threshold, &left_key, &right_key, &default_left, this->ps.op); builder->SetNumericalTestNode(key, dense_node.fid(), - ps.op, + this->ps.op, tlf::Value::Create(threshold), default_left, left_key, @@ -828,28 +832,27 @@ class TreeliteFilTest : public BaseFilTest { return key; } - void init_forest_impl(fil::forest_t* pforest, fil::storage_type_t storage_type) + void init_forest_impl(fil::forest_t* pforest, fil::storage_type_t storage_type) { - auto stream = handle.get_stream(); - bool random_forest_flag = (ps.output & fil::output_t::AVG) != 0; + auto stream = this->handle.get_stream(); + bool random_forest_flag = (this->ps.output & fil::output_t::AVG) != 0; + tl::TypeInfo tl_type_info = + std::is_same_v ? tl::TypeInfo::kFloat32 : tl::TypeInfo::kFloat64; int treelite_num_classes = - ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY ? 
1 : ps.num_classes; - std::unique_ptr model_builder(new tlf::ModelBuilder(ps.num_cols, - treelite_num_classes, - random_forest_flag, - tl::TypeInfo::kFloat32, - tl::TypeInfo::kFloat32)); + this->ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY ? 1 : this->ps.num_classes; + std::unique_ptr model_builder(new tlf::ModelBuilder( + this->ps.num_cols, treelite_num_classes, random_forest_flag, tl_type_info, tl_type_info)); // prediction transform - if ((ps.output & fil::output_t::SIGMOID) != 0) { - if (ps.num_classes > 2) + if ((this->ps.output & fil::output_t::SIGMOID) != 0) { + if (this->ps.num_classes > 2) model_builder->SetModelParam("pred_transform", "multiclass_ova"); else model_builder->SetModelParam("pred_transform", "sigmoid"); - } else if (ps.leaf_algo != fil::leaf_algo_t::FLOAT_UNARY_BINARY) { + } else if (this->ps.leaf_algo != fil::leaf_algo_t::FLOAT_UNARY_BINARY) { model_builder->SetModelParam("pred_transform", "max_index"); - ps.output = fil::output_t(ps.output | fil::output_t::CLASS); - } else if (ps.leaf_algo == GROVE_PER_CLASS) { + this->ps.output = fil::output_t(this->ps.output | fil::output_t::CLASS); + } else if (this->ps.leaf_algo == GROVE_PER_CLASS) { model_builder->SetModelParam("pred_transform", "identity_multiclass"); } else { model_builder->SetModelParam("pred_transform", "identity"); @@ -857,18 +860,17 @@ class TreeliteFilTest : public BaseFilTest { // global bias char* global_bias_str = nullptr; - ASSERT(asprintf(&global_bias_str, "%f", double(ps.global_bias)) > 0, + ASSERT(asprintf(&global_bias_str, "%f", double(this->ps.global_bias)) > 0, "cannot convert global_bias into a string"); model_builder->SetModelParam("global_bias", global_bias_str); ::free(global_bias_str); // build the trees - for (int i_tree = 0; i_tree < ps.num_trees; ++i_tree) { - tlf::TreeBuilder* tree_builder = - new tlf::TreeBuilder(tl::TypeInfo::kFloat32, tl::TypeInfo::kFloat32); - int key_counter = 0; - int root = i_tree * tree_num_nodes(); - int root_key = node_to_treelite(tree_builder, &key_counter, root, root); + for (int i_tree = 0; i_tree < this->ps.num_trees; ++i_tree) { + tlf::TreeBuilder* tree_builder = new tlf::TreeBuilder(tl_type_info, tl_type_info); + int key_counter = 0; + int root = i_tree * this->tree_num_nodes(); + int root_key = node_to_treelite(tree_builder, &key_counter, root, root); tree_builder->SetRootNode(root_key); // InsertTree() consumes tree_builder TL_CPP_CHECK(model_builder->InsertTree(tree_builder)); @@ -880,17 +882,19 @@ class TreeliteFilTest : public BaseFilTest { // init FIL forest with the model char* forest_shape_str = nullptr; fil::treelite_params_t params; - params.algo = ps.algo; - params.threshold = ps.threshold; - params.output_class = (ps.output & fil::output_t::CLASS) != 0; + params.algo = this->ps.algo; + params.threshold = this->ps.threshold; + params.output_class = (this->ps.output & fil::output_t::CLASS) != 0; params.storage_type = storage_type; - params.blocks_per_sm = ps.blocks_per_sm; - params.threads_per_tree = ps.threads_per_tree; - params.n_items = ps.n_items; - params.pforest_shape_str = ps.print_forest_shape ? &forest_shape_str : nullptr; - fil::from_treelite(handle, pforest, (ModelHandle)model.get(), ¶ms); - handle.sync_stream(stream); - if (ps.print_forest_shape) { + params.blocks_per_sm = this->ps.blocks_per_sm; + params.threads_per_tree = this->ps.threads_per_tree; + params.n_items = this->ps.n_items; + params.pforest_shape_str = this->ps.print_forest_shape ? 
&forest_shape_str : nullptr; + fil::forest_variant forest_variant; + fil::from_treelite(this->handle, &forest_variant, (ModelHandle)model.get(), ¶ms); + *pforest = std::get>(forest_variant); + this->handle.sync_stream(stream); + if (this->ps.print_forest_shape) { std::string str(forest_shape_str); for (const char* substr : {"model size", " MB", @@ -908,38 +912,48 @@ class TreeliteFilTest : public BaseFilTest { } }; -class TreeliteDenseFilTest : public TreeliteFilTest { +template +class TreeliteDenseFilTest : public TreeliteFilTest { protected: - void init_forest(fil::forest_t* pforest) override + void init_forest(fil::forest_t* pforest) override { - init_forest_impl(pforest, fil::storage_type_t::DENSE); + this->init_forest_impl(pforest, fil::storage_type_t::DENSE); } }; -class TreeliteSparse16FilTest : public TreeliteFilTest { +template +class TreeliteSparse16FilTest : public TreeliteFilTest { protected: - void init_forest(fil::forest_t* pforest) override + void init_forest(fil::forest_t* pforest) override { - init_forest_impl(pforest, fil::storage_type_t::SPARSE); + this->init_forest_impl(pforest, fil::storage_type_t::SPARSE); } }; -class TreeliteSparse8FilTest : public TreeliteFilTest { +class TreeliteSparse8FilTest : public TreeliteFilTest { protected: void init_forest(fil::forest_t* pforest) override { - init_forest_impl(pforest, fil::storage_type_t::SPARSE8); + this->init_forest_impl(pforest, fil::storage_type_t::SPARSE8); } }; -class TreeliteAutoFilTest : public TreeliteFilTest { +template +class TreeliteAutoFilTest : public TreeliteFilTest { protected: - void init_forest(fil::forest_t* pforest) override + void init_forest(fil::forest_t* pforest) override { - init_forest_impl(pforest, fil::storage_type_t::AUTO); + this->init_forest_impl(pforest, fil::storage_type_t::AUTO); } }; +using TreeliteDenseFloat32FilTest = TreeliteDenseFilTest; +using TreeliteDenseFloat64FilTest = TreeliteDenseFilTest; +using TreeliteSparse16Float32FilTest = TreeliteDenseFilTest; +using TreeliteSparse16Float64FilTest = TreeliteDenseFilTest; +using TreeliteAutoFloat32FilTest = TreeliteAutoFilTest; +using TreeliteAutoFloat64FilTest = TreeliteAutoFilTest; + // test for failures; currently only supported for sparse8 nodes class TreeliteThrowSparse8FilTest : public TreeliteSparse8FilTest { protected: @@ -1300,9 +1314,15 @@ std::vector import_dense_inputs = { max_magnitude_of_matching_cat = 5), }; -TEST_P(TreeliteDenseFilTest, Import) { compare(); } +TEST_P(TreeliteDenseFloat32FilTest, Import) { compare(); } +TEST_P(TreeliteDenseFloat64FilTest, Import) { compare(); } -INSTANTIATE_TEST_CASE_P(FilTests, TreeliteDenseFilTest, testing::ValuesIn(import_dense_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + TreeliteDenseFloat32FilTest, + testing::ValuesIn(import_dense_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + TreeliteDenseFloat64FilTest, + testing::ValuesIn(import_dense_inputs)); std::vector import_sparse_inputs = { FIL_TEST_PARAMS(), @@ -1353,9 +1373,15 @@ std::vector import_sparse_inputs = { max_magnitude_of_matching_cat = 5), }; -TEST_P(TreeliteSparse16FilTest, Import) { compare(); } +TEST_P(TreeliteSparse16Float32FilTest, Import) { compare(); } +TEST_P(TreeliteSparse16Float64FilTest, Import) { compare(); } -INSTANTIATE_TEST_CASE_P(FilTests, TreeliteSparse16FilTest, testing::ValuesIn(import_sparse_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + TreeliteSparse16Float32FilTest, + testing::ValuesIn(import_sparse_inputs)); +INSTANTIATE_TEST_CASE_P(FilTests, + TreeliteSparse16Float64FilTest, + 
 
 TEST_P(TreeliteSparse8FilTest, Import) { compare(); }
 
@@ -1381,9 +1407,15 @@ std::vector<FilTestParams> import_auto_inputs = {
 #endif
 };
 
-TEST_P(TreeliteAutoFilTest, Import) { compare(); }
+TEST_P(TreeliteAutoFloat32FilTest, Import) { compare(); }
+TEST_P(TreeliteAutoFloat64FilTest, Import) { compare(); }
 
-INSTANTIATE_TEST_CASE_P(FilTests, TreeliteAutoFilTest, testing::ValuesIn(import_auto_inputs));
+INSTANTIATE_TEST_CASE_P(FilTests,
+                        TreeliteAutoFloat32FilTest,
+                        testing::ValuesIn(import_auto_inputs));
+INSTANTIATE_TEST_CASE_P(FilTests,
+                        TreeliteAutoFloat64FilTest,
+                        testing::ValuesIn(import_auto_inputs));
 
 // adjust test parameters if the sparse8 format changes
 std::vector<FilTestParams> import_throw_sparse8_inputs = {
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 345770efa1..18923c4baa 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -172,8 +172,9 @@ auto FilPredict(const raft::handle_t& handle,
                  1,
                  0,
                  nullptr};
-  fil::forest_t fil_forest;
-  fil::from_treelite(handle, &fil_forest, model, &tl_params);
+  fil::forest_variant forest_variant;
+  fil::from_treelite(handle, &forest_variant, model, &tl_params);
+  fil::forest_t<float> fil_forest = std::get<fil::forest_t<float>>(forest_variant);
   fil::predict(handle, fil_forest, pred->data().get(), X_transpose, params.n_rows, false);
   return pred;
 }
@@ -191,8 +192,9 @@ auto FilPredictProba(const raft::handle_t& handle,
   build_treelite_forest(&model, forest, params.n_cols);
   fil::treelite_params_t tl_params{
     fil::algo_t::ALGO_AUTO, 0, 0.0f, fil::storage_type_t::AUTO, 8, 1, 0, nullptr};
-  fil::forest_t fil_forest;
-  fil::from_treelite(handle, &fil_forest, model, &tl_params);
+  fil::forest_variant forest_variant;
+  fil::from_treelite(handle, &forest_variant, model, &tl_params);
+  fil::forest_t<float> fil_forest = std::get<fil::forest_t<float>>(forest_variant);
   fil::predict(handle, fil_forest, pred->data().get(), X_transpose, params.n_rows, true);
   return pred;
 }
@@ -557,8 +559,9 @@ TEST(RfTests, IntegerOverflow)
                  1,
                  0,
                  nullptr};
-  fil::forest_t fil_forest;
-  fil::from_treelite(handle, &fil_forest, model, &tl_params);
+  fil::forest_variant forest_variant;
+  fil::from_treelite(handle, &forest_variant, model, &tl_params);
+  fil::forest_t<float> fil_forest = std::get<fil::forest_t<float>>(forest_variant);
   fil::predict(handle, fil_forest, pred.data().get(), X.data().get(), m, false);
 }
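
The rf_test.cu hunks above always extract the float alternative with std::get, since these tests only build single-precision forests. When the precision is known only at runtime, std::visit is the idiomatic way to dispatch over forest_variant; a sketch under the same assumptions (with_forest is hypothetical, not part of the patch):

    #include <variant>

    // Invokes the callable with whichever forest_t<float> or forest_t<double>
    // the variant currently holds.
    template <typename F>
    void with_forest(fil::forest_variant& v, F&& run_inference)
    {
      std::visit([&](auto forest) { run_inference(forest); }, v);
    }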
From 1fd862401fb28e16772c83005aee936b5a4b18fc Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Thu, 7 Apr 2022 23:41:45 +0200
Subject: [PATCH 42/45] float64 support in FIL Python layer.

---
 cpp/include/cuml/fil/fil.h |  4 +++
 python/cuml/fil/fil.pyx    | 73 ++++++++++++++++++++++++++++----------
 2 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h
index 5cc4a9f748..c842a68313 100644
--- a/cpp/include/cuml/fil/fil.h
+++ b/cpp/include/cuml/fil/fil.h
@@ -78,6 +78,10 @@ struct forest;
 template <typename real_t>
 using forest_t = forest<real_t>*;
 
+/** forest32_t and forest64_t are definitions required in Cython */
+using forest32_t = forest<float>*;
+using forest64_t = forest<double>*;
+
 /** forest_variant is used to get a forest represented with either float or double. */
 using forest_variant = std::variant<forest_t<float>, forest_t<double>>;
 
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index a9e0b79e1a..f084190e69 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -177,6 +177,13 @@ cdef class TreeliteModel():
         model.set_handle(handle)
         return model
 
+cdef extern from "<variant>" namespace "std":
+    cdef cppclass variant[T1, T2]:
+        variant()
+        size_t index()
+
+    cdef T& get[T, T1, T2](variant[T1, T2]& v)
+
 cdef extern from "cuml/fil/fil.h" namespace "ML::fil":
     cdef enum algo_t:
         ALGO_AUTO,
@@ -193,6 +200,10 @@ cdef extern from "cuml/fil/fil.h" namespace "ML::fil":
     cdef cppclass forest[real_t]:
         pass
 
+    ctypedef forest[float]* forest32_t
+    ctypedef forest[double]* forest64_t
+    ctypedef variant[forest32_t, forest64_t] forest_variant
+
     # TODO(canonizer): use something like
     # ctypedef forest[real_t]* forest_t[real_t]
     # once it is supported in Cython
@@ -227,15 +238,15 @@ cdef extern from "cuml/fil/fil.h" namespace "ML::fil":
                  size_t,
                  bool) except +
 
-    cdef forest[float]* from_treelite(handle_t& handle,
-                                      forest[float]**,
-                                      ModelHandle,
-                                      treelite_params_t*) except +
+    cdef void from_treelite(handle_t& handle,
+                            forest_variant*,
+                            ModelHandle,
+                            treelite_params_t*) except +
 
 cdef class ForestInference_impl():
     cdef object handle
-    cdef forest[float]* forest_data
+    cdef forest_variant forest_data
     cdef size_t num_class
     cdef bool output_class
     cdef char* shape_str
 
     def __cinit__(self,
                   handle=None):
         self.handle = handle
-        self.forest_data = NULL
+        self.forest_data = forest_variant()
         self.shape_str = NULL
 
     def get_shape_str(self):
         if self.shape_str:
             return unicode(self.shape_str, 'utf-8')
         return None
 
+    def get_dtype(self):
+        dtype_array = [np.float32, np.float64]
+        return dtype_array[self.forest_data.index()]
+
     def get_algo(self, algo_str):
         algo_dict={'AUTO': algo_t.ALGO_AUTO,
                    'auto': algo_t.ALGO_AUTO,
@@ -327,12 +342,13 @@ cdef class ForestInference_impl():
                              " using a Classification model, please "
                              " set `output_class=True` while creating"
                              " the FIL model.")
+        fil_dtype = self.get_dtype()
         cdef uintptr_t X_ptr
         X_m, n_rows, n_cols, dtype = \
             input_to_cuml_array(X, order='C',
-                                convert_to_dtype=np.float32,
+                                convert_to_dtype=fil_dtype,
                                 safe_dtype_conversion=safe_dtype_conversion,
-                                check_dtype=np.float32)
+                                check_dtype=fil_dtype)
         X_ptr = X_m.ptr
 
         cdef handle_t* handle_ =\
             <handle_t*><size_t>self.handle.getHandle()
 
         if preds is None:
             shape = (n_rows, )
             if predict_proba:
                 shape += (2,)
             else:
                 shape += (self.num_class,)
-            preds = CumlArray.empty(shape=shape, dtype=np.float32, order='C',
+            preds = CumlArray.empty(shape=shape, dtype=fil_dtype, order='C',
                                     index=X_m.index)
         else:
             if not hasattr(preds, "__cuda_array_interface__"):
 
         cdef uintptr_t preds_ptr
         preds_ptr = preds.ptr
 
-        predict(handle_[0],
-                self.forest_data,
-                preds_ptr,
-                X_ptr,
-                n_rows,
-                predict_proba)
+        if fil_dtype == np.float32:
+            predict(handle_[0],
+                    get[forest32_t, forest32_t, forest64_t](self.forest_data),
+                    preds_ptr,
+                    X_ptr,
+                    n_rows,
+                    predict_proba)
+        elif fil_dtype == np.float64:
+            predict(handle_[0],
+                    get[forest64_t, forest32_t, forest64_t](self.forest_data),
+                    preds_ptr,
+                    X_ptr,
+                    n_rows,
+                    predict_proba)
+        else:
+            # should not reach here
+            assert False, 'invalid fil_dtype, must be np.float32 or np.float64'
+
         self.handle.sync()
 
         # special case due to predict and predict_proba
@@ -372,7 +400,7 @@
         return preds
 
     def load_from_treelite_model_handle(self, **kwargs):
-        self.forest_data = NULL
+        self.forest_data = forest_variant()
         return self.load_using_treelite_handle(**kwargs)
 
     def load_from_treelite_model(self, **kwargs):
@@ -413,9 +441,16 @@
 
     def __dealloc__(self):
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
-
-        if self.forest_data != NULL:
-            free(handle_[0], self.forest_data)
+        fil_dtype = self.get_dtype()
+        if fil_dtype == np.float32:
+            if get[forest32_t, forest32_t, forest64_t](self.forest_data) != NULL:
+                free[float](handle_[0], get[forest32_t, forest32_t, forest64_t](self.forest_data))
+        elif fil_dtype == np.float64:
+            if get[forest64_t, forest32_t, forest64_t](self.forest_data) != NULL:
+                free[double](handle_[0], get[forest64_t, forest32_t, forest64_t](self.forest_data))
+        else:
+            # should not reach here
+            assert False, 'invalid fil_dtype, must be np.float32 or np.float64'
 
 
 class ForestInference(Base,

From aaa326182c6d8599cc77f7806e9887681c2adf93 Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Fri, 8 Apr 2022 01:21:27 +0200
Subject: [PATCH 43/45] get_forest{32,64} in fil.pyx.

---
 python/cuml/fil/fil.pyx | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index f084190e69..4d4d1f8950 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -251,6 +251,12 @@ cdef class ForestInference_impl():
     cdef bool output_class
     cdef char* shape_str
 
+    cdef forest32_t get_forest32(self):
+        return get[forest32_t, forest32_t, forest64_t](self.forest_data)
+
+    cdef forest64_t get_forest64(self):
+        return get[forest64_t, forest32_t, forest64_t](self.forest_data)
+
     def __cinit__(self,
                   handle=None):
         self.handle = handle
@@ -374,14 +380,14 @@ cdef class ForestInference_impl():
 
         if fil_dtype == np.float32:
             predict(handle_[0],
-                    get[forest32_t, forest32_t, forest64_t](self.forest_data),
+                    self.get_forest32(),
                     preds_ptr,
                     X_ptr,
                     n_rows,
                     predict_proba)
         elif fil_dtype == np.float64:
             predict(handle_[0],
-                    get[forest64_t, forest32_t, forest64_t](self.forest_data),
+                    self.get_forest64(),
                     preds_ptr,
                     X_ptr,
                     n_rows,
                     predict_proba)
         else:
             # should not reach here
             assert False, 'invalid fil_dtype, must be np.float32 or np.float64'
-
+
         self.handle.sync()
 
         # special case due to predict and predict_proba
@@ -443,11 +449,11 @@ cdef class ForestInference_impl():
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
         fil_dtype = self.get_dtype()
         if fil_dtype == np.float32:
-            if get[forest32_t, forest32_t, forest64_t](self.forest_data) != NULL:
-                free[float](handle_[0], get[forest32_t, forest32_t, forest64_t](self.forest_data))
+            if self.get_forest32() != NULL:
+                free[float](handle_[0], self.get_forest32())
         elif fil_dtype == np.float64:
-            if get[forest64_t, forest32_t, forest64_t](self.forest_data) != NULL:
-                free[double](handle_[0], get[forest64_t, forest32_t, forest64_t](self.forest_data))
+            if self.get_forest64() != NULL:
+                free[double](handle_[0], self.get_forest64())
         else:
             # should not reach here
             assert False, 'invalid fil_dtype, must be np.float32 or np.float64'
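
The next commit revisits how forest_data is initialized. A default-constructed std::variant already value-initializes its first alternative, so a variant over two pointer types starts out holding a null float-forest pointer; the explicit constructor argument below mainly spells that intent out for Cython. A standalone C++ illustration of the underlying rule (independent of the patch):

    #include <cassert>
    #include <variant>

    int main()
    {
      std::variant<float*, double*> v;         // default construction
      assert(v.index() == 0);                  // holds the first alternative,
      assert(std::get<float*>(v) == nullptr);  // value-initialized to null
      return 0;
    }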
From 606ee1d01676fc7474f2572c4ac98b07496f5340 Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Fri, 8 Apr 2022 01:43:53 +0200
Subject: [PATCH 44/45] Initializing forest_data with forest_variant(NULL).

---
 python/cuml/fil/fil.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 4d4d1f8950..8908b0fc4a 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -180,6 +180,7 @@ cdef class TreeliteModel():
 cdef extern from "<variant>" namespace "std":
     cdef cppclass variant[T1, T2]:
         variant()
+        variant(T1)
         size_t index()
 
     cdef T& get[T, T1, T2](variant[T1, T2]& v)
@@ -260,7 +261,7 @@ cdef class ForestInference_impl():
     def __cinit__(self,
                   handle=None):
         self.handle = handle
-        self.forest_data = forest_variant()
+        self.forest_data = forest_variant(<forest32_t> NULL)
         self.shape_str = NULL
 
     def get_shape_str(self):
@@ -406,7 +407,7 @@ cdef class ForestInference_impl():
         return preds
 
     def load_from_treelite_model_handle(self, **kwargs):
-        self.forest_data = forest_variant()
+        self.forest_data = forest_variant(<forest32_t> NULL)
         return self.load_using_treelite_handle(**kwargs)
 
     def load_from_treelite_model(self, **kwargs):

From 0d2c51299b0b86a339d3ee954edbb68f838a3c11 Mon Sep 17 00:00:00 2001
From: Andy Adinets
Date: Sat, 9 Apr 2022 03:55:01 +0200
Subject: [PATCH 45/45] Addressed review comments.

---
 cpp/include/cuml/fil/fil.h | 7 ++-----
 cpp/src/fil/infer.cu       | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h
index c842a68313..2d5d786520 100644
--- a/cpp/include/cuml/fil/fil.h
+++ b/cpp/include/cuml/fil/fil.h
@@ -31,10 +31,8 @@ class handle_t;
 namespace ML {
 namespace fil {
 
-/** @note FIL only supports inference with single precision.
- *  TODO(canonizer): parameterize the functions and structures by the data type
- *  and the threshold/weight type.
- */
+/** @note FIL supports inference with both single and double precision. However,
+    the floating-point type used in the data and model must be the same. */
 
 /** Inference algorithm to use. */
 enum algo_t {
@@ -123,7 +121,6 @@ struct treelite_params_t {
 * @param model treelite model used to initialize the forest
 * @param tl_params additional parameters for the forest
 */
-// TODO (canonizer): use std::variant<forest_t<float>, forest_t<double>>* for pforest
void from_treelite(const raft::handle_t& handle,
                   forest_variant* pforest,
                   ModelHandle model,
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 7166c600e6..5af31aa76e 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -36,7 +36,7 @@
 #endif  // __CUDA_ARCH__
 #endif  // CUDA_PRAGMA_UNROLL
 
-#define INLINE_CONFIG __noinline__
+#define INLINE_CONFIG __forceinline__
 
 namespace ML {
 namespace fil {
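
Taken together, the series means a C++ caller commits to one precision at import time and must pass predict() data of the same type, per the updated fil.h note above. A rough usage sketch for the double-precision path (not taken from the patch; handle, model, preds_d, data_d and num_rows are assumed to be set up elsewhere, and the treelite_params_t values are copied from the rf_test.cu hunks earlier in the series):

    // Import the treelite model; from_treelite selects float or double storage
    // from the model's threshold/leaf precision.
    ML::fil::treelite_params_t tl_params{
      ML::fil::algo_t::ALGO_AUTO, 0, 0.0f, ML::fil::storage_type_t::AUTO, 8, 1, 0, nullptr};
    ML::fil::forest_variant forest_variant;
    ML::fil::from_treelite(handle, &forest_variant, model, &tl_params);

    // Holds forest_t<double> only if the model itself is double precision;
    // std::get throws std::bad_variant_access otherwise.
    auto forest = std::get<ML::fil::forest_t<double>>(forest_variant);
    ML::fil::predict(handle, forest, preds_d, data_d, num_rows, /*predict_proba=*/false);
    ML::fil::free(handle, forest);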