-
Notifications
You must be signed in to change notification settings - Fork 527
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
RF: memset and batch size optimization for computing splits #4001
Changes from 8 commits
797d157
93187f1
5fa1faf
c4b1548
4e20c28
98694d5
8a0762e
23ff5da
d9c8c04
68edd49
4b326af
142bed0
70665a3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -338,13 +338,16 @@ struct Builder { | |
raft::update_device(curr_nodes, h_nodes.data() + node_start, batchSize, s); | ||
|
||
int total_samples_in_curr_batch = 0; | ||
int n_large_nodes_in_curr_batch = 0; | ||
total_num_blocks = 0; | ||
for (int n = 0; n < batchSize; n++) { | ||
total_samples_in_curr_batch += h_nodes[node_start + n].count; | ||
int num_blocks = raft::ceildiv(h_nodes[node_start + n].count, | ||
SAMPLES_PER_THREAD * Traits::TPB_DEFAULT); | ||
num_blocks = std::max(1, num_blocks); | ||
|
||
if (num_blocks > 1) ++n_large_nodes_in_curr_batch; | ||
|
||
bool is_leaf = leafBasedOnParams<DataT, IdxT>( | ||
h_nodes[node_start + n].depth, params.max_depth, | ||
params.min_samples_split, params.max_leaves, h_n_leaves, | ||
|
@@ -353,6 +356,8 @@ struct Builder { | |
|
||
for (int b = 0; b < num_blocks; b++) { | ||
h_workload_info[total_num_blocks + b].nodeid = n; | ||
h_workload_info[total_num_blocks + b].large_nodeid = | ||
n_large_nodes_in_curr_batch - 1; | ||
h_workload_info[total_num_blocks + b].offset_blockid = b; | ||
h_workload_info[total_num_blocks + b].num_blocks = num_blocks; | ||
} | ||
|
@@ -364,7 +369,8 @@ struct Builder { | |
auto n_col_blks = n_blks_for_cols; | ||
if (total_num_blocks) { | ||
for (IdxT c = 0; c < input.nSampledCols; c += n_col_blks) { | ||
Traits::computeSplit(*this, c, batchSize, params.split_criterion, s); | ||
Traits::computeSplit(*this, c, batchSize, params.split_criterion, | ||
n_large_nodes_in_curr_batch, s); | ||
CUDA_CHECK(cudaGetLastError()); | ||
} | ||
} | ||
|
@@ -426,7 +432,7 @@ struct ClsTraits { | |
*/ | ||
static void computeSplit(Builder<ClsTraits<DataT, LabelT, IdxT>>& b, IdxT col, | ||
IdxT batchSize, CRITERION splitType, | ||
cudaStream_t s) { | ||
int& n_large_nodes_in_curr_batch, cudaStream_t s) { | ||
ML::PUSH_RANGE( | ||
"Builder::computeSplit @builder_base.cuh [batched-levelalgo]"); | ||
auto nbins = b.params.n_bins; | ||
|
@@ -446,7 +452,9 @@ struct ClsTraits { | |
// Pick the max of two | ||
size_t smemSize = std::max(smemSize1, smemSize2); | ||
dim3 grid(b.total_num_blocks, colBlks, 1); | ||
CUDA_CHECK(cudaMemsetAsync(b.hist, 0, sizeof(int) * b.nHistBins, s)); | ||
int nHistBins = 0; | ||
nHistBins = n_large_nodes_in_curr_batch * (1 + nbins) * colBlks * nclasses; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Zero-initialising this variable is unnecessary. The 1+ in (1+nbins) should disappear when you merge the objective function PR. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, that was necessary in my initial prototyping version but I missed it somehow — changed it 👍🏾 |
||
CUDA_CHECK(cudaMemsetAsync(b.hist, 0, sizeof(int) * nHistBins, s)); | ||
ML::PUSH_RANGE( | ||
"computeSplitClassificationKernel @builder_base.cuh [batched-levelalgo]"); | ||
computeSplitClassificationKernel<DataT, LabelT, IdxT, TPB_DEFAULT> | ||
|
@@ -507,7 +515,7 @@ struct RegTraits { | |
*/ | ||
static void computeSplit(Builder<RegTraits<DataT, IdxT>>& b, IdxT col, | ||
IdxT batchSize, CRITERION splitType, | ||
cudaStream_t s) { | ||
int& n_large_nodes_in_curr_batch, cudaStream_t s) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is n_large_nodes_in_curr_batch passed by reference? Is it modified somewhere? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It was in my initial version, but you're right, it is unnecessary here. I have changed it to |
||
ML::PUSH_RANGE( | ||
"Builder::computeSplit @builder_base.cuh [batched-levelalgo]"); | ||
auto colBlks = std::min(b.n_blks_for_cols, b.input.nSampledCols - col); | ||
|
@@ -529,10 +537,10 @@ struct RegTraits { | |
size_t smemSize = std::max(smemSize1, smemSize2); | ||
dim3 grid(b.total_num_blocks, colBlks, 1); | ||
|
||
CUDA_CHECK( | ||
cudaMemsetAsync(b.pred, 0, sizeof(DataT) * b.nPredCounts * 2, s)); | ||
CUDA_CHECK( | ||
cudaMemsetAsync(b.pred_count, 0, sizeof(IdxT) * b.nPredCounts, s)); | ||
int nPredCounts = 0; | ||
nPredCounts = n_large_nodes_in_curr_batch * nbins * colBlks; | ||
CUDA_CHECK(cudaMemsetAsync(b.pred, 0, sizeof(DataT) * nPredCounts * 2, s)); | ||
CUDA_CHECK(cudaMemsetAsync(b.pred_count, 0, sizeof(IdxT) * nPredCounts, s)); | ||
|
||
ML::PUSH_RANGE( | ||
"computeSplitRegressionKernel @builder_base.cuh [batched-levelalgo]"); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,6 +35,7 @@ namespace DecisionTree { | |
template <typename IdxT> | ||
struct WorkloadInfo { | ||
IdxT nodeid; // Node in the batch on which the threadblock needs to work | ||
IdxT large_nodeid; // counts only large nodes | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make sure you comment what "large nodes" means. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done 👍🏾 |
||
IdxT offset_blockid; // Offset threadblock id among all the blocks that are | ||
// working on this node | ||
IdxT num_blocks; // Total number of blocks that are working on the node | ||
|
@@ -387,6 +388,7 @@ __global__ void computeSplitClassificationKernel( | |
// Read workload info for this block | ||
WorkloadInfo<IdxT> workload_info_cta = workload_info[blockIdx.x]; | ||
IdxT nid = workload_info_cta.nodeid; | ||
IdxT large_nid = workload_info_cta.large_nodeid; | ||
auto node = nodes[nid]; | ||
auto range_start = node.start; | ||
auto range_len = node.count; | ||
|
@@ -445,7 +447,7 @@ __global__ void computeSplitClassificationKernel( | |
__syncthreads(); | ||
if (num_blocks > 1) { | ||
// update the corresponding global location | ||
auto histOffset = ((nid * gridDim.y) + blockIdx.y) * pdf_shist_len; | ||
auto histOffset = ((large_nid * gridDim.y) + blockIdx.y) * pdf_shist_len; | ||
for (IdxT i = threadIdx.x; i < pdf_shist_len; i += blockDim.x) { | ||
atomicAdd(hist + histOffset + i, pdf_shist[i]); | ||
} | ||
|
@@ -530,6 +532,7 @@ __global__ void computeSplitRegressionKernel( | |
// Read workload info for this block | ||
WorkloadInfo<IdxT> workload_info_cta = workload_info[blockIdx.x]; | ||
IdxT nid = workload_info_cta.nodeid; | ||
IdxT large_nid = workload_info_cta.large_nodeid; | ||
|
||
auto node = nodes[nid]; | ||
auto range_start = node.start; | ||
|
@@ -598,13 +601,13 @@ __global__ void computeSplitRegressionKernel( | |
|
||
if (num_blocks > 1) { | ||
// update the corresponding global location for counts | ||
auto gcOffset = ((nid * gridDim.y) + blockIdx.y) * nbins; | ||
auto gcOffset = ((large_nid * gridDim.y) + blockIdx.y) * nbins; | ||
for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) { | ||
atomicAdd(count + gcOffset + i, pdf_scount[i]); | ||
} | ||
|
||
// update the corresponding global location for preds | ||
auto gOffset = ((nid * gridDim.y) + blockIdx.y) * pdf_spred_len; | ||
auto gOffset = ((large_nid * gridDim.y) + blockIdx.y) * pdf_spred_len; | ||
for (IdxT i = threadIdx.x; i < pdf_spred_len; i += blockDim.x) { | ||
atomicAdd(pred + gOffset + i, pdf_spred[i]); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment what this variable is — e.g., nodes with a number of training instances larger than the block size. These nodes require global memory for histograms.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done 👍🏾