Skip to content

Commit

Permalink
Adding one-pass kernels, but a best-metric issue does not yet allow their use
Browse files Browse the repository at this point in the history
  • Loading branch information
vishalmehta1991 committed Dec 9, 2019
1 parent 12e9ac0 commit ba5a222
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 33 deletions.
37 changes: 26 additions & 11 deletions cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ void grow_deep_tree_regression(
MLCommon::updateDevice(d_colids, h_colids, Ncols, tempmem->stream);
}
std::vector<unsigned int> feature_selector(h_colids, h_colids + Ncols);
float* infogain = tempmem->h_outgain->data();

for (int depth = 0; (depth < maxdepth) && (n_nodes_nextitr != 0); depth++) {
depth_cnt = depth + 1;
Expand All @@ -128,26 +129,40 @@ void grow_deep_tree_regression(
sparsesize, depth, tempmem);

if (split_cr == ML::CRITERION::MSE) {
get_mse_regression_fused<T>(
data, labels, flagsptr, sample_cnt, nrows, Ncols, ncols_sampled, nbins,
n_nodes, split_algo, tempmem, d_mseout, d_predout, d_count);
//get_mse_regression_fused<T>(
// data, labels, flagsptr, sample_cnt, nrows, Ncols, ncols_sampled, nbins,
// n_nodes, split_algo, tempmem, d_mseout, d_predout, d_count);
//get_best_split_regression<T, MSEImpurity<T>>(
// h_mseout, d_mseout, h_predout, d_predout, h_count, d_count, h_colids,
// d_colids, h_colstart, d_colstart, Ncols, ncols_sampled, nbins, n_nodes,
// depth, min_rows_per_node, split_algo, sparsesize, infogain,
// sparse_meanstate, sparse_countstate, sparsetree, sparse_nodelist,
// h_split_colidx, h_split_binidx, d_split_colidx, d_split_binidx,
// tempmem);
get_mse_regression<T, SquareFunctor>(
data, labels, flagsptr, sample_cnt, nrows, Ncols, ncols_sampled, nbins,
n_nodes, split_algo, tempmem, d_mseout, d_predout, d_count);
get_best_split_regression<T, MAEImpurity<T>>(
h_mseout, d_mseout, h_predout, d_predout, h_count, d_count, h_colids,
d_colids, h_colstart, d_colstart, Ncols, ncols_sampled, nbins, n_nodes,
depth, min_rows_per_node, split_algo, sparsesize, infogain,
sparse_meanstate, sparse_countstate, sparsetree, sparse_nodelist,
h_split_colidx, h_split_binidx, d_split_colidx, d_split_binidx,
tempmem);

} else {
get_mse_regression<T, AbsFunctor>(
data, labels, flagsptr, sample_cnt, nrows, Ncols, ncols_sampled, nbins,
n_nodes, split_algo, tempmem, d_mseout, d_predout, d_count);
get_best_split_regression<T, MAEImpurity<T>>(
h_mseout, d_mseout, h_predout, d_predout, h_count, d_count, h_colids,
d_colids, h_colstart, d_colstart, Ncols, ncols_sampled, nbins, n_nodes,
depth, min_rows_per_node, split_algo, sparsesize, infogain,
sparse_meanstate, sparse_countstate, sparsetree, sparse_nodelist,
h_split_colidx, h_split_binidx, d_split_colidx, d_split_binidx,
tempmem);
}

float* infogain = tempmem->h_outgain->data();
get_best_split_regression(
h_mseout, d_mseout, h_predout, d_predout, h_count, d_count, h_colids,
d_colids, h_colstart, d_colstart, Ncols, ncols_sampled, nbins, n_nodes,
depth, min_rows_per_node, split_algo, sparsesize, infogain,
sparse_meanstate, sparse_countstate, sparsetree, sparse_nodelist,
h_split_colidx, h_split_binidx, d_split_colidx, d_split_binidx, tempmem);

CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
leaf_eval_regression(infogain, depth, min_impurity_decrease, maxdepth,
maxleaves, h_new_node_flags, sparsetree, sparsesize,
Expand Down
21 changes: 10 additions & 11 deletions cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ void get_mse_regression(const T *data, const T *labels, unsigned int *flags,
CUDA_CHECK(cudaGetLastError());
}
}
template <typename T>
template <typename T, typename Impurity>
void get_best_split_regression(
T *mseout, T *d_mseout, T *predout, T *d_predout, unsigned int *count,
unsigned int *d_count, unsigned int *h_colids, unsigned int *d_colids,
Expand Down Expand Up @@ -263,10 +263,12 @@ void get_best_split_regression(
CUDA_CHECK(cudaMemsetAsync(d_split_colidx, 0, n_nodes * sizeof(int),
tempmem->stream));

get_best_split_regression_kernel<<<n_nodes, threads, 0, tempmem->stream>>>(
d_mseout, d_predout, d_count, d_parentmean, d_parentcount, d_parentmetric,
nbins, ncols_sampled, n_nodes, min_rpn, d_outgain, d_split_colidx,
d_split_binidx, d_childmean, d_childcount, d_childmetric);
get_best_split_regression_kernel<T, Impurity>
<<<n_nodes, threads, 0, tempmem->stream>>>(
d_mseout, d_predout, d_count, d_parentmean, d_parentcount,
d_parentmetric, nbins, ncols_sampled, n_nodes, min_rpn, d_outgain,
d_split_colidx, d_split_binidx, d_childmean, d_childcount,
d_childmetric);
CUDA_CHECK(cudaGetLastError());

MLCommon::updateHost(h_childmetric, d_childmetric, 2 * n_nodes,
Expand Down Expand Up @@ -347,16 +349,13 @@ void get_best_split_regression(
T tmp_mse_left = mseout[coloff_mse + binoff_mse + nodeoff_mse];
T tmp_mse_right = mseout[coloff_mse + binoff_mse + nodeoff_mse + 1];

T impurity2 = MAEImpurity<T>::exec(
parent_count, tmp_lnrows, tmp_rnrows, parent_mean, tmp_meanleft,
tmp_mse_left, tmp_mse_right);

T impurity =
Impurity::exec(parent_count, tmp_lnrows, tmp_rnrows, parent_mean,
tmp_meanleft, tmp_mse_left, tmp_mse_right);
tmp_meanleft /= tmp_lnrows;
tmp_meanright /= tmp_rnrows;
tmp_mse_left /= tmp_lnrows;
tmp_mse_right /= tmp_rnrows;
T impurity = (tmp_lnrows * 1.0 / totalrows) * tmp_mse_left +
(tmp_rnrows * 1.0 / totalrows) * tmp_mse_right;
float info_gain =
(float)(sparsetree[parentid].best_metric_val - impurity);

Expand Down
25 changes: 14 additions & 11 deletions cpp/src/decisiontree/levelalgo/levelkernel_regressor.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,11 @@ struct MSEImpurity {
static HDI T exec(const unsigned int total, const unsigned int left,
const unsigned int right, const T parent_mean,
const T sumleft, const T sumsq_left, const T sumsq_right) {
T temp = sumleft / total;
T sumright = (parent_mean * total) - sumleft;
T left_impurity = (sumsq_left / total) -
(total / left) * (sumleft / total) * (sumleft / total);
T right_impurity = (sumsq_right / total) - (total / right) *
(sumright / total) *
(sumright / total);
T left_impurity = (sumsq_left / total) - (total / left) * temp * temp;
temp = sumright / total;
T right_impurity = (sumsq_right / total) - (total / right) * temp * temp;
return (left_impurity + right_impurity);
}
};
Expand Down Expand Up @@ -499,7 +498,7 @@ __global__ void get_mse_pred_kernel_global(
}

//This is device version of best split in case, used when more than 512 nodes.
template <typename T>
template <typename T, typename Impurity>
__global__ void get_best_split_regression_kernel(
const T *__restrict__ mseout, const T *__restrict__ predout,
const unsigned int *__restrict__ count, const T *__restrict__ parentmean,
Expand Down Expand Up @@ -530,13 +529,17 @@ __global__ void get_best_split_regression_kernel(
if (tmp_lnrows == 0 || tmp_rnrows == 0 || totalrows < min_rpn) continue;
T tmp_meanleft = predout[threadoffset];
T tmp_meanright = parent_mean * parent_count - tmp_meanleft;
T tmp_mse_left = mseout[2 * threadoffset];
T tmp_mse_right = mseout[2 * threadoffset + 1];

T impurity =
Impurity::exec(parent_count, tmp_lnrows, tmp_rnrows, parent_mean,
tmp_meanleft, tmp_mse_left, tmp_mse_right);

tmp_meanleft /= tmp_lnrows;
tmp_meanright /= tmp_rnrows;
T tmp_mse_left = mseout[2 * threadoffset] / tmp_lnrows;
T tmp_mse_right = mseout[2 * threadoffset + 1] / tmp_rnrows;

T impurity = (tmp_lnrows * 1.0 / totalrows) * tmp_mse_left +
(tmp_rnrows * 1.0 / totalrows) * tmp_mse_right;
tmp_mse_left /= tmp_lnrows;
tmp_mse_right /= tmp_rnrows;
float info_gain = (float)(parent_metric - impurity);

if (info_gain > tid_pair.gain) {
Expand Down

0 comments on commit ba5a222

Please sign in to comment.