Optimisations for gpu_hist.
* Use streams to overlap operations (see the sketch after this list).

* Reduce redundant calls to cudaSetDevice().

* ColumnSampler now uses HostDeviceVector to prevent repeatedly copying feature vectors to the device.
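As background for the first bullet: the classic pattern streams enable is overlapping a data transfer with compute by issuing them on different streams (host buffers must be pinned for the copy to be truly asynchronous). The sketch below is illustrative only, with made-up kernel and buffer names; it is not code from this commit.

#include <cuda_runtime.h>

__global__ void Scale(float *data, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= 2.0f;  // arbitrary stand-in for real work
}

int main() {
  const size_t n = 1 << 20;
  float *h_in, *d_a, *d_b;
  cudaMallocHost(&h_in, n * sizeof(float));  // pinned => async-capable copy
  cudaMalloc(&d_a, n * sizeof(float));
  cudaMalloc(&d_b, n * sizeof(float));

  cudaStream_t copy_stream, compute_stream;
  cudaStreamCreate(&copy_stream);
  cudaStreamCreate(&compute_stream);

  // The H2D copy on copy_stream can overlap with the kernel on
  // compute_stream, since the two streams carry independent work.
  cudaMemcpyAsync(d_a, h_in, n * sizeof(float), cudaMemcpyHostToDevice,
                  copy_stream);
  Scale<<<static_cast<int>((n + 255) / 256), 256, 0, compute_stream>>>(d_b, n);

  cudaDeviceSynchronize();
  cudaStreamDestroy(copy_stream);
  cudaStreamDestroy(compute_stream);
  cudaFreeHost(h_in);
  cudaFree(d_a);
  cudaFree(d_b);
  return 0;
}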
RAMitchell committed Mar 12, 2019
1 parent 6fb4c5e commit 55a1f37
Showing 8 changed files with 269 additions and 128 deletions.
45 changes: 36 additions & 9 deletions src/common/device_helpers.cuh
@@ -212,13 +212,23 @@ inline void LaunchN(int device_idx, size_t n, L lambda) {
    return;
  }

  safe_cuda(cudaSetDevice(device_idx));
  const int GRID_SIZE =
      static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
                                              lambda);
}

template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void LaunchN(int device_idx, size_t n, cudaStream_t stream, L lambda) {
  if (n == 0) {
    return;
  }

  const int GRID_SIZE =
      static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(static_cast<size_t>(0),
                                                         n, lambda);
}
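For context, a minimal usage sketch of the new stream-taking overload: launching two independent element-wise updates on separate streams lets the device overlap them. This is hypothetical caller code, not part of this diff; it assumes d_a and d_b are device pointers of length n and that the caller has already selected device_idx.

// Hypothetical usage sketch (not part of this diff).
void OverlapExample(int device_idx, float *d_a, float *d_b, size_t n) {
  cudaStream_t stream_a, stream_b;
  dh::safe_cuda(cudaStreamCreate(&stream_a));
  dh::safe_cuda(cudaStreamCreate(&stream_b));

  // Independent launches on different streams may execute concurrently.
  dh::LaunchN(device_idx, n, stream_a,
              [=] __device__(size_t i) { d_a[i] *= 2.0f; });
  dh::LaunchN(device_idx, n, stream_b,
              [=] __device__(size_t i) { d_b[i] += 1.0f; });

  dh::safe_cuda(cudaStreamSynchronize(stream_a));
  dh::safe_cuda(cudaStreamSynchronize(stream_b));
  dh::safe_cuda(cudaStreamDestroy(stream_a));
  dh::safe_cuda(cudaStreamDestroy(stream_b));
}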
/*
 * Memory
 */
@@ -247,7 +257,6 @@ class DVec {
    ptr_ = static_cast<T *>(ptr);
    size_ = size;
    device_idx_ = device_idx;
    safe_cuda(cudaSetDevice(device_idx_));
  }

  DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
@@ -269,7 +278,6 @@ class DVec {

  std::vector<T> AsVector() const {
    std::vector<T> h_vector(Size());
    safe_cuda(cudaSetDevice(device_idx_));
    safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
                         cudaMemcpyDeviceToHost));
    return h_vector;
@@ -306,7 +314,6 @@ class DVec {
      throw std::runtime_error(
          "Cannot copy assign DVec to DVec, sizes are different");
    }
    safe_cuda(cudaSetDevice(this->DeviceIdx()));
    if (other.DeviceIdx() == this->DeviceIdx()) {
      dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
                                    other.Size() * sizeof(T),
@@ -324,7 +331,6 @@ class DVec {

  template <typename IterT>
  void copy(IterT begin, IterT end) {
    safe_cuda(cudaSetDevice(this->DeviceIdx()));
    if (end - begin != Size()) {
      LOG(FATAL) << "Cannot copy assign vector to DVec, sizes are different" <<
          " vector::Size(): " << end - begin << " DVec::Size(): " << Size();
@@ -333,7 +339,6 @@ class DVec {
  }

  void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
    safe_cuda(cudaSetDevice(this->DeviceIdx()));
    if (end - begin != Size()) {
      throw std::runtime_error(
          "Cannot copy assign vector to dvec, sizes are different");
@@ -431,7 +436,6 @@ class BulkAllocator {

  char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
    char *ptr;
    safe_cuda(cudaSetDevice(device_idx));
    safe_cuda(cudaMalloc(&ptr, bytes));
    return ptr;
  }
@@ -473,7 +477,6 @@ class BulkAllocator {
  ~BulkAllocator() {
    for (size_t i = 0; i < d_ptr_.size(); i++) {
      if (!(d_ptr_[i] == nullptr)) {
        safe_cuda(cudaSetDevice(device_idx_[i]));
        safe_cuda(cudaFree(d_ptr_[i]));
        d_ptr_[i] = nullptr;
      }
@@ -499,6 +502,31 @@ class BulkAllocator {
  }
};

// Keep track of pinned memory allocation
struct PinnedMemory {
  void *temp_storage{nullptr};
  size_t temp_storage_bytes{0};

  ~PinnedMemory() { Free(); }

  template <typename T>
  xgboost::common::Span<T> GetSpan(size_t size) {
    size_t num_bytes = size * sizeof(T);
    if (num_bytes > temp_storage_bytes) {
      Free();
      safe_cuda(cudaMallocHost(&temp_storage, num_bytes));
      temp_storage_bytes = num_bytes;
    }
    return xgboost::common::Span<T>(static_cast<T *>(temp_storage), size);
  }

  void Free() {
    if (temp_storage != nullptr) {
      safe_cuda(cudaFreeHost(temp_storage));
    }
  }
};
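A hedged sketch of how this helper might be used: pinned (page-locked) host memory is what makes cudaMemcpyAsync genuinely asynchronous, so a reusable PinnedMemory scratch buffer pairs naturally with the stream overloads above. The names below (d_results, n_nodes, stream) are illustrative assumptions, not code from this commit.

// Illustrative only: read device results back through a reusable pinned
// buffer; the async copy can overlap with work on other streams.
void ReadBackExample(dh::PinnedMemory *pinned, const float *d_results,
                     size_t n_nodes, cudaStream_t stream) {
  auto h_results = pinned->GetSpan<float>(n_nodes);
  dh::safe_cuda(cudaMemcpyAsync(h_results.data(), d_results,
                                n_nodes * sizeof(float),
                                cudaMemcpyDeviceToHost, stream));
  dh::safe_cuda(cudaStreamSynchronize(stream));  // wait before reading host side
}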

// Keep track of cub library device allocation
struct CubMemory {
  void *d_temp_storage;
@@ -653,7 +681,6 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                        OffsetT count, SegmentIterT segments,
                        OffsetT num_segments, FunctionT f) {
  typedef typename cub::CubVector<OffsetT, 2>::Type CoordinateT;
  dh::safe_cuda(cudaSetDevice(device_idx));
  const int BLOCK_THREADS = 256;
  const int ITEMS_PER_THREAD = 1;
  const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
39 changes: 22 additions & 17 deletions src/common/random.h
@@ -18,6 +18,7 @@
#include <random>

#include "io.h"
#include "host_device_vector.h"

namespace xgboost {
namespace common {
@@ -84,28 +85,31 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
 */

class ColumnSampler {
  std::shared_ptr<std::vector<int>> feature_set_tree_;
  std::map<int, std::shared_ptr<std::vector<int>>> feature_set_level_;
  std::shared_ptr<HostDeviceVector<int>> feature_set_tree_;
  std::map<int, std::shared_ptr<HostDeviceVector<int>>> feature_set_level_;
  float colsample_bylevel_{1.0f};
  float colsample_bytree_{1.0f};
  float colsample_bynode_{1.0f};

  std::shared_ptr<std::vector<int>> ColSample
  (std::shared_ptr<std::vector<int>> p_features, float colsample) const {
  std::shared_ptr<HostDeviceVector<int>> ColSample
  (std::shared_ptr<HostDeviceVector<int>> p_features, float colsample) const {
    if (colsample == 1.0f) return p_features;
    const auto& features = *p_features;
    const auto& features = p_features->HostVector();
    CHECK_GT(features.size(), 0);
    int n = std::max(1, static_cast<int>(colsample * features.size()));
    auto p_new_features = std::make_shared<std::vector<int>>();
    auto p_new_features = std::make_shared<HostDeviceVector<int>>();
    auto& new_features = *p_new_features;
    new_features.resize(features.size());
    std::copy(features.begin(), features.end(), new_features.begin());
    std::shuffle(new_features.begin(), new_features.end(), common::GlobalRandom());
    new_features.resize(n);
    std::sort(new_features.begin(), new_features.end());
    new_features.Resize(features.size());
    std::copy(features.begin(), features.end(),
              new_features.HostVector().begin());
    std::shuffle(new_features.HostVector().begin(),
                 new_features.HostVector().end(), common::GlobalRandom());
    new_features.Resize(n);
    std::sort(new_features.HostVector().begin(),
              new_features.HostVector().end());

    // ensure that new_features are the same across ranks
    rabit::Broadcast(&new_features, 0);
    rabit::Broadcast(&new_features.HostVector(), 0);

    return p_new_features;
  }
@@ -127,13 +131,14 @@ class ColumnSampler {
    colsample_bynode_ = colsample_bynode;

    if (feature_set_tree_ == nullptr) {
      feature_set_tree_ = std::make_shared<std::vector<int>>();
      feature_set_tree_ = std::make_shared<HostDeviceVector<int>>();
    }
    Reset();

    int begin_idx = skip_index_0 ? 1 : 0;
    feature_set_tree_->resize(num_col - begin_idx);
    std::iota(feature_set_tree_->begin(), feature_set_tree_->end(), begin_idx);
    feature_set_tree_->Resize(num_col - begin_idx);
    std::iota(feature_set_tree_->HostVector().begin(),
              feature_set_tree_->HostVector().end(), begin_idx);

    feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_);
  }
@@ -142,7 +147,7 @@ class ColumnSampler {
  /*
   * \brief Resets this object.
   */
  void Reset() {
    feature_set_tree_->clear();
    feature_set_tree_->Resize(0);
    feature_set_level_.clear();
  }
@@ -154,7 +159,7 @@ class ColumnSampler {
   * \note If colsample_bynode_ < 1.0, this method creates a new feature set each time it
   * is called. Therefore, it should be called only once per node.
   */
  std::shared_ptr<std::vector<int>> GetFeatureSet(int depth) {
  std::shared_ptr<HostDeviceVector<int>> GetFeatureSet(int depth) {
    if (colsample_bylevel_ == 1.0f && colsample_bynode_ == 1.0f) {
      return feature_set_tree_;
    }
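The payoff of storing the feature set in a HostDeviceVector is on the consuming side: gpu_hist can ask the same object for a device view, so the host-to-device copy happens at most once per sampled set instead of on every use. A rough sketch of such a consumer, where the DeviceSpan accessor name and the kernel body are assumptions for illustration, not code from this diff:

// Illustrative consumer sketch (accessor names are assumptions).
auto p_feature_set = column_sampler.GetFeatureSet(depth);
auto d_features = p_feature_set->DeviceSpan(device_idx);  // cached on device
dh::LaunchN(device_idx, d_features.size(), [=] __device__(size_t i) {
  int fidx = d_features[i];
  // ... evaluate split candidates for feature fidx ...
});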
3 changes: 1 addition & 2 deletions src/tree/updater_colmaker.cc
@@ -631,8 +631,7 @@ class ColMaker: public TreeUpdater {
                     const std::vector<GradientPair> &gpair,
                     DMatrix *p_fmat,
                     RegTree *p_tree) {
    auto p_feature_set = column_sampler_.GetFeatureSet(depth);
    const auto& feat_set = *p_feature_set;
    auto &feat_set = column_sampler_.GetFeatureSet(depth)->HostVector();
    for (const auto &batch : p_fmat->GetSortedColumnBatches()) {
      this->UpdateSolution(batch, feat_set, gpair, p_fmat);
    }
12 changes: 12 additions & 0 deletions src/tree/updater_gpu_common.cuh
@@ -125,6 +125,18 @@ struct DeviceSplitCandidate {
  XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
};

struct DeviceSplitCandidateReduceOp {
  GPUTrainingParam param;
  DeviceSplitCandidateReduceOp(GPUTrainingParam param) : param(param) {}
  XGBOOST_DEVICE DeviceSplitCandidate operator()(
      const DeviceSplitCandidate& a, const DeviceSplitCandidate& b) const {
    DeviceSplitCandidate best;
    best.Update(a, param);
    best.Update(b, param);
    return best;
  }
};
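This functor is shaped for use as a binary reduction operator, for instance with thrust::reduce or cub::DeviceReduce, to pick the best candidate across per-feature splits. A hedged sketch under assumed inputs (the candidate buffer, its length, and param come from the caller; a default-constructed candidate acts as the identity, since IsValid() is false until Update accepts a better split):

#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

// Illustrative sketch: reduce per-feature split candidates to the best one.
DeviceSplitCandidate BestSplit(const DeviceSplitCandidate *d_candidates,
                               int num_candidates, GPUTrainingParam param) {
  return thrust::reduce(thrust::device, d_candidates,
                        d_candidates + num_candidates,
                        DeviceSplitCandidate(),  // identity: invalid split
                        DeviceSplitCandidateReduceOp(param));
}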

struct DeviceNodeStats {
  GradientPair sum_gradients;
  float root_gain;