Rebase
RAMitchell committed Aug 29, 2018
1 parent bf950c2 commit 0fb1ef1
Showing 10 changed files with 64 additions and 75 deletions.
21 changes: 5 additions & 16 deletions include/xgboost/data.h
@@ -236,31 +236,20 @@ class SparsePage {
* \param inst an instance row
*/
inline void Push(const Inst &inst) {
<<<<<<< HEAD
offset.push_back(offset.back() + inst.size());
size_t begin = data.size();
data.resize(begin + inst.size());
if (inst.size() != 0) {
std::memcpy(dmlc::BeginPtr(data) + begin, inst.data(),
sizeof(Entry) * inst.size());
=======
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
offset_vec.push_back(offset_vec.back() + inst.length);
offset_vec.push_back(offset_vec.back() + inst.size());
size_t begin = data.Size();
data_vec.resize(begin + inst.length);
if (inst.length != 0) {
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data,
sizeof(Entry) * inst.length);
>>>>>>> Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage.
data_vec.resize(begin + inst.size());
if (inst.size() != 0) {
std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
sizeof(Entry) * inst.size());
}
}

size_t Size() { return offset.Size() - 1; }
};



/*!
* \brief This is data structure that user can pass to DMatrix::Create
* to create a DMatrix for training, user can create this data structure
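
For readability, this is how the resolved Push reads once the conflict markers and the pre-HostDeviceVector branch above are dropped — a reconstruction from the hunk, assuming `data` is a HostDeviceVector<Entry> and `offset` a HostDeviceVector<size_t>:

    inline void Push(const Inst &inst) {
      // Mutable host views of the HostDeviceVector members; the vectors are
      // grown on the host here and synced to devices lazily, elsewhere.
      auto& data_vec = data.HostVector();
      auto& offset_vec = offset.HostVector();
      offset_vec.push_back(offset_vec.back() + inst.size());
      size_t begin = data.Size();
      data_vec.resize(begin + inst.size());
      if (inst.size() != 0) {
        std::memcpy(dmlc::BeginPtr(data_vec) + begin, inst.data(),
                    sizeof(Entry) * inst.size());
      }
    }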
8 changes: 4 additions & 4 deletions src/c_api/c_api.cc
@@ -338,10 +338,10 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
CHECK_LE(mat.info.num_row_, num_row);
// provision for empty rows at the bottom of matrix
for (uint64_t i = mat.info.num_row_; i < static_cast<uint64_t>(num_row); ++i) {
mat.page_.offset.push_back(mat.page_.offset.back());
offset_vec.push_back(offset_vec.back());
}
mat.info.num_row_ = num_row;
CHECK_EQ(mat.info.num_row_, mat.page_.offset.size() - 1); // sanity check
CHECK_EQ(mat.info.num_row_, offset_vec.size() - 1); // sanity check
}
mat.info.num_col_ = ncol;
mat.info.num_nonzero_ = nelem;
@@ -707,9 +707,9 @@ XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle,
const int ridx = idxset[i];
auto inst = batch[ridx];
CHECK_LT(static_cast<xgboost::bst_ulong>(ridx), batch.Size());
ret.page_.data.insert(ret.page_.data.end(), inst.data(),
data_vec.insert(data_vec.end(), inst.data(),
inst.data() + inst.size());
ret.page_.offset.push_back(ret.page_.offset.back() + inst.size());
offset_vec.push_back(offset_vec.back() + inst.size());
ret.info.num_nonzero_ += inst.size();

if (src_labels.size() != 0) {
30 changes: 17 additions & 13 deletions src/common/hist_util.cu
@@ -118,7 +118,7 @@ struct GPUSketcher {

void Init(const SparsePage& row_batch, const MetaInfo& info) {
num_cols_ = info.num_col_;
has_weights_ = info.weights_.size() > 0;
has_weights_ = info.weights_.Size() > 0;

// find the batch size
if (param_.gpu_batch_nrows == 0) {
@@ -282,19 +282,23 @@ struct GPUSketcher {
size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows_,
static_cast<size_t>(n_rows_));
size_t batch_nrows = batch_row_end - batch_row_begin;
size_t n_entries =
row_batch.offset[row_begin_ + batch_row_end] -
row_batch.offset[row_begin_ + batch_row_begin];

const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();

size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
offset_vec[row_begin_ + batch_row_begin];
// copy the batch to the GPU
dh::safe_cuda
(cudaMemcpy(entries_.data().get(),
&row_batch.data[row_batch.offset[row_begin_ + batch_row_begin]],
n_entries * sizeof(Entry), cudaMemcpyDefault));
dh::safe_cuda(
cudaMemcpy(entries_.data().get(),
data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
// copy the weights if necessary
if (has_weights_) {
const auto& weights_vec = info.weights_.HostVector();
dh::safe_cuda
(cudaMemcpy(weights_.data().get(),
info.weights_.data() + row_begin_ + batch_row_begin,
weights_vec.data() + row_begin_ + batch_row_begin,
batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
}

@@ -310,7 +314,7 @@ struct GPUSketcher {
row_ptrs_.data().get() + batch_row_begin,
has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
gpu_batch_nrows_, num_cols_,
row_batch.offset[row_begin_ + batch_row_begin], batch_nrows);
offset_vec[row_begin_ + batch_row_begin], batch_nrows);
dh::safe_cuda(cudaGetLastError()); // NOLINT
dh::safe_cuda(cudaDeviceSynchronize()); // NOLINT

@@ -331,10 +335,10 @@ struct GPUSketcher {
void Sketch(const SparsePage& row_batch, const MetaInfo& info) {
// copy rows to the device
dh::safe_cuda(cudaSetDevice(device_));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs_.resize(n_rows_ + 1);
thrust::copy(row_batch.offset.data() + row_begin_,
row_batch.offset.data() + row_end_ + 1,
row_ptrs_.begin());
thrust::copy(offset_vec.data() + row_begin_,
offset_vec.data() + row_end_ + 1, row_ptrs_.begin());

size_t gpu_nbatches = dh::DivRoundUp(n_rows_, gpu_batch_nrows_);

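
The edits in this file all follow one pattern: take const host views of the SparsePage once, then copy the needed slice to the GPU. A minimal sketch of that pattern, with the row range (`row_begin`, `row_end`) and the device buffer `entries` as illustrative stand-ins for the sketcher's actual members:

    // Sketch only: copy the entries of rows [row_begin, row_end) from a
    // SparsePage into a preallocated thrust::device_vector<Entry>.
    const auto& offset_vec = row_batch.offset.HostVector();
    const auto& data_vec = row_batch.data.HostVector();
    size_t n_entries = offset_vec[row_end] - offset_vec[row_begin];
    dh::safe_cuda(cudaMemcpy(entries.data().get(),
                             data_vec.data() + offset_vec[row_begin],
                             n_entries * sizeof(Entry), cudaMemcpyDefault));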
46 changes: 23 additions & 23 deletions src/common/host_device_vector.cu
@@ -45,7 +45,7 @@ struct HostDeviceVectorImpl {
void Init(HostDeviceVectorImpl<T>* vec, int device) {
if (vec_ == nullptr) { vec_ = vec; }
CHECK_EQ(vec, vec_);
device_ = device % dh::NVisibleDevices();
device_ = device;
index_ = vec_->distribution_.devices_.Index(device);
LazyResize(vec_->Size());
perm_d_ = vec_->perm_h_.Complementary();
@@ -193,10 +193,19 @@ struct HostDeviceVectorImpl {
}

common::Span<T> DeviceSpan(int device) {
CHECK(devices_.Contains(device));
LazySyncDevice(device);
return { shards_[devices_.Index(device)].data_.data().get(),
static_cast<typename common::Span<T>::index_type>(Size()) };
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kWrite);
return {shards_[devices.Index(device)].data_.data().get(),
static_cast<typename common::Span<T>::index_type>(Size())};
}

common::Span<const T> ConstDeviceSpan(int device) {
GPUSet devices = distribution_.devices_;
CHECK(devices.Contains(device));
LazySyncDevice(device, GPUAccess::kRead);
return{ shards_[devices.Index(device)].data_.data().get(),
static_cast<typename common::Span<const T>::index_type>(Size()) };
}

size_t DeviceSize(int device) {
@@ -229,15 +238,10 @@ struct HostDeviceVectorImpl {

void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
CHECK_EQ(end - begin, Size());
<<<<<<< HEAD
if (on_h_) {
if (perm_h_.CanWrite()) {
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
(end - begin) * sizeof(T),
cudaMemcpyDeviceToHost));
=======
if (perm_h_.CanWrite()) {
thrust::copy(begin, end, data_h_.begin());
>>>>>>> Added read-only state for HostDeviceVector sync.
} else {
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
shard.ScatterFrom(begin.get());
@@ -440,15 +444,19 @@ common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
return impl_->DeviceSpan(device);
}

template <typename T>
common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
return impl_->ConstDeviceSpan(device);
}

template <typename T>
const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
return impl_->ConstDevicePointer(device);
}

template <typename T>
size_t HostDeviceVector<T>::DeviceStart(int device) { return impl_->DeviceStart(device); }
const T* HostDeviceVector<T>::DevicePointer(int device) const {
return impl_->DevicePointer(device);
size_t HostDeviceVector<T>::DeviceStart(int device) const {
return impl_->DeviceStart(device);
}

template <typename T>
@@ -544,16 +552,8 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<GradientPair>;
<<<<<<< HEAD
<<<<<<< HEAD
template class HostDeviceVector<unsigned int>;
template class HostDeviceVector<int>;
=======
template class HostDeviceVector<uint32_t>;
>>>>>>> Fixed linter and test errors.
=======
template class HostDeviceVector<int>;
>>>>>>> Fixed explicit template instantiation errors for HostDeviceVector.
template class HostDeviceVector<Entry>;
template class HostDeviceVector<size_t>;

2 changes: 2 additions & 0 deletions src/common/host_device_vector.h
@@ -194,6 +194,8 @@ class HostDeviceVector {
GPUSet Devices() const;
const GPUDistribution& Distribution() const;
common::Span<T> DeviceSpan(int device);
common::Span<const T> ConstDeviceSpan(int device) const;
common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
T* DevicePointer(int device);
const T* ConstDevicePointer(int device) const;
const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
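
A short usage sketch of the split this header introduces between writable and read-only device views; `vec` is a hypothetical HostDeviceVector<bst_float> assumed to be already sized and distributed to device 0:

    // ConstDeviceSpan requests only read access (GPUAccess::kRead), so a valid
    // host copy is preserved; DeviceSpan requests write access (kWrite) and
    // makes the device copy authoritative, as in host_device_vector.cu above.
    common::Span<const bst_float> in = vec.ConstDeviceSpan(0);
    common::Span<bst_float> out = vec.DeviceSpan(0);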
2 changes: 1 addition & 1 deletion src/objective/hinge.cc
@@ -38,7 +38,7 @@ class HingeObj : public ObjFunction {
auto& gpair = out_gpair->HostVector();

for (size_t i = 0; i < preds_h.size(); ++i) {
auto y = info.labels_[i] * 2.0 - 1.0;
auto y = labels_h[i] * 2.0 - 1.0;
bst_float p = preds_h[i];
bst_float w = info.GetWeight(i);
bst_float g, h;
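
The `labels_h` reference used in the rewritten line is declared just above the visible hunk; a hedged reconstruction of that declaration, assuming MetaInfo::labels_ is now a HostDeviceVector<bst_float>:

    // Assumed declaration (outside the shown hunk): labels are read through the
    // HostDeviceVector host view instead of being indexed directly.
    const auto& labels_h = info.labels_.HostVector();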
13 changes: 4 additions & 9 deletions src/objective/regression_obj_gpu.cu
@@ -120,14 +120,14 @@ class GPURegLossObj : public ObjFunction {
#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
for (int i = 0; i < devices_.Size(); ++i) {
int d = devices_[i];
dh::safe_cuda(cudaSetDevice(d % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(d));
const int block = 256;
size_t n = preds.DeviceSize(d);
if (n > 0) {
get_gradient_k<Loss><<<dh::DivRoundUp(n, block), block>>>(
out_gpair->DeviceSpan(d), label_correct_.DeviceSpan(d),
preds->DeviceSpan(d), labels_.DeviceSpan(d),
info.weights_.Size() > 0 ? weights_.DevicePointer(d) : nullptr, n,
preds.ConstDeviceSpan(d), info.labels_.ConstDeviceSpan(d),
info.weights_.Size() > 0 ? info.weights_.DevicePointer(d) : nullptr, n,
param_.scale_pos_weight);
dh::safe_cuda(cudaGetLastError());
}
@@ -157,17 +157,12 @@ class GPURegLossObj : public ObjFunction {
#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1)
for (int i = 0; i < devices_.Size(); ++i) {
int d = devices_[i];
dh::safe_cuda(cudaSetDevice(d % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(d));
const int block = 256;
size_t n = preds->DeviceSize(d);
if (n > 0) {
<<<<<<< HEAD
pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>(
preds->DeviceSpan(d), n);
=======
pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>
(preds->DevicePointer(d), n);
>>>>>>> Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage.
dh::safe_cuda(cudaGetLastError());
}
dh::safe_cuda(cudaDeviceSynchronize());
4 changes: 2 additions & 2 deletions src/predictor/gpu_predictor.cu
@@ -76,8 +76,8 @@ struct DeviceMatrix {
auto end_itr = begin_itr + batch.Size() + 1;
IncrementOffset(begin_itr, end_itr, batch.base_rowid);
}
dh::safe_cuda(cudaMemcpy(data.Data() + data_offset, batch.data.data(),
sizeof(Entry) * batch.data.size(),
dh::safe_cuda(cudaMemcpy(data.Data() + data_offset, data_vec.data(),
sizeof(Entry) * data_vec.size(),
cudaMemcpyHostToDevice));
// Copy data
data_offset += batch.data.Size();
7 changes: 3 additions & 4 deletions src/tree/updater_gpu_hist.cu
@@ -387,11 +387,7 @@ struct DeviceShard {

void InitRowPtrs(const SparsePage& row_batch) {
dh::safe_cuda(cudaSetDevice(device_idx));
thrust::device_vector<float> cuts_d(hmat.cut);
thrust::device_vector<size_t> cut_row_ptrs_d(hmat.row_ptr);

auto& offset_vec = row_batch.offset.HostVector();
auto& data_vec = row_batch.data.HostVector();
// find the maximum row size
thrust::device_vector<size_t> row_ptr_d(
&offset_vec[row_begin_idx], &offset_vec[row_end_idx + 1]);
@@ -437,6 +433,9 @@ struct DeviceShard {
(dh::TotalMemory(device_idx) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(n_rows));

const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();

thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);

size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
6 changes: 3 additions & 3 deletions tests/cpp/common/test_host_device_vector.cu
@@ -52,7 +52,7 @@ void InitHostDeviceVector(size_t n, const GPUDistribution& distribution,
void PlusOne(HostDeviceVector<int> *v) {
int n_devices = v->Devices().Size();
for (int i = 0; i < n_devices; ++i) {
dh::safe_cuda(cudaSetDevice(i % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(i));
thrust::transform(v->tbegin(i), v->tend(i), v->tbegin(i),
[=]__device__(unsigned int a){ return a + 1; });
}
@@ -66,7 +66,7 @@ void CheckDevice(HostDeviceVector<int> *v,
ASSERT_EQ(v->Devices().Size(), n_devices);
for (int i = 0; i < n_devices; ++i) {
ASSERT_EQ(v->DeviceSize(i), sizes.at(i));
dh::safe_cuda(cudaSetDevice(i % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(i));
ASSERT_TRUE(thrust::equal(v->tcbegin(i), v->tcend(i),
thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
@@ -76,7 +76,7 @@
ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
for (int i = 0; i < n_devices; ++i) {
dh::safe_cuda(cudaSetDevice(i % dh::NVisibleDevices()));
dh::safe_cuda(cudaSetDevice(i));
ASSERT_TRUE(thrust::equal(v->tbegin(i), v->tend(i),
thrust::make_counting_iterator(first + starts[i])));
ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead));
