Implement transform to reduce CPU/GPU code duplication.
* Implement Transform class.
* Add tests for softmax.
* Use Transform in regression, softmax and hinge objectives, except for Cox.
* Mark old gpu objective functions deprecated.
* static_assert for softmax.
* Split up multi-gpu tests.
trivialfis committed Sep 30, 2018
1 parent baef574 commit 4212aba
Showing 31 changed files with 1,513 additions and 997 deletions.
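
The Transform class named in the commit message lands in files that are not expanded in this view, so the following is only a minimal sketch of the pattern it describes: one element-wise functor, written once, dispatched either to a host loop or to a CUDA kernel. All names and signatures here are illustrative assumptions, not the committed API.

// Hedged sketch only: not the actual Transform API from this commit.
// Assumes the functor's operator() is __host__ __device__ callable.
#include <cstddef>

#if defined(__CUDACC__)
template <typename Functor>
__global__ void LaunchNKernel(std::size_t n, Functor func) {
  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { func(i); }  // same functor body as the host path
}
#endif

// Single entry point: an objective writes `func` once and picks CPU or GPU here.
template <typename Functor>
void TransformN(std::size_t n, bool use_gpu, Functor func) {
#if defined(__CUDACC__)
  if (use_gpu) {
    constexpr int kBlockSize = 256;
    unsigned const n_blocks =
        static_cast<unsigned>((n + kBlockSize - 1) / kBlockSize);
    LaunchNKernel<<<n_blocks, kBlockSize>>>(n, func);
    return;
  }
#endif
  for (std::size_t i = 0; i < n; ++i) {
    func(i);  // host fallback: plain loop over the same functor
  }
}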
4 changes: 3 additions & 1 deletion src/common/common.cc
@@ -1,9 +1,11 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2018 by Contributors
  * \file common.cc
  * \brief Enable all kinds of global variables in common.
  */
+#include <dmlc/thread_local.h>
+
 #include "common.h"
 #include "./random.h"
 
 namespace xgboost {
2 changes: 1 addition & 1 deletion src/common/common.cu
@@ -11,7 +11,7 @@ int AllVisibleImpl::AllVisible() {
     // When compiled with CUDA but running on CPU only device,
     // cudaGetDeviceCount will fail.
     dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-  } catch(const std::exception& e) {
+  } catch(const thrust::system::system_error& err) {
     return 0;
   }
   return n_visgpus;
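
For context, a minimal sketch of the probing pattern this catch clause protects, assuming a CUDA-enabled build running on a machine without a usable GPU; the helper name is hypothetical.

#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>
#include <cuda_runtime_api.h>

// Hypothetical helper mirroring AllVisibleImpl::AllVisible() above.
int VisibleGpus() {
  int n_visgpus = 0;
  try {
    cudaError_t status = cudaGetDeviceCount(&n_visgpus);
    if (status != cudaSuccess) {
      // dh::safe_cuda performs this conversion in the real code path.
      throw thrust::system_error(status, thrust::cuda_category());
    }
  } catch (const thrust::system::system_error&) {
    n_visgpus = 0;  // CUDA build, but no device or driver available
  }
  return n_visgpus;
}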
36 changes: 21 additions & 15 deletions src/common/common.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2018 by Contributors
  * \file common.h
  * \brief Common utilities
  */
@@ -19,6 +19,13 @@
 #if defined(__CUDACC__)
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+
+#define WITH_CUDA() true
+
+#else
+
+#define WITH_CUDA() false
+
 #endif
 
 namespace dh {
@@ -29,11 +36,11 @@ namespace dh {
 #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
 
 inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
-                                    int line) {
+                                     int line) {
   if (code != cudaSuccess) {
-    throw thrust::system_error(code, thrust::cuda_category(),
-                               std::string{file} + "(" +  // NOLINT
-                               std::to_string(line) + ")");
+    LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
+                                       std::string{file} + ": " +  // NOLINT
+                                       std::to_string(line)).what();
   }
   return code;
 }
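
A brief usage sketch of the macro above: any CUDA runtime call can be wrapped, and after this change a failure is reported through LOG(FATAL) with file and line instead of an exception escaping the wrapper. The function name is hypothetical; the wrapped calls are ordinary CUDA runtime functions.

// Hypothetical example of dh::safe_cuda in host code.
void UseDevice(int device_idx) {
  dh::safe_cuda(cudaSetDevice(device_idx));
  dh::safe_cuda(cudaDeviceSynchronize());  // errors abort with a logged file:line message
}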
@@ -70,13 +77,13 @@ inline std::string ToString(const T& data) {
  */
 class Range {
  public:
+  using DifferenceType = int64_t;
+
   class Iterator {
     friend class Range;
 
    public:
-    using DifferenceType = int64_t;
-
-    XGBOOST_DEVICE int64_t operator*() const { return i_; }
+    XGBOOST_DEVICE DifferenceType operator*() const { return i_; }
     XGBOOST_DEVICE const Iterator &operator++() {
       i_ += step_;
       return *this;
@@ -97,8 +104,8 @@ class Range {
     XGBOOST_DEVICE void Step(DifferenceType s) { step_ = s; }
 
    protected:
-    XGBOOST_DEVICE explicit Iterator(int64_t start) : i_(start) {}
-    XGBOOST_DEVICE explicit Iterator(int64_t start, int step) :
+    XGBOOST_DEVICE explicit Iterator(DifferenceType start) : i_(start) {}
+    XGBOOST_DEVICE explicit Iterator(DifferenceType start, DifferenceType step) :
         i_{start}, step_{step} {}
 
    public:
@@ -109,9 +116,10 @@
   XGBOOST_DEVICE Iterator begin() const { return begin_; }  // NOLINT
   XGBOOST_DEVICE Iterator end() const { return end_; }      // NOLINT
 
-  XGBOOST_DEVICE Range(int64_t begin, int64_t end)
+  XGBOOST_DEVICE Range(DifferenceType begin, DifferenceType end)
       : begin_(begin), end_(end) {}
-  XGBOOST_DEVICE Range(int64_t begin, int64_t end, Iterator::DifferenceType step)
+  XGBOOST_DEVICE Range(DifferenceType begin, DifferenceType end,
+                       DifferenceType step)
       : begin_(begin, step), end_(end) {}
 
   XGBOOST_DEVICE bool operator==(const Range& other) const {
@@ -121,9 +129,7 @@
     return !(*this == other);
   }
 
-  XGBOOST_DEVICE void Step(Iterator::DifferenceType s) { begin_.Step(s); }
-
-  XGBOOST_DEVICE Iterator::DifferenceType GetStep() const { return begin_.step_; }
+  XGBOOST_DEVICE void Step(DifferenceType s) { begin_.Step(s); }
 
  private:
   Iterator begin_;
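
A small usage sketch of Range after this change, with hypothetical values; the point of these hunks is that index arithmetic now flows through the single Range::DifferenceType alias instead of a mix of int64_t and int.

#include "common.h"  // xgboost::common::Range

void RangeExample() {
  // Iterates 0, 2, 4, 6; each value has type Range::DifferenceType (int64_t).
  xgboost::common::Range r{0, 8, 2};
  for (auto idx : r) {
    (void)idx;  // loop body would index into a buffer here
  }
}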
21 changes: 20 additions & 1 deletion src/common/device_helpers.cuh
@@ -9,6 +9,7 @@
 #include <xgboost/logging.h>
 
 #include "common.h"
+#include "span.h"
 
 #include <algorithm>
 #include <chrono>
@@ -955,7 +956,7 @@ class SaveCudaContext {
     // cudaGetDevice will fail.
     try {
       safe_cuda(cudaGetDevice(&saved_device_));
-    } catch (thrust::system::system_error & err) {
+    } catch (const thrust::system::system_error & err) {
       saved_device_ = -1;
     }
     func();
@@ -1035,4 +1036,22 @@ ReduceT ReduceShards(std::vector<ShardT> *shards, FunctionT f) {
   };
   return std::accumulate(sums.begin(), sums.end(), ReduceT());
 }
+
+template <typename T,
+          typename IndexT = typename xgboost::common::Span<T>::index_type>
+xgboost::common::Span<T> ToSpan(
+    thrust::device_vector<T>& vec,
+    IndexT offset = 0,
+    IndexT size = -1) {
+  size = size == -1 ? vec.size() : size;
+  CHECK_LE(offset + size, vec.size());
+  return {vec.data().get() + offset, static_cast<IndexT>(size)};
+}
+
+template <typename T>
+xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
+                                size_t offset, size_t size) {
+  using IndexT = typename xgboost::common::Span<T>::index_type;
+  return ToSpan(vec, static_cast<IndexT>(offset), static_cast<IndexT>(size));
+}
 }  // namespace dh
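
A usage sketch of the new ToSpan helpers, with hypothetical sizes: they let device-facing code view a thrust::device_vector, or a slice of it, as a xgboost::common::Span without copying.

#include <thrust/device_vector.h>
#include "device_helpers.cuh"

void ToSpanExample() {
  thrust::device_vector<float> data(16);
  auto whole = dh::ToSpan(data);        // spans all 16 elements
  auto tail  = dh::ToSpan(data, 8, 8);  // spans elements [8, 16)
  (void)whole; (void)tail;
}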
22 changes: 14 additions & 8 deletions src/common/host_device_vector.cu
@@ -116,6 +116,7 @@ struct HostDeviceVectorImpl {
     int ndevices = vec_->distribution_.devices_.Size();
     start_ = vec_->distribution_.ShardStart(new_size, index_);
     proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
+    // The size on this device.
     size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
     SetDevice();
     data_.resize(size_d);
@@ -230,15 +231,15 @@ struct HostDeviceVectorImpl {
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kWrite);
     return {shards_[devices.Index(device)].data_.data().get(),
-        static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
+            static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
   }
 
   common::Span<const T> ConstDeviceSpan(int device) {
     GPUSet devices = distribution_.devices_;
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
     return {shards_[devices.Index(device)].data_.data().get(),
-        static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
+            static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
   }
 
   size_t DeviceSize(int device) {
@@ -289,7 +290,6 @@ struct HostDeviceVectorImpl {
                               data_h_.size() * sizeof(T),
                               cudaMemcpyHostToDevice));
     } else {
-      //
       dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
     }
   }
@@ -304,14 +304,20 @@
 
   void Copy(HostDeviceVectorImpl<T>* other) {
     CHECK_EQ(Size(), other->Size());
+    // Data is on host.
     if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
       std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
-    } else {
-      CHECK(distribution_ == other->distribution_);
-      dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Copy(&other->shards_[i]);
-      });
+      return;
     }
+    // Data is on device;
+    if (distribution_ != other->distribution_) {
+      distribution_ = GPUDistribution();
+      Reshard(other->Distribution());
+      size_d_ = other->size_d_;
+    }
+    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
+      shard.Copy(&other->shards_[i]);
+    });
   }
 
   void Copy(const std::vector<T>& other) {
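
A hedged usage sketch of the reworked copy path; the public HostDeviceVector methods and their exact signatures are assumed to mirror the impl above. The behavioural change in this hunk is that copying between vectors with different device distributions now reshards the destination instead of requiring matching distributions via CHECK.

#include "host_device_vector.h"

// Signatures assumed, not taken from the hunks shown here.
void CopyExample(const xgboost::HostDeviceVector<float>& src,
                 xgboost::HostDeviceVector<float>* dst) {
  dst->Resize(src.Size());
  // Host-to-host copy when both sides are host-resident; otherwise the device
  // path runs, resharding *dst to src's distribution first if they differ.
  dst->Copy(src);
}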
7 changes: 5 additions & 2 deletions src/common/host_device_vector.h
@@ -111,8 +111,8 @@ class GPUDistribution {
   }
 
   friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
-    return a.devices_ == b.devices_ && a.granularity_ == b.granularity_ &&
-           a.overlap_ == b.overlap_ && a.offsets_ == b.offsets_;
+    bool const res = a.devices_ == b.devices_ &&
+                     a.granularity_ == b.granularity_ &&
+                     a.overlap_ == b.overlap_ &&
+                     a.offsets_ == b.offsets_;
+    return res;
   }
 
   friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
36 changes: 23 additions & 13 deletions src/common/math.h
@@ -11,6 +11,7 @@
 #include <vector>
 #include <cmath>
 #include <algorithm>
+#include <utility>
 #include "avx_helpers.h"
 
 namespace xgboost {
@@ -29,22 +30,31 @@ inline avx::Float8 Sigmoid(avx::Float8 x) {
 }
 
 /*!
- * \brief do inplace softmax transformaton on p_rec
- * \param p_rec the input/output vector of the values.
+ * \brief Do inplace softmax transformaton on start to end
+ *
+ * \tparam Iterator Input iterator type
+ *
+ * \param start Start iterator of input
+ * \param end end iterator of input
  */
-inline void Softmax(std::vector<float>* p_rec) {
-  std::vector<float> &rec = *p_rec;
-  float wmax = rec[0];
-  for (size_t i = 1; i < rec.size(); ++i) {
-    wmax = std::max(rec[i], wmax);
+template <typename Iterator>
+XGBOOST_DEVICE inline void Softmax(Iterator start, Iterator end) {
+  static_assert(std::is_same<bst_float,
+                typename std::remove_reference<
+                  decltype(std::declval<Iterator>().operator*())>::type
+                >::value,
+                "Values should be of type bst_float");
+  bst_float wmax = *start;
+  for (Iterator i = start+1; i != end; ++i) {
+    wmax = fmaxf(*i, wmax);
   }
   double wsum = 0.0f;
-  for (float & elem : rec) {
-    elem = std::exp(elem - wmax);
-    wsum += elem;
+  for (Iterator i = start; i != end; ++i) {
+    *i = expf(*i - wmax);
+    wsum += *i;
   }
-  for (float & elem : rec) {
-    elem /= static_cast<float>(wsum);
+  for (Iterator i = start; i != end; ++i) {
+    *i /= static_cast<float>(wsum);
   }
 }

@@ -56,7 +66,7 @@ inline void Softmax(std::vector<float>* p_rec) {
  * \tparam Iterator The type of the iterator.
  */
 template<typename Iterator>
-inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
+XGBOOST_DEVICE inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
   Iterator maxit = begin;
   for (Iterator it = begin; it != end; ++it) {
     if (*it > *maxit) maxit = it;
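
A host-side usage sketch of the templated Softmax (the include path is an assumption): the static_assert above restricts it to iterators over bst_float values, and FindMaxIndex now also carries XGBOOST_DEVICE so both can be called from the GPU objectives.

#include <vector>
#include "common/math.h"  // assuming src/ is on the include path

void SoftmaxExample() {
  std::vector<xgboost::bst_float> scores {1.0f, 2.0f, 0.5f};
  xgboost::common::Softmax(scores.begin(), scores.end());
  // scores now holds normalised probabilities summing to 1.
  auto best = xgboost::common::FindMaxIndex(scores.cbegin(), scores.cend());
  (void)best;  // iterator to the most probable class (index 1 here)
}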
10 changes: 5 additions & 5 deletions src/common/span.h
@@ -49,7 +49,7 @@
  *
  * https://github.com/Microsoft/GSL/pull/664
  *
- * FIXME: Group these MSVC workarounds into a manageable place.
+ * TODO(trivialfis): Group these MSVC workarounds into a manageable place.
  */
 #if defined(_MSC_VER) && _MSC_VER < 1910
 
@@ -68,7 +68,7 @@ namespace xgboost {
 namespace common {
 
 // Usual logging facility is not available inside device code.
-// FIXME: Make dmlc check more generic.
+// TODO(trivialfis): Make dmlc check more generic.
 #define KERNEL_CHECK(cond) \
   do {                     \
     if (!(cond)) {         \
@@ -104,11 +104,11 @@ constexpr detail::ptrdiff_t dynamic_extent = -1;  // NOLINT
 
 enum class byte : unsigned char {};  // NOLINT
 
-namespace detail {
-
-template <class ElementType, detail::ptrdiff_t Extent = dynamic_extent>
+template <class ElementType, detail::ptrdiff_t Extent>
 class Span;
 
+namespace detail {
+
 template <typename SpanType, bool IsConst>
 class SpanIterator {
   using ElementType = typename SpanType::element_type;
(Diffs for the remaining changed files are not shown in this view.)
