Launch function for unifying CPU and GPU code. [Reopen] #3643

Merged 1 commit on Oct 2, 2018
4 changes: 3 additions & 1 deletion src/common/common.cc
@@ -1,9 +1,11 @@
/*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2018 by Contributors
 * \file common.cc
 * \brief Enable all kinds of global variables in common.
 */
+#include <dmlc/thread_local.h>

#include "common.h"
+#include "./random.h"
namespace xgboost {
2 changes: 1 addition & 1 deletion src/common/common.cu
@@ -11,7 +11,7 @@ int AllVisibleImpl::AllVisible() {
    // When compiled with CUDA but running on CPU only device,
    // cudaGetDeviceCount will fail.
    dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-  } catch(const std::exception& e) {
+  } catch(const thrust::system::system_error& err) {
    return 0;
  }
Contributor:
Could you check the return value of cudaGetDeviceCount() instead of catching all exceptions?

@trivialfis (Member, Author), Sep 26, 2018:
The try/catch is for XGBoost compiled with CUDA but running on CPU, in which case cudaGetDeviceCount will fail and we return 0 as the default.
I will add a note about that.

return n_visgpus;
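As a reference for the review exchange above, a minimal sketch of the reviewer's alternative: checking the status code returned by cudaGetDeviceCount() instead of catching exceptions. The function name is hypothetical and this is not the code merged in this PR.

#include <cuda_runtime.h>

// Report the number of visible GPUs, treating any CUDA error (for example a
// CUDA-enabled build running on a CPU-only machine) as "zero devices".
int AllVisibleNoThrow() {
  int n_visgpus = 0;
  if (cudaGetDeviceCount(&n_visgpus) != cudaSuccess) {
    return 0;
  }
  return n_visgpus;
}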
36 changes: 21 additions & 15 deletions src/common/common.h
@@ -1,5 +1,5 @@
/*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2018 by Contributors
 * \file common.h
 * \brief Common utilities
 */
@@ -19,6 +19,13 @@
#if defined(__CUDACC__)
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
+
+#define WITH_CUDA() true
Contributor:
Isn't there a #define in xgboost already that does exactly this?

@trivialfis (Member, Author):
If you mean XGBOOST_USE_CUDA, it's a definition from CMake, which doesn't indicate whether this translation unit is being compiled by nvcc.


+#else
+
+#define WITH_CUDA() false
+
#endif

namespace dh {
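To make the distinction in the thread above concrete, a small illustrative sketch of why a compiler-detection macro differs from a build-level flag. CompilationMode() is a hypothetical helper, not part of the PR.

// __CUDACC__ is defined by nvcc per translation unit, so WITH_CUDA() reports
// how *this* file is being compiled. A CMake definition such as
// XGBOOST_USE_CUDA is set for the whole build and is therefore also visible
// in plain .cc files compiled by the host compiler, where __CUDACC__ is not
// defined.
#if defined(__CUDACC__)
#define WITH_CUDA() true
#else
#define WITH_CUDA() false
#endif

inline const char* CompilationMode() {
  return WITH_CUDA() ? "compiled by nvcc" : "compiled by the host compiler";
}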
@@ -29,11 +36,11 @@ namespace dh {
#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)

inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
                                    int line) {
  if (code != cudaSuccess) {
-    throw thrust::system_error(code, thrust::cuda_category(),
-                               std::string{file} + "(" +  // NOLINT
-                               std::to_string(line) + ")");
+    LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
+                                       std::string{file} + ": " +  // NOLINT
+                                       std::to_string(line)).what();
  }
  return code;
}
@@ -70,13 +77,13 @@ inline std::string ToString(const T& data) {
 */
class Range {
 public:
+  using DifferenceType = int64_t;
+
  class Iterator {
    friend class Range;

   public:
-    using DifferenceType = int64_t;
-
-    XGBOOST_DEVICE int64_t operator*() const { return i_; }
+    XGBOOST_DEVICE DifferenceType operator*() const { return i_; }
    XGBOOST_DEVICE const Iterator &operator++() {
      i_ += step_;
      return *this;
@@ -97,8 +104,8 @@ class Range {
    XGBOOST_DEVICE void Step(DifferenceType s) { step_ = s; }

   protected:
-    XGBOOST_DEVICE explicit Iterator(int64_t start) : i_(start) {}
-    XGBOOST_DEVICE explicit Iterator(int64_t start, int step) :
+    XGBOOST_DEVICE explicit Iterator(DifferenceType start) : i_(start) {}
+    XGBOOST_DEVICE explicit Iterator(DifferenceType start, DifferenceType step) :
        i_{start}, step_{step} {}

   public:
@@ -109,9 +116,10 @@ class Range {
  XGBOOST_DEVICE Iterator begin() const { return begin_; }  // NOLINT
  XGBOOST_DEVICE Iterator end() const { return end_; }  // NOLINT

-  XGBOOST_DEVICE Range(int64_t begin, int64_t end)
+  XGBOOST_DEVICE Range(DifferenceType begin, DifferenceType end)
      : begin_(begin), end_(end) {}
-  XGBOOST_DEVICE Range(int64_t begin, int64_t end, Iterator::DifferenceType step)
+  XGBOOST_DEVICE Range(DifferenceType begin, DifferenceType end,
+                       DifferenceType step)
      : begin_(begin, step), end_(end) {}

XGBOOST_DEVICE bool operator==(const Range& other) const {
@@ -121,9 +129,7 @@ class Range {
return !(*this == other);
}

-  XGBOOST_DEVICE void Step(Iterator::DifferenceType s) { begin_.Step(s); }
-
-  XGBOOST_DEVICE Iterator::DifferenceType GetStep() const { return begin_.step_; }
+  XGBOOST_DEVICE void Step(DifferenceType s) { begin_.Step(s); }

private:
Iterator begin_;
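A minimal usage sketch for the refactored Range, assuming common.h is included and iteration behaves like a standard half-open interval; the call site is hypothetical.

#include <iostream>

#include "common.h"

int main() {
  // Iterates 0, 1, 2, 3: the iterator starts at begin and advances by the
  // default step of 1 until it reaches end. DifferenceType now lives on
  // Range itself rather than on Range::Iterator.
  for (auto i : xgboost::common::Range{0, 4}) {
    std::cout << i << "\n";
  }
  return 0;
}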
21 changes: 20 additions & 1 deletion src/common/device_helpers.cuh
@@ -9,6 +9,7 @@
#include <xgboost/logging.h>

#include "common.h"
+#include "span.h"

#include <algorithm>
#include <chrono>
@@ -955,7 +956,7 @@ class SaveCudaContext {
    // cudaGetDevice will fail.
    try {
      safe_cuda(cudaGetDevice(&saved_device_));
-    } catch (thrust::system::system_error & err) {
+    } catch (const thrust::system::system_error & err) {
      saved_device_ = -1;
    }
    func();
@@ -1035,4 +1036,22 @@ ReduceT ReduceShards(std::vector<ShardT> *shards, FunctionT f) {
  };
  return std::accumulate(sums.begin(), sums.end(), ReduceT());
}
+
+template <typename T,
+          typename IndexT = typename xgboost::common::Span<T>::index_type>
+xgboost::common::Span<T> ToSpan(
+    thrust::device_vector<T>& vec,
+    IndexT offset = 0,
+    IndexT size = -1) {
+  size = size == -1 ? vec.size() : size;
+  CHECK_LE(offset + size, vec.size());
+  return {vec.data().get() + offset, static_cast<IndexT>(size)};
+}
+
+template <typename T>
+xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
+                                size_t offset, size_t size) {
+  using IndexT = typename xgboost::common::Span<T>::index_type;
+  return ToSpan(vec, static_cast<IndexT>(offset), static_cast<IndexT>(size));
+}
}  // namespace dh
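A usage sketch for the new ToSpan helpers; the call site is hypothetical and assumes device_helpers.cuh is in scope and the file is compiled by nvcc.

#include <thrust/device_vector.h>

#include "device_helpers.cuh"

void ToSpanExample() {
  thrust::device_vector<float> data(100);
  // Whole vector: offset defaults to 0 and a size of -1 means "everything".
  auto whole = dh::ToSpan(data);
  // Elements [90, 100): the CHECK_LE above guards against overrunning vec.
  auto tail = dh::ToSpan(data, size_t{90}, size_t{10});
}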
22 changes: 14 additions & 8 deletions src/common/host_device_vector.cu
@@ -116,6 +116,7 @@ struct HostDeviceVectorImpl {
    int ndevices = vec_->distribution_.devices_.Size();
    start_ = vec_->distribution_.ShardStart(new_size, index_);
    proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
+    // The size on this device.
    size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
    SetDevice();
    data_.resize(size_d);
@@ -230,15 +231,15 @@ struct HostDeviceVectorImpl {
    CHECK(devices.Contains(device));
    LazySyncDevice(device, GPUAccess::kWrite);
    return {shards_[devices.Index(device)].data_.data().get(),
-        static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
+            static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
  }

  common::Span<const T> ConstDeviceSpan(int device) {
    GPUSet devices = distribution_.devices_;
    CHECK(devices.Contains(device));
    LazySyncDevice(device, GPUAccess::kRead);
    return {shards_[devices.Index(device)].data_.data().get(),
-        static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
+            static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
  }

size_t DeviceSize(int device) {
@@ -289,7 +290,6 @@ struct HostDeviceVectorImpl {
                                 data_h_.size() * sizeof(T),
                                 cudaMemcpyHostToDevice));
    } else {
-      //
      dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
    }
  }
@@ -304,14 +304,20 @@ struct HostDeviceVectorImpl {

  void Copy(HostDeviceVectorImpl<T>* other) {
    CHECK_EQ(Size(), other->Size());
+    // Data is on host.
    if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
      std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
-    } else {
-      CHECK(distribution_ == other->distribution_);
-      dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Copy(&other->shards_[i]);
-      });
+      return;
    }
+    // Data is on device;
+    if (distribution_ != other->distribution_) {
+      distribution_ = GPUDistribution();
+      Reshard(other->Distribution());
+      size_d_ = other->size_d_;
+    }
+    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
+      shard.Copy(&other->shards_[i]);
+    });
  }

void Copy(const std::vector<T>& other) {
7 changes: 5 additions & 2 deletions src/common/host_device_vector.h
@@ -111,8 +111,11 @@ class GPUDistribution {
  }

  friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
-    return a.devices_ == b.devices_ && a.granularity_ == b.granularity_ &&
-           a.overlap_ == b.overlap_ && a.offsets_ == b.offsets_;
+    bool const res = a.devices_ == b.devices_ &&
+                     a.granularity_ == b.granularity_ &&
+                     a.overlap_ == b.overlap_ &&
+                     a.offsets_ == b.offsets_;
+    return res;
  }

friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
36 changes: 23 additions & 13 deletions src/common/math.h
@@ -11,6 +11,7 @@
#include <vector>
#include <cmath>
#include <algorithm>
+#include <utility>
#include "avx_helpers.h"

namespace xgboost {
@@ -29,22 +30,31 @@ inline avx::Float8 Sigmoid(avx::Float8 x) {
}

/*!
- * \brief do inplace softmax transformaton on p_rec
- * \param p_rec the input/output vector of the values.
+ * \brief Do inplace softmax transformaton on start to end
+ *
+ * \tparam Iterator Input iterator type
+ *
+ * \param start Start iterator of input
+ * \param end end iterator of input
 */
-inline void Softmax(std::vector<float>* p_rec) {
-  std::vector<float> &rec = *p_rec;
-  float wmax = rec[0];
-  for (size_t i = 1; i < rec.size(); ++i) {
-    wmax = std::max(rec[i], wmax);
+template <typename Iterator>
+XGBOOST_DEVICE inline void Softmax(Iterator start, Iterator end) {
+  static_assert(std::is_same<bst_float,
+                typename std::remove_reference<
+                  decltype(std::declval<Iterator>().operator*())>::type
+                >::value,
+                "Values should be of type bst_float");
+  bst_float wmax = *start;
+  for (Iterator i = start+1; i != end; ++i) {
+    wmax = fmaxf(*i, wmax);
Contributor:
Btw, this is a single-precision intrinsic, as is expf. You might want to point out that Iterator must refer to single-precision values.

@trivialfis (Member, Author):
Thanks, let me try a static_assert.

  }
  double wsum = 0.0f;
-  for (float & elem : rec) {
-    elem = std::exp(elem - wmax);
-    wsum += elem;
+  for (Iterator i = start; i != end; ++i) {
+    *i = expf(*i - wmax);
+    wsum += *i;
  }
-  for (float & elem : rec) {
-    elem /= static_cast<float>(wsum);
+  for (Iterator i = start; i != end; ++i) {
+    *i /= static_cast<float>(wsum);
  }
}
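For context on the review exchange above: subtracting wmax before exponentiating is the standard stability trick. Softmax is invariant under shifting every input by a constant, and after the shift each expf argument is at most 0, so no term can overflow. A host-only sketch of the same algorithm (illustrative, single precision as the static_assert requires):

#include <cmath>
#include <vector>

// In-place, numerically stable softmax over single-precision values.
void SoftmaxSketch(std::vector<float>* values) {
  std::vector<float>& v = *values;
  float wmax = v[0];
  for (float x : v) wmax = std::fmax(x, wmax);  // shift by the maximum
  double wsum = 0.0;                            // accumulate the sum in double
  for (float& x : v) {
    x = std::exp(x - wmax);                     // argument is <= 0, no overflow
    wsum += x;
  }
  for (float& x : v) x /= static_cast<float>(wsum);
}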

@@ -56,7 +66,7 @@
 * \tparam Iterator The type of the iterator.
 */
template<typename Iterator>
-inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
+XGBOOST_DEVICE inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
  Iterator maxit = begin;
  for (Iterator it = begin; it != end; ++it) {
    if (*it > *maxit) maxit = it;
10 changes: 5 additions & 5 deletions src/common/span.h
@@ -49,7 +49,7 @@
 *
 * https://github.com/Microsoft/GSL/pull/664
 *
- * FIXME: Group these MSVC workarounds into a manageable place.
+ * TODO(trivialfis): Group these MSVC workarounds into a manageable place.
 */
#if defined(_MSC_VER) && _MSC_VER < 1910

@@ -68,7 +68,7 @@ namespace xgboost {
namespace common {

// Usual logging facility is not available inside device code.
-// FIXME: Make dmlc check more generic.
+// TODO(trivialfis): Make dmlc check more generic.
#define KERNEL_CHECK(cond) \
do { \
if (!(cond)) { \
@@ -104,11 +104,11 @@ constexpr detail::ptrdiff_t dynamic_extent = -1;  // NOLINT

enum class byte : unsigned char {};  // NOLINT

-namespace detail {
-
-template <class ElementType, detail::ptrdiff_t Extent = dynamic_extent>
+template <class ElementType, detail::ptrdiff_t Extent>
class Span;

+namespace detail {
+
template <typename SpanType, bool IsConst>
class SpanIterator {
  using ElementType = typename SpanType::element_type;
Expand Down
Loading