Refactor DeviceUVector. #10595

Merged (3 commits, Jul 17, 2024)

Changes from all commits:
src/common/device_helpers.cuh (2 changes: 1 addition & 1 deletion)

@@ -510,7 +510,7 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
 
 template <typename T>
 xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
-  return {thrust::raw_pointer_cast(vec.data()), vec.size()};
+  return {vec.data(), vec.size()};
 }
 
 // thrust begin, similar to std::begin
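Note on the ToSpan change: the refactored DeviceUVector::data() applies thrust::raw_pointer_cast internally (see the next file), so the span can be built straight from the returned raw pointer. A minimal call-site sketch, assuming a CUDA build; the element type and sizes are illustrative only:

    dh::DeviceUVector<float> vec;
    vec.resize(128);
    // ToSpan now forwards {vec.data(), vec.size()} directly; no raw_pointer_cast
    // is needed at the call site because data() already returns T*.
    xgboost::common::Span<float> s = dh::ToSpan(vec);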
src/common/device_vector.cuh (71 changes: 44 additions & 27 deletions)

@@ -284,47 +284,64 @@ class LoggingResource : public rmm::mr::device_memory_resource {
 
 LoggingResource *GlobalLoggingResource();
 
+#endif  // defined(XGBOOST_USE_RMM)
+
 /**
- * @brief Container class that doesn't initialize the data.
+ * @brief Container class that doesn't initialize the data when RMM is used.
  */
 template <typename T>
-class DeviceUVector : public rmm::device_uvector<T> {
-  using Super = rmm::device_uvector<T>;
+class DeviceUVector {
+ private:
+#if defined(XGBOOST_USE_RMM)
+  rmm::device_uvector<T> data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()};
+#else
+  ::dh::device_vector<T> data_;
+#endif  // defined(XGBOOST_USE_RMM)
 
  public:
-  DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {}
+  using value_type = T;                        // NOLINT
+  using pointer = value_type *;                // NOLINT
+  using const_pointer = value_type const *;    // NOLINT
+  using reference = value_type &;              // NOLINT
+  using const_reference = value_type const &;  // NOLINT
 
-  void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); }
-  void Resize(std::size_t n, T const &v) {
+ public:
+  DeviceUVector() = default;
+  DeviceUVector(DeviceUVector const &that) = delete;
+  DeviceUVector &operator=(DeviceUVector const &that) = delete;
+  DeviceUVector(DeviceUVector &&that) = default;
+  DeviceUVector &operator=(DeviceUVector &&that) = default;
+
+  void resize(std::size_t n) {  // NOLINT
+#if defined(XGBOOST_USE_RMM)
+    data_.resize(n, rmm::cuda_stream_per_thread);
+#else
+    data_.resize(n);
+#endif
+  }
+  void resize(std::size_t n, T const &v) {  // NOLINT
+#if defined(XGBOOST_USE_RMM)
     auto orig = this->size();
-    Super::resize(n, rmm::cuda_stream_per_thread);
+    data_.resize(n, rmm::cuda_stream_per_thread);
     if (orig < n) {
       thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v);
     }
+#else
+    data_.resize(n, v);
+#endif
   }
+  [[nodiscard]] std::size_t size() const { return data_.size(); }  // NOLINT
 
- private:
-  // undefined private, cannot be accessed.
-  void resize(std::size_t n, rmm::cuda_stream_view stream);  // NOLINT
-};
-
-#else
+  [[nodiscard]] auto begin() { return data_.begin(); }  // NOLINT
+  [[nodiscard]] auto end() { return data_.end(); }      // NOLINT
 
-/**
- * @brief Without RMM, the initialization will happen.
- */
-template <typename T>
-class DeviceUVector : public thrust::device_vector<T, XGBDeviceAllocator<T>> {
-  using Super = thrust::device_vector<T, XGBDeviceAllocator<T>>;
+  [[nodiscard]] auto begin() const { return this->cbegin(); }  // NOLINT
+  [[nodiscard]] auto end() const { return this->cend(); }      // NOLINT
 
- public:
-  void Resize(std::size_t n) { Super::resize(n); }
-  void Resize(std::size_t n, T const &v) { Super::resize(n, v); }
+  [[nodiscard]] auto cbegin() const { return data_.cbegin(); }  // NOLINT
+  [[nodiscard]] auto cend() const { return data_.cend(); }      // NOLINT
 
- private:
-  // undefined private, cannot be accessed.
-  void resize(std::size_t n, T const &v = T{});  // NOLINT
+  [[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); }        // NOLINT
+  [[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); }  // NOLINT
 };
 
-#endif  // defined(XGBOOST_USE_RMM)
 } // namespace dh
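Taken together, the new class replaces the two inheritance-based definitions (one per build mode) with a single wrapper that owns either an rmm::device_uvector<T> or a dh::device_vector<T> member, confining XGBOOST_USE_RMM to the member declaration and the resize bodies. A short usage sketch of the resulting interface, assuming a CUDA device is available; values are illustrative:

    dh::DeviceUVector<int> vec;
    vec.resize(4);      // under RMM the storage is left uninitialized
    vec.resize(8, -1);  // only the grown tail [4, 8) is filled with -1 under RMM;
                        // without RMM, thrust also fills the new elements
    int *ptr = vec.data();                    // raw pointer, no raw_pointer_cast needed
    thrust::fill(vec.begin(), vec.end(), 0);  // iterators forward to the wrapped container
    auto moved = std::move(vec);              // move-only: copying is deleted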
src/common/host_device_vector.cu (27 changes: 12 additions & 15 deletions)

@@ -29,7 +29,7 @@ class HostDeviceVectorImpl {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->Resize(size, v);
+      data_d_->resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -67,12 +67,12 @@
 
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
-    return thrust::raw_pointer_cast(data_d_->data());
+    return data_d_->data();
   }
 
   const T* ConstDevicePointer() {
     LazySyncDevice(GPUAccess::kRead);
-    return thrust::raw_pointer_cast(data_d_->data());
+    return data_d_->data();
   }
 
   common::Span<T> DeviceSpan() {
@@ -181,7 +181,7 @@
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       auto old_size = data_d_->size();
-      data_d_->Resize(new_size, std::forward<U>(args)...);
+      data_d_->resize(new_size, std::forward<U>(args)...);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kNone);
@@ -200,8 +200,8 @@
     gpu_access_ = access;
     if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
-    dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
-                             data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
+    dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T),
+                             cudaMemcpyDeviceToHost));
   }
 
   void LazySyncDevice(GPUAccess access) {
@@ -214,9 +214,8 @@
     // data is on the host
     LazyResizeDevice(data_h_.size());
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
-                                  data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
-                                  dh::DefaultStream()));
+    dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), data_h_.data(), data_d_->size() * sizeof(T),
+                                  cudaMemcpyHostToDevice, dh::DefaultStream()));
     gpu_access_ = access;
   }
 
@@ -241,8 +240,7 @@
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
    SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()),
-                                  thrust::raw_pointer_cast(other->data_d_->data()),
+    dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), other->data_d_->data(),
                                   data_d_->size() * sizeof(T), cudaMemcpyDefault,
                                   dh::DefaultStream()));
   }
@@ -252,15 +250,14 @@
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin,
-                                  data_d_->size() * sizeof(T), cudaMemcpyDefault,
-                                  dh::DefaultStream()));
+    dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), begin, data_d_->size() * sizeof(T),
+                                  cudaMemcpyDefault, dh::DefaultStream()));
   }
 
   void LazyResizeDevice(size_t new_size) {
     if (data_d_ && new_size == data_d_->size()) { return; }
     SetDevice();
-    data_d_->Resize(new_size);
+    data_d_->resize(new_size);
   }
 
   void SetDevice() {
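With data() now returning a raw pointer, every cudaMemcpy/cudaMemcpyAsync call site above drops its thrust::raw_pointer_cast wrapper while keeping the same byte-count arithmetic. A reduced sketch of the device-to-host pattern, assuming a populated DeviceUVector<float> named vec and a matching host buffer:

    std::vector<float> host(vec.size());
    // Same shape as the LazySyncHost copy above, minus the raw_pointer_cast.
    dh::safe_cuda(cudaMemcpy(host.data(), vec.data(),
                             vec.size() * sizeof(float), cudaMemcpyDeviceToHost));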
tests/cpp/common/test_device_vector.cu (2 changes: 1 addition & 1 deletion)

@@ -12,7 +12,7 @@ TEST(DeviceUVector, Basic) {
   std::int32_t verbosity{3};
   std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
   DeviceUVector<float> uvec;
-  uvec.Resize(12);
+  uvec.resize(12);
   auto peak = GlobalMemoryLogger().PeakMemory();
   auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
   ASSERT_EQ(peak, n_bytes);
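The assertion holds because allocations are routed through the logging allocator (or the logging resource under RMM), so the recorded peak after a single resize(12) is exactly 12 * sizeof(float). A hypothetical companion test, not part of this PR, could cover the fill overload the same way:

    TEST(DeviceUVector, ResizeWithValue) {  // hypothetical, for illustration only
      dh::DeviceUVector<float> uvec;
      uvec.resize(4, 1.5f);
      std::vector<float> h(uvec.size());
      dh::safe_cuda(cudaMemcpy(h.data(), uvec.data(), uvec.size() * sizeof(float),
                               cudaMemcpyDeviceToHost));
      for (auto v : h) {
        ASSERT_EQ(v, 1.5f);  // the grown tail should carry the fill value
      }
    }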