diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 1754c9507036..98a76d72a263 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -510,7 +510,7 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
 
 template <typename T>
 xgboost::common::Span<T> ToSpan(DeviceUVector<T> &vec) {
-  return {thrust::raw_pointer_cast(vec.data()), vec.size()};
+  return {vec.data(), vec.size()};
 }
 
 // thrust begin, similiar to std::begin
diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh
index 35386856cc9c..2587ce719780 100644
--- a/src/common/device_vector.cuh
+++ b/src/common/device_vector.cuh
@@ -284,47 +284,64 @@ class LoggingResource : public rmm::mr::device_memory_resource {
 
 LoggingResource *GlobalLoggingResource();
 
+#endif  // defined(XGBOOST_USE_RMM)
+
 /**
- * @brief Container class that doesn't initialize the data.
+ * @brief Container class that doesn't initialize the data when RMM is used.
  */
 template <typename T>
-class DeviceUVector : public rmm::device_uvector<T> {
-  using Super = rmm::device_uvector<T>;
+class DeviceUVector {
+ private:
+#if defined(XGBOOST_USE_RMM)
+  rmm::device_uvector<T> data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()};
+#else
+  ::dh::device_vector<T> data_;
+#endif  // defined(XGBOOST_USE_RMM)
 
  public:
-  DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {}
+  using value_type = T;                        // NOLINT
+  using pointer = value_type *;                // NOLINT
+  using const_pointer = value_type const *;    // NOLINT
+  using reference = value_type &;              // NOLINT
+  using const_reference = value_type const &;  // NOLINT
 
-  void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); }
-  void Resize(std::size_t n, T const &v) {
+ public:
+  DeviceUVector() = default;
+  DeviceUVector(DeviceUVector const &that) = delete;
+  DeviceUVector &operator=(DeviceUVector const &that) = delete;
+  DeviceUVector(DeviceUVector &&that) = default;
+  DeviceUVector &operator=(DeviceUVector &&that) = default;
+
+  void resize(std::size_t n) {  // NOLINT
+#if defined(XGBOOST_USE_RMM)
+    data_.resize(n, rmm::cuda_stream_per_thread);
+#else
+    data_.resize(n);
+#endif
+  }
+  void resize(std::size_t n, T const &v) {  // NOLINT
+#if defined(XGBOOST_USE_RMM)
     auto orig = this->size();
-    Super::resize(n, rmm::cuda_stream_per_thread);
+    data_.resize(n, rmm::cuda_stream_per_thread);
     if (orig < n) {
       thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v);
     }
+#else
+    data_.resize(n, v);
+#endif
   }
+  [[nodiscard]] std::size_t size() const { return data_.size(); }  // NOLINT
 
- private:
-  // undefined private, cannot be accessed.
-  void resize(std::size_t n, rmm::cuda_stream_view stream);  // NOLINT
-};
-
-#else
+  [[nodiscard]] auto begin() { return data_.begin(); }  // NOLINT
+  [[nodiscard]] auto end() { return data_.end(); }      // NOLINT
 
-/**
- * @brief Without RMM, the initialization will happen.
- */
-template <typename T>
-class DeviceUVector : public thrust::device_vector<T, XGBDeviceAllocator<T>> {
-  using Super = thrust::device_vector<T, XGBDeviceAllocator<T>>;
+  [[nodiscard]] auto begin() const { return this->cbegin(); }  // NOLINT
+  [[nodiscard]] auto end() const { return this->cend(); }      // NOLINT
 
- public:
-  void Resize(std::size_t n) { Super::resize(n); }
-  void Resize(std::size_t n, T const &v) { Super::resize(n, v); }
+  [[nodiscard]] auto cbegin() const { return data_.cbegin(); }  // NOLINT
+  [[nodiscard]] auto cend() const { return data_.cend(); }      // NOLINT
 
- private:
-  // undefined private, cannot be accessed.
-  void resize(std::size_t n, T const &v = T{});  // NOLINT
+  [[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); }        // NOLINT
+  [[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); }  // NOLINT
 };
-
-#endif  // defined(XGBOOST_USE_RMM)
 }  // namespace dh
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 16a1aa027f09..00055ec69a7e 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -29,7 +29,7 @@ class HostDeviceVectorImpl {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->Resize(size, v);
+      data_d_->resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -67,12 +67,12 @@ class HostDeviceVectorImpl {
 
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
-    return thrust::raw_pointer_cast(data_d_->data());
+    return data_d_->data();
   }
 
   const T* ConstDevicePointer() {
     LazySyncDevice(GPUAccess::kRead);
-    return thrust::raw_pointer_cast(data_d_->data());
+    return data_d_->data();
   }
 
   common::Span<T> DeviceSpan() {
@@ -181,7 +181,7 @@ class HostDeviceVectorImpl {
       gpu_access_ = GPUAccess::kWrite;
      SetDevice();
       auto old_size = data_d_->size();
-      data_d_->Resize(new_size, std::forward<U>(args)...);
+      data_d_->resize(new_size, std::forward<U>(args)...);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kNone);
@@ -200,8 +200,8 @@ class HostDeviceVectorImpl {
     gpu_access_ = access;
     if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
-    dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
-                             data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
+    dh::safe_cuda(cudaMemcpy(data_h_.data(), data_d_->data(), data_d_->size() * sizeof(T),
+                             cudaMemcpyDeviceToHost));
   }
 
   void LazySyncDevice(GPUAccess access) {
@@ -214,9 +214,8 @@ class HostDeviceVectorImpl {
     // data is on the host
     LazyResizeDevice(data_h_.size());
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
-                                  data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
-                                  dh::DefaultStream()));
+    dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), data_h_.data(), data_d_->size() * sizeof(T),
+                                  cudaMemcpyHostToDevice, dh::DefaultStream()));
     gpu_access_ = access;
   }
 
@@ -241,8 +240,7 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()),
-                                  thrust::raw_pointer_cast(other->data_d_->data()),
+    dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), other->data_d_->data(),
                                   data_d_->size() * sizeof(T), cudaMemcpyDefault,
                                   dh::DefaultStream()));
   }
@@ -252,15 +250,14 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin,
-                                  data_d_->size() * sizeof(T), cudaMemcpyDefault,
-                                  dh::DefaultStream()));
+    dh::safe_cuda(cudaMemcpyAsync(data_d_->data(), begin, data_d_->size() * sizeof(T),
+                                  cudaMemcpyDefault, dh::DefaultStream()));
   }
 
   void LazyResizeDevice(size_t new_size) {
     if (data_d_ && new_size == data_d_->size()) { return; }
     SetDevice();
-    data_d_->Resize(new_size);
+    data_d_->resize(new_size);
   }
 
   void SetDevice() {
diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu
index 95da4ef3f167..c6a8c0ab95ce 100644
--- a/tests/cpp/common/test_device_vector.cu
+++ b/tests/cpp/common/test_device_vector.cu
@@ -12,7 +12,7 @@ TEST(DeviceUVector, Basic) {
   std::int32_t verbosity{3};
   std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
   DeviceUVector<float> uvec;
-  uvec.Resize(12);
+  uvec.resize(12);
   auto peak = GlobalMemoryLogger().PeakMemory();
   auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
   ASSERT_EQ(peak, n_bytes);
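
For reference, a minimal usage sketch of the refactored DeviceUVector interface follows. It is not part of the diff above; the include paths and the ExampleUsage function name are illustrative assumptions, while resize, data, and ToSpan are the members touched by this change.

// Sketch only: assumes a CUDA translation unit inside the XGBoost source tree.
#include "../common/device_helpers.cuh"  // dh::ToSpan (path is an assumption)
#include "../common/device_vector.cuh"   // dh::DeviceUVector (path is an assumption)

void ExampleUsage() {
  dh::DeviceUVector<float> uvec;
  uvec.resize(12);        // lower-case resize, matching std::vector / thrust::device_vector
  uvec.resize(16, 1.0f);  // growing fills only the newly added tail with 1.0f
  float *ptr = uvec.data();      // now returns a raw pointer, so call sites drop raw_pointer_cast
  auto span = dh::ToSpan(uvec);  // xgboost::common::Span<float> over the same device memory
  (void)ptr;
  (void)span;
}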