diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 919d9af5e629..afe684fd5174 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -206,29 +206,26 @@ __global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
   }
 }
 
-template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
-inline void LaunchN(int device_idx, size_t n, L lambda) {
-  if (n == 0) {
-    return;
-  }
-
-  const int GRID_SIZE =
-      static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
-  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
-                                              lambda);
-}
-
 template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
 inline void LaunchN(int device_idx, size_t n, cudaStream_t stream, L lambda) {
   if (n == 0) {
     return;
   }
+  safe_cuda(cudaSetDevice(device_idx));
+
   const int GRID_SIZE =
       static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
   LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(static_cast<size_t>(0), n,
                                                          lambda);
 }
+
+// Default stream version
+template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
+inline void LaunchN(int device_idx, size_t n, L lambda) {
+  LaunchN(device_idx, n, nullptr, lambda);
+}
+
 
 /*
  * Memory
  */
@@ -257,6 +254,7 @@ class DVec {
     ptr_ = static_cast<T *>(ptr);
     size_ = size;
     device_idx_ = device_idx;
+    safe_cuda(cudaSetDevice(device_idx_));
   }
 
   DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
@@ -278,6 +276,7 @@ class DVec {
 
   std::vector<T> AsVector() const {
     std::vector<T> h_vector(Size());
+    safe_cuda(cudaSetDevice(device_idx_));
     safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
                          cudaMemcpyDeviceToHost));
     return h_vector;
@@ -314,6 +313,7 @@ class DVec {
       throw std::runtime_error(
          "Cannot copy assign DVec to DVec, sizes are different");
     }
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (other.DeviceIdx() == this->DeviceIdx()) {
       dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
                                     other.Size() * sizeof(T),
@@ -331,6 +331,7 @@ class DVec {
   template <typename IterT>
   void copy(IterT begin, IterT end) {
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (end - begin != Size()) {
       LOG(FATAL) << "Cannot copy assign vector to DVec, sizes are different"
                  << " vector::Size(): " << end - begin << " DVec::Size(): " << Size();
     }
@@ -339,6 +340,7 @@ class DVec {
 
   void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (end - begin != Size()) {
       throw std::runtime_error(
           "Cannot copy assign vector to dvec, sizes are different");
     }
@@ -436,6 +438,7 @@ class BulkAllocator {
 
   char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
     char *ptr;
+    safe_cuda(cudaSetDevice(device_idx));
     safe_cuda(cudaMalloc(&ptr, bytes));
     return ptr;
   }
@@ -477,6 +480,7 @@ class BulkAllocator {
   ~BulkAllocator() {
     for (size_t i = 0; i < d_ptr_.size(); i++) {
       if (!(d_ptr_[i] == nullptr)) {
+        safe_cuda(cudaSetDevice(device_idx_[i]));
         safe_cuda(cudaFree(d_ptr_[i]));
         d_ptr_[i] = nullptr;
       }
@@ -681,6 +685,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                         OffsetT count, SegmentIterT segments,
                         OffsetT num_segments, FunctionT f) {
   typedef typename cub::CubVector<OffsetT, 2>::Type CoordinateT;
+  dh::safe_cuda(cudaSetDevice(device_idx));
   const int BLOCK_THREADS = 256;
   const int ITEMS_PER_THREAD = 1;
   const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index b0f382a7174e..5a5f8990172a 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -290,6 +290,7 @@ class DeviceHistogram {
   }
 
   void Reset() {
+    dh::safe_cuda(cudaSetDevice(device_id_));
     dh::safe_cuda(cudaMemsetAsync(
         data_.data().get(), 0,
         data_.size() * sizeof(typename decltype(data_)::value_type)));
@@ -307,6 +308,7 @@ class DeviceHistogram {
     if (HistogramExists(nidx)) return;
     size_t current_size = nidx_map_.size() * n_bins_ *
                           2;  // Number of items currently used in data
+    dh::safe_cuda(cudaSetDevice(device_id_));
     if (data_.size() >= kStopGrowingSize) {
       // Recycle histogram memory
       std::pair<int, size_t> old_entry = *nidx_map_.begin();
@@ -451,7 +453,7 @@ void SortPosition(dh::CubMemory* temp_memory, common::Span<int> position,
                   common::Span<int> position_out, common::Span<bst_uint> ridx,
                   common::Span<bst_uint> ridx_out, int left_nidx,
                   int right_nidx, int64_t* d_left_count,
-                  cudaStream_t stream = 0) {
+                  cudaStream_t stream = nullptr) {
   auto d_position_out = position_out.data();
   auto d_position_in = position.data();
   auto d_ridx_out = ridx_out.data();
@@ -579,6 +581,7 @@ struct DeviceShard {
 
   /* Init row_ptrs and row_stride */
   void InitRowPtrs(const SparsePage& row_batch) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     const auto& offset_vec = row_batch.offset.HostVector();
     row_ptrs.resize(n_rows + 1);
     thrust::copy(offset_vec.data() + row_begin_idx,
@@ -636,6 +639,7 @@ struct DeviceShard {
 
   // Reset values for each update iteration
   void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     position.CurrentDVec().Fill(0);
     std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
               GradientPair());
@@ -660,6 +664,7 @@ struct DeviceShard {
                    common::ColumnSampler* column_sampler,
                    const std::vector<ValueConstraint>& value_constraints,
                    size_t num_columns) {
+    dh::safe_cuda(cudaSetDevice(device_id));
    auto result = pinned_memory.GetSpan<DeviceSplitCandidate>(nidxs.size());
 
     // Work out cub temporary memory requirement
@@ -755,6 +760,7 @@ struct DeviceShard {
                       int64_t split_gidx, bool default_dir_left, bool is_dense,
                       int fidx_begin,   // cut.row_ptr[fidx]
                       int fidx_end) {   // cut.row_ptr[fidx + 1]
+    dh::safe_cuda(cudaSetDevice(device_id));
     Segment segment = ridx_segments[nidx];
     bst_uint* d_ridx = ridx.Current();
     int* d_position = position.Current();
@@ -835,6 +841,7 @@ struct DeviceShard {
   }
 
   void UpdatePredictionCache(bst_float* out_preds_d) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     if (!prediction_cache_initialised) {
       dh::safe_cuda(cudaMemcpyAsync(prediction_cache.Data(), out_preds_d,
                                     prediction_cache.Size() * sizeof(bst_float),
@@ -889,6 +896,7 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
     if (grid_size <= 0) {
       return;
     }
+    dh::safe_cuda(cudaSetDevice(shard->device_id));
     SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>
         (shard->row_stride, d_ridx, d_gidx, null_gidx_value,
          d_node_hist.data(), d_gpair, segment_begin, n_elements);
@@ -923,6 +931,7 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
 
 template <typename GradientSumT>
 inline void DeviceShard<GradientSumT>::InitCompressedData(
     const common::HistCutMatrix& hmat, const SparsePage& row_batch) {
+  dh::safe_cuda(cudaSetDevice(device_id));
  n_bins = hmat.NumBins();
   null_gidx_value = hmat.NumBins();
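
Note on the pattern: every hunk above applies the same fix. The CUDA runtime directs cudaMalloc, cudaMemcpy*/cudaMemset*, cudaFree, and kernel launches at the calling thread's *current* device, so any function that may run after another shard has switched devices must call safe_cuda(cudaSetDevice(...)) before its first CUDA call. Below is a minimal standalone sketch of that failure mode and of an RAII variant of the fix; the SAFE_CUDA macro and DeviceGuard class are hypothetical stand-ins for illustration, not part of this patch (which uses dh::safe_cuda and an explicit cudaSetDevice at each entry point).

// device_guard_sketch.cu -- illustrative sketch only; SAFE_CUDA and
// DeviceGuard are hypothetical helpers, not part of the patch above.
// Build: nvcc device_guard_sketch.cu -o device_guard_sketch
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal stand-in for dh::safe_cuda: abort on any CUDA runtime error.
#define SAFE_CUDA(call)                                           \
  do {                                                            \
    cudaError_t err = (call);                                     \
    if (err != cudaSuccess) {                                     \
      fprintf(stderr, "CUDA error '%s' at %s:%d\n",               \
              cudaGetErrorString(err), __FILE__, __LINE__);       \
      exit(1);                                                    \
    }                                                             \
  } while (0)

// RAII alternative to the explicit cudaSetDevice calls in the patch:
// makes device_idx current for the enclosing scope, then restores the
// previously current device on scope exit.
class DeviceGuard {
 public:
  explicit DeviceGuard(int device_idx) {
    SAFE_CUDA(cudaGetDevice(&previous_));
    SAFE_CUDA(cudaSetDevice(device_idx));
  }
  ~DeviceGuard() { cudaSetDevice(previous_); }
 private:
  int previous_;
};

__global__ void FillKernel(float* data, int n, float value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

int main() {
  int n_devices = 0;
  SAFE_CUDA(cudaGetDeviceCount(&n_devices));
  const int n = 1 << 20;
  for (int d = 0; d < n_devices; ++d) {
    DeviceGuard guard(d);  // without this, everything below would land on
                           // whichever device happened to be current
    float* buf = nullptr;
    SAFE_CUDA(cudaMalloc(&buf, n * sizeof(float)));      // allocated on d
    FillKernel<<<(n + 255) / 256, 256>>>(buf, n, 1.0f);  // launched on d
    SAFE_CUDA(cudaDeviceSynchronize());
    SAFE_CUDA(cudaFree(buf));                            // freed on d
  }
  printf("touched %d device(s)\n", n_devices);
  return 0;
}

A guard restores the previous device when the scope ends; the patch instead sets the device explicitly at the top of each device-touching function, which keeps every call site self-sufficient regardless of which thread or shard ran last.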