Commit

Add back cudaSetDevice calls

RAMitchell committed Mar 13, 2019
1 parent b69b34a commit 62edfb3
Showing 2 changed files with 27 additions and 13 deletions.
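For background on what the restored calls do: the CUDA runtime keeps the active device as per-host-thread state, so every cudaMalloc, cudaMemcpy, cudaMemset, and kernel launch implicitly targets whichever device the calling thread selected last. Each call added back below pins the intended device immediately before such an operation. A minimal standalone sketch of the pattern (not code from this repository; error checking omitted):

#include <cstdio>
#include <cuda_runtime.h>

// Trivial kernel used to show where the launch lands.
__global__ void Fill(int* out, int value) { *out = value; }

int main() {
  int n_devices = 0;
  cudaGetDeviceCount(&n_devices);

  // Allocate one int on each device. Without the cudaSetDevice before the
  // cudaMalloc, every allocation and launch would silently land on the
  // thread's current device instead of device i.
  for (int i = 0; i < n_devices; ++i) {
    cudaSetDevice(i);
    int* d_out = nullptr;
    cudaMalloc(&d_out, sizeof(int));
    Fill<<<1, 1>>>(d_out, i);  // Runs on device i because of cudaSetDevice
    int h_out = 0;
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_out);
    printf("device %d -> %d\n", i, h_out);
  }
  return 0;
}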
29 changes: 17 additions & 12 deletions src/common/device_helpers.cuh
@@ -206,29 +206,26 @@ __global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
   }
 }
 
-template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
-inline void LaunchN(int device_idx, size_t n, L lambda) {
-  if (n == 0) {
-    return;
-  }
-
-  const int GRID_SIZE =
-      static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
-  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
-                                              lambda);
-}
-
 template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
 inline void LaunchN(int device_idx, size_t n, cudaStream_t stream, L lambda) {
   if (n == 0) {
     return;
   }
 
+  safe_cuda(cudaSetDevice(device_idx));
+
   const int GRID_SIZE =
       static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
   LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(static_cast<size_t>(0),
                                                          n, lambda);
 }
+
+// Default stream version
+template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
+inline void LaunchN(int device_idx, size_t n, L lambda) {
+  LaunchN<ITEMS_PER_THREAD, BLOCK_THREADS>(device_idx, n, nullptr, lambda);
+}
+
 /*
  * Memory
  */
@@ -257,6 +254,7 @@ class DVec {
     ptr_ = static_cast<T *>(ptr);
     size_ = size;
     device_idx_ = device_idx;
+    safe_cuda(cudaSetDevice(device_idx_));
   }
 
   DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
@@ -278,6 +276,7 @@ class DVec {
 
   std::vector<T> AsVector() const {
     std::vector<T> h_vector(Size());
+    safe_cuda(cudaSetDevice(device_idx_));
     safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
                          cudaMemcpyDeviceToHost));
     return h_vector;
@@ -314,6 +313,7 @@ class DVec {
       throw std::runtime_error(
           "Cannot copy assign DVec to DVec, sizes are different");
     }
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (other.DeviceIdx() == this->DeviceIdx()) {
       dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
                                     other.Size() * sizeof(T),
@@ -331,6 +331,7 @@
 
   template <typename IterT>
   void copy(IterT begin, IterT end) {
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (end - begin != Size()) {
       LOG(FATAL) << "Cannot copy assign vector to DVec, sizes are different" <<
           " vector::Size(): " << end - begin << " DVec::Size(): " << Size();
@@ -339,6 +340,7 @@
   }
 
   void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (end - begin != Size()) {
       throw std::runtime_error(
           "Cannot copy assign vector to dvec, sizes are different");
@@ -436,6 +438,7 @@ class BulkAllocator {
 
   char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
     char *ptr;
+    safe_cuda(cudaSetDevice(device_idx));
     safe_cuda(cudaMalloc(&ptr, bytes));
     return ptr;
   }
@@ -477,6 +480,7 @@
   ~BulkAllocator() {
     for (size_t i = 0; i < d_ptr_.size(); i++) {
       if (!(d_ptr_[i] == nullptr)) {
+        safe_cuda(cudaSetDevice(device_idx_[i]));
         safe_cuda(cudaFree(d_ptr_[i]));
         d_ptr_[i] = nullptr;
       }
@@ -681,6 +685,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                         OffsetT count, SegmentIterT segments,
                         OffsetT num_segments, FunctionT f) {
   typedef typename cub::CubVector<OffsetT, 2>::Type CoordinateT;
+  dh::safe_cuda(cudaSetDevice(device_idx));
   const int BLOCK_THREADS = 256;
   const int ITEMS_PER_THREAD = 1;
   const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
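A note on the LaunchN change above: the default-stream overload is now a thin wrapper over the stream overload, so cudaSetDevice sits on a single code path. A hedged usage sketch (the function, buffer, and lambda body are invented for illustration; it assumes LaunchN lives in namespace dh and that LaunchNKernel passes each element index to the lambda, and compiling a __device__ lambda needs nvcc's --expt-extended-lambda):

// Scale a buffer that lives on a specific GPU. LaunchN now calls
// cudaSetDevice(device_idx) internally, so the launch cannot land on
// whichever device the calling thread happened to have active.
void ScaleOnDevice(int device_idx, float* d_data, size_t n, float factor) {
  dh::LaunchN(device_idx, n, [=] __device__(size_t i) {
    d_data[i] *= factor;
  });
}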
11 changes: 10 additions & 1 deletion src/tree/updater_gpu_hist.cu
@@ -290,6 +290,7 @@ class DeviceHistogram {
   }
 
   void Reset() {
+    dh::safe_cuda(cudaSetDevice(device_id_));
     dh::safe_cuda(cudaMemsetAsync(
         data_.data().get(), 0,
         data_.size() * sizeof(typename decltype(data_)::value_type)));
@@ -307,6 +308,7 @@
     if (HistogramExists(nidx)) return;
     size_t current_size = nidx_map_.size() * n_bins_ *
                           2;  // Number of items currently used in data
+    dh::safe_cuda(cudaSetDevice(device_id_));
     if (data_.size() >= kStopGrowingSize) {
       // Recycle histogram memory
       std::pair<int, size_t> old_entry = *nidx_map_.begin();
@@ -451,7 +453,7 @@ void SortPosition(dh::CubMemory* temp_memory, common::Span<int> position,
                   common::Span<int> position_out, common::Span<bst_uint> ridx,
                   common::Span<bst_uint> ridx_out, int left_nidx,
                   int right_nidx, int64_t* d_left_count,
-                  cudaStream_t stream = 0) {
+                  cudaStream_t stream = nullptr) {
   auto d_position_out = position_out.data();
   auto d_position_in = position.data();
   auto d_ridx_out = ridx_out.data();
@@ -579,6 +581,7 @@ struct DeviceShard {
 
   /* Init row_ptrs and row_stride */
   void InitRowPtrs(const SparsePage& row_batch) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     const auto& offset_vec = row_batch.offset.HostVector();
     row_ptrs.resize(n_rows + 1);
     thrust::copy(offset_vec.data() + row_begin_idx,
@@ -636,6 +639,7 @@
 
   // Reset values for each update iteration
   void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     position.CurrentDVec().Fill(0);
     std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
               GradientPair());
@@ -660,6 +664,7 @@
       common::ColumnSampler* column_sampler,
       const std::vector<ValueConstraint>& value_constraints,
       size_t num_columns) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     auto result = pinned_memory.GetSpan<DeviceSplitCandidate>(nidxs.size());
 
     // Work out cub temporary memory requirement
@@ -755,6 +760,7 @@
       int64_t split_gidx, bool default_dir_left, bool is_dense,
       int fidx_begin,  // cut.row_ptr[fidx]
       int fidx_end) {  // cut.row_ptr[fidx + 1]
+    dh::safe_cuda(cudaSetDevice(device_id));
     Segment segment = ridx_segments[nidx];
     bst_uint* d_ridx = ridx.Current();
     int* d_position = position.Current();
@@ -835,6 +841,7 @@
   }
 
   void UpdatePredictionCache(bst_float* out_preds_d) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     if (!prediction_cache_initialised) {
       dh::safe_cuda(cudaMemcpyAsync(prediction_cache.Data(), out_preds_d,
                                     prediction_cache.Size() * sizeof(bst_float),
@@ -889,6 +896,7 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
     if (grid_size <= 0) {
       return;
     }
+    dh::safe_cuda(cudaSetDevice(shard->device_id));
     SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>
         (shard->row_stride, d_ridx, d_gidx, null_gidx_value, d_node_hist.data(), d_gpair,
          segment_begin, n_elements);
@@ -923,6 +931,7 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
 template <typename GradientSumT>
 inline void DeviceShard<GradientSumT>::InitCompressedData(
     const common::HistCutMatrix& hmat, const SparsePage& row_batch) {
+  dh::safe_cuda(cudaSetDevice(device_id));
   n_bins = hmat.NumBins();
   null_gidx_value = hmat.NumBins();
 
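Why every DeviceShard entry point above re-pins its device: the active device is per host thread, and these methods may run on whichever worker thread the multi-GPU executor assigns, so a cudaSetDevice issued earlier on a different thread does not carry over. A standalone sketch of that thread-affinity behavior (plain std::thread for illustration, not the repository's actual executor; error checking omitted):

#include <cuda_runtime.h>
#include <thread>
#include <vector>

// Each worker handles one shard/device. The cudaSetDevice at the top of
// the task is mandatory: a new thread starts on device 0 regardless of
// what any other thread selected.
void ShardTask(int device_idx) {
  cudaSetDevice(device_idx);    // the kind of call this commit adds back
  void* d_buf = nullptr;
  cudaMalloc(&d_buf, 1 << 20);  // now guaranteed to land on device_idx
  cudaFree(d_buf);
}

int main() {
  int n_devices = 0;
  cudaGetDeviceCount(&n_devices);
  std::vector<std::thread> workers;
  for (int i = 0; i < n_devices; ++i) workers.emplace_back(ShardTask, i);
  for (auto& t : workers) t.join();
  return 0;
}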
