diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 919d9af5e629..afe684fd5174 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -206,29 +206,26 @@ __global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
   }
 }
 
-template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
-inline void LaunchN(int device_idx, size_t n, L lambda) {
-  if (n == 0) {
-    return;
-  }
-
-  const int GRID_SIZE =
-      static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
-  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
-                                              lambda);
-}
-
 template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
 inline void LaunchN(int device_idx, size_t n, cudaStream_t stream, L lambda) {
   if (n == 0) {
     return;
   }
+  safe_cuda(cudaSetDevice(device_idx));
+
   const int GRID_SIZE =
       static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
   LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(static_cast<size_t>(0), n,
                                                          lambda);
 }
+
+// Default stream version
+template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
+inline void LaunchN(int device_idx, size_t n, L lambda) {
+  LaunchN(device_idx, n, nullptr, lambda);
+}
+
 
 /*
  * Memory
  */
@@ -257,6 +254,7 @@ class DVec {
     ptr_ = static_cast<T *>(ptr);
     size_ = size;
     device_idx_ = device_idx;
+    safe_cuda(cudaSetDevice(device_idx_));
   }
 
   DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
@@ -278,6 +276,7 @@ class DVec {
 
   std::vector<T> AsVector() const {
     std::vector<T> h_vector(Size());
+    safe_cuda(cudaSetDevice(device_idx_));
     safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
                          cudaMemcpyDeviceToHost));
     return h_vector;
@@ -314,6 +313,7 @@ class DVec {
       throw std::runtime_error(
          "Cannot copy assign DVec to DVec, sizes are different");
     }
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (other.DeviceIdx() == this->DeviceIdx()) {
       dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
                                     other.Size() * sizeof(T),
@@ -331,6 +331,7 @@ class DVec {
   template <typename IterT>
   void copy(IterT begin, IterT end) {
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (end - begin != Size()) {
       LOG(FATAL) << "Cannot copy assign vector to DVec, sizes are different"
                  << " vector::Size(): " << end - begin << " DVec::Size(): " << Size();
     }
@@ -339,6 +340,7 @@ class DVec {
 
   void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
+    safe_cuda(cudaSetDevice(this->DeviceIdx()));
     if (end - begin != Size()) {
       throw std::runtime_error(
           "Cannot copy assign vector to dvec, sizes are different");
     }
@@ -436,6 +438,7 @@ class BulkAllocator {
 
   char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
     char *ptr;
+    safe_cuda(cudaSetDevice(device_idx));
     safe_cuda(cudaMalloc(&ptr, bytes));
     return ptr;
   }
@@ -477,6 +480,7 @@ class BulkAllocator {
   ~BulkAllocator() {
     for (size_t i = 0; i < d_ptr_.size(); i++) {
       if (!(d_ptr_[i] == nullptr)) {
+        safe_cuda(cudaSetDevice(device_idx_[i]));
         safe_cuda(cudaFree(d_ptr_[i]));
         d_ptr_[i] = nullptr;
       }
@@ -681,6 +685,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                         OffsetT count, SegmentIterT segments,
                         OffsetT num_segments, FunctionT f) {
   typedef typename cub::CubVector<OffsetT, 2>::Type CoordinateT;
+  dh::safe_cuda(cudaSetDevice(device_idx));
   const int BLOCK_THREADS = 256;
   const int ITEMS_PER_THREAD = 1;
   const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index b0f382a7174e..5a5f8990172a 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -290,6 +290,7 @@ class DeviceHistogram {
   }
 
   void Reset() {
+    dh::safe_cuda(cudaSetDevice(device_id_));
     dh::safe_cuda(cudaMemsetAsync(
         data_.data().get(), 0,
         data_.size() * sizeof(typename decltype(data_)::value_type)));
@@ -307,6 +308,7 @@ class DeviceHistogram {
     if (HistogramExists(nidx)) return;
     size_t current_size = nidx_map_.size() * n_bins_ *
                           2;  // Number of items currently used in data
+    dh::safe_cuda(cudaSetDevice(device_id_));
     if (data_.size() >= kStopGrowingSize) {
       // Recycle histogram memory
       std::pair<int, size_t> old_entry = *nidx_map_.begin();
@@ -451,7 +453,7 @@ void SortPosition(dh::CubMemory* temp_memory, common::Span<int> position,
                   common::Span<int> position_out, common::Span<bst_uint> ridx,
                   common::Span<bst_uint> ridx_out, int left_nidx,
                   int right_nidx, int64_t* d_left_count,
-                  cudaStream_t stream = 0) {
+                  cudaStream_t stream = nullptr) {
   auto d_position_out = position_out.data();
   auto d_position_in = position.data();
   auto d_ridx_out = ridx_out.data();
@@ -579,6 +581,7 @@ struct DeviceShard {
 
   /* Init row_ptrs and row_stride */
   void InitRowPtrs(const SparsePage& row_batch) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     const auto& offset_vec = row_batch.offset.HostVector();
     row_ptrs.resize(n_rows + 1);
     thrust::copy(offset_vec.data() + row_begin_idx,
@@ -636,6 +639,7 @@ struct DeviceShard {
 
   // Reset values for each update iteration
   void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     position.CurrentDVec().Fill(0);
     std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
               GradientPair());
@@ -660,6 +664,7 @@ struct DeviceShard {
                    common::ColumnSampler* column_sampler,
                    const std::vector<ValueConstraint>& value_constraints,
                    size_t num_columns) {
+    dh::safe_cuda(cudaSetDevice(device_id));
    auto result = pinned_memory.GetSpan<DeviceSplitCandidate>(nidxs.size());
 
     // Work out cub temporary memory requirement
@@ -755,6 +760,7 @@ struct DeviceShard {
                       int64_t split_gidx, bool default_dir_left, bool is_dense,
                       int fidx_begin,   // cut.row_ptr[fidx]
                       int fidx_end) {   // cut.row_ptr[fidx + 1]
+    dh::safe_cuda(cudaSetDevice(device_id));
     Segment segment = ridx_segments[nidx];
     bst_uint* d_ridx = ridx.Current();
     int* d_position = position.Current();
@@ -835,6 +841,7 @@ struct DeviceShard {
   }
 
   void UpdatePredictionCache(bst_float* out_preds_d) {
+    dh::safe_cuda(cudaSetDevice(device_id));
     if (!prediction_cache_initialised) {
       dh::safe_cuda(cudaMemcpyAsync(prediction_cache.Data(), out_preds_d,
                                     prediction_cache.Size() * sizeof(bst_float),
@@ -889,6 +896,7 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
     if (grid_size <= 0) {
       return;
     }
+    dh::safe_cuda(cudaSetDevice(shard->device_id));
     SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>
         (shard->row_stride, d_ridx, d_gidx, null_gidx_value,
          d_node_hist.data(), d_gpair, segment_begin, n_elements);
@@ -923,6 +931,7 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
 
 template <typename GradientSumT>
 inline void DeviceShard<GradientSumT>::InitCompressedData(
     const common::HistCutMatrix& hmat, const SparsePage& row_batch) {
+  dh::safe_cuda(cudaSetDevice(device_id));
  n_bins = hmat.NumBins();
   null_gidx_value = hmat.NumBins();
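
Note on the pattern: every hunk above applies the same fix. The CUDA runtime directs cudaMalloc, cudaMemcpy*/cudaMemset*, cudaFree, and kernel launches at the calling thread's *current* device, so any function that may run after another shard has switched devices must call safe_cuda(cudaSetDevice(...)) before its first CUDA call. Below is a minimal standalone sketch of that failure mode and of an RAII variant of the fix; the SAFE_CUDA macro and DeviceGuard class are hypothetical stand-ins for illustration, not part of this patch (which uses dh::safe_cuda and an explicit cudaSetDevice at each entry point).

// device_guard_sketch.cu -- illustrative sketch only; SAFE_CUDA and
// DeviceGuard are hypothetical helpers, not part of the patch above.
// Build: nvcc device_guard_sketch.cu -o device_guard_sketch
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal stand-in for dh::safe_cuda: abort on any CUDA runtime error.
#define SAFE_CUDA(call)                                           \
  do {                                                            \
    cudaError_t err = (call);                                     \
    if (err != cudaSuccess) {                                     \
      fprintf(stderr, "CUDA error '%s' at %s:%d\n",               \
              cudaGetErrorString(err), __FILE__, __LINE__);       \
      exit(1);                                                    \
    }                                                             \
  } while (0)

// RAII alternative to the explicit cudaSetDevice calls in the patch:
// makes device_idx current for the enclosing scope, then restores the
// previously current device on scope exit.
class DeviceGuard {
 public:
  explicit DeviceGuard(int device_idx) {
    SAFE_CUDA(cudaGetDevice(&previous_));
    SAFE_CUDA(cudaSetDevice(device_idx));
  }
  ~DeviceGuard() { cudaSetDevice(previous_); }
 private:
  int previous_;
};

__global__ void FillKernel(float* data, int n, float value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

int main() {
  int n_devices = 0;
  SAFE_CUDA(cudaGetDeviceCount(&n_devices));
  const int n = 1 << 20;
  for (int d = 0; d < n_devices; ++d) {
    DeviceGuard guard(d);  // without this, everything below would land on
                           // whichever device happened to be current
    float* buf = nullptr;
    SAFE_CUDA(cudaMalloc(&buf, n * sizeof(float)));      // allocated on d
    FillKernel<<<(n + 255) / 256, 256>>>(buf, n, 1.0f);  // launched on d
    SAFE_CUDA(cudaDeviceSynchronize());
    SAFE_CUDA(cudaFree(buf));                            // freed on d
  }
  printf("touched %d device(s)\n", n_devices);
  return 0;
}

A guard restores the previous device when the scope ends; the patch instead sets the device explicitly at the top of each device-touching function, which keeps every call site self-sufficient regardless of which thread or shard ran last.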