diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 09deb575f2414..a3b252598582b 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -654,10 +654,17 @@ struct GPUContext::Impl { } void AddStreamCallback(const std::function& callback) const { - // TODO(wilber): Do we need ThreadPool? - auto* func = new std::function([this, callback] { + // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may + // launch too + // many threads and result in thread oversubscription. + auto* callback_func = new std::function(std::move(callback)); + auto* func = new std::function([this, callback_func] { std::lock_guard lock(stream_call_back_mtx_); - last_future_ = std::async(std::launch::deferred, [&]() { callback(); }); + VLOG(4) << "Stream callback"; + last_future_ = std::async(std::launch::async, [callback_func]() { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); }); #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 840c8872f50f8..06be592dd9375 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -395,6 +395,8 @@ struct ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { + VLOG(4) << "Delete cuda pinned at " << data_alloc_released; + VLOG(4) << "Delete cuda pinned at " << col_alloc_released; paddle::memory::allocation::Allocator::AllocationDeleter( data_alloc_released); paddle::memory::allocation::Allocator::AllocationDeleter(