diff --git a/libs/pika/CMakeLists.txt b/libs/pika/CMakeLists.txt index 1d16d4a76..9db14e4d9 100644 --- a/libs/pika/CMakeLists.txt +++ b/libs/pika/CMakeLists.txt @@ -16,7 +16,6 @@ set(_pika_modules assertion async_base async_cuda - async_cuda_base async_mpi command_line_handling concepts diff --git a/libs/pika/async_cuda/CMakeLists.txt b/libs/pika/async_cuda/CMakeLists.txt index 530080a36..e9c349a2d 100644 --- a/libs/pika/async_cuda/CMakeLists.txt +++ b/libs/pika/async_cuda/CMakeLists.txt @@ -10,17 +10,39 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +# Default location is $PIKA_ROOT/libs/async_cuda/include set(async_cuda_headers + pika/async_cuda/cublas_exception.hpp + pika/async_cuda/cublas_handle.hpp + pika/async_cuda/cuda_device_scope.hpp + pika/async_cuda/cuda_event.hpp pika/async_cuda/cuda_polling_helper.hpp pika/async_cuda/cuda_pool.hpp pika/async_cuda/cuda_scheduler.hpp pika/async_cuda/cuda_scheduler_bulk.hpp + pika/async_cuda/cuda_stream.hpp + pika/async_cuda/cusolver_exception.hpp + pika/async_cuda/cusolver_handle.hpp + pika/async_cuda/custom_blas_api.hpp + pika/async_cuda/custom_lapack_api.hpp + pika/async_cuda/detail/cuda_debug.hpp pika/async_cuda/detail/cuda_event_callback.hpp pika/async_cuda/then_on_host.hpp pika/async_cuda/then_with_stream.hpp ) -set(async_cuda_sources cuda_event_callback.cpp cuda_pool.cpp cuda_scheduler.cpp) +set(async_cuda_sources + cublas_exception.cpp + cublas_handle.cpp + cuda_device_scope.cpp + cuda_event_callback.cpp + cuda_pool.cpp + cuda_scheduler.cpp + cuda_stream.cpp + cusolver_exception.cpp + cusolver_handle.cpp + then_with_stream.cpp +) if(PIKA_WITH_HIP) set(async_cuda_extra_deps roc::rocblas roc::rocsolver) @@ -37,18 +59,17 @@ pika_add_module( SOURCES ${async_cuda_sources} HEADERS ${async_cuda_headers} MODULE_DEPENDENCIES + pika_allocator_support pika_assertion pika_async_base - pika_async_cuda_base pika_concurrency pika_config - pika_coroutines + pika_debugging pika_errors - pika_execution pika_execution_base + pika_memory pika_runtime pika_threading_base - pika_topology DEPENDENCIES ${async_cuda_extra_deps} CMAKE_SUBDIRS examples tests ) diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_exception.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cublas_exception.hpp similarity index 94% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_exception.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/cublas_exception.hpp index 67beb782c..bf0c10a6d 100644 --- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_exception.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cublas_exception.hpp @@ -7,7 +7,7 @@ #pragma once #include -#include +#include #include #include diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cublas_handle.hpp similarity index 95% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/cublas_handle.hpp index 48b0eda5c..31aa1ba34 100644 --- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cublas_handle.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cublas_handle.hpp @@ -7,8 +7,8 @@ #pragma once #include -#include -#include +#include +#include #include #include diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_device_scope.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_device_scope.hpp similarity index 100% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_device_scope.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/cuda_device_scope.hpp diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp new file mode 100644 index 000000000..2477f8d51 --- /dev/null +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp @@ -0,0 +1,72 @@ +// Copyright (c) 2020 John Biddiscombe +// Copyright (c) 2020 Teodor Nikolov +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include + +namespace pika::cuda::experimental::detail { + + // a pool of cudaEvent_t objects. + // Since allocation of a cuda event passes into the cuda runtime + // it might be an expensive operation, so we pre-allocate a pool + // of them at startup. + struct cuda_event_pool + { + static constexpr int initial_events_in_pool = 128; + + static cuda_event_pool& get_event_pool() + { + static cuda_event_pool event_pool_; + return event_pool_; + } + + // create a bunch of events on initialization + cuda_event_pool() + : free_list_(initial_events_in_pool) + { + for (int i = 0; i < initial_events_in_pool; ++i) { add_event_to_pool(); } + } + + // on destruction, all objects in stack will be freed + ~cuda_event_pool() + { + whip::event_t event; + bool ok = true; + while (ok) + { + ok = free_list_.pop(event); + if (ok) whip::event_destroy(event); + } + } + + inline bool pop(whip::event_t& event) + { + // pop an event off the pool, if that fails, create a new one + while (!free_list_.pop(event)) { add_event_to_pool(); } + return true; + } + + inline bool push(whip::event_t event) { return free_list_.push(event); } + + private: + void add_event_to_pool() + { + whip::event_t event; + // Create an cuda_event to query a CUDA/CUBLAS kernel for completion. + // Timing is disabled for performance. [1] + // + // [1]: CUDA Runtime API, section 5.5 cuda_event Management + whip::event_create_with_flags(&event, whip::event_disable_timing); + free_list_.push(event); + } + + // pool is dynamically sized and can grow if needed + boost::lockfree::stack> free_list_; + }; +} // namespace pika::cuda::experimental::detail diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp index 86e1cabd8..fe8459d2d 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp @@ -7,9 +7,7 @@ #pragma once #include -#include -#include -#include +#include #include #include @@ -18,43 +16,10 @@ #include #include #include -#include #include #include namespace pika::cuda::experimental { - class locked_cublas_handle - { - cublas_handle& handle; - std::unique_lock mutex; - - public: - PIKA_EXPORT locked_cublas_handle( - cublas_handle& handle, std::unique_lock&& mutex); - locked_cublas_handle(locked_cublas_handle&&) = delete; - locked_cublas_handle(locked_cublas_handle const&) = delete; - locked_cublas_handle& operator=(locked_cublas_handle&&) = delete; - locked_cublas_handle& operator=(locked_cublas_handle const&) = delete; - - PIKA_EXPORT cublas_handle const& get() noexcept; - }; - - class locked_cusolver_handle - { - cusolver_handle& handle; - std::unique_lock mutex; - - public: - PIKA_EXPORT locked_cusolver_handle( - cusolver_handle& handle, std::unique_lock&& mutex); - locked_cusolver_handle(locked_cusolver_handle&&) = delete; - locked_cusolver_handle(locked_cusolver_handle const&) = delete; - locked_cusolver_handle& operator=(locked_cusolver_handle&&) = delete; - locked_cusolver_handle& operator=(locked_cusolver_handle const&) = delete; - - PIKA_EXPORT cusolver_handle const& get() noexcept; - }; - /// A pool of CUDA streams, used for scheduling work on a CUDA device. /// /// The pool initializes a set of CUDA (thread-local) streams on @@ -82,48 +47,11 @@ namespace pika::cuda::experimental { PIKA_EXPORT cuda_stream const& get_next_stream(); }; - struct cublas_handles_holder - { - std::size_t const concurrency; - std::vector unsynchronized_handles; - std::atomic synchronized_handle_index; - std::vector synchronized_handles; - std::vector handle_mutexes; - - PIKA_EXPORT cublas_handles_holder(); - cublas_handles_holder(cublas_handles_holder&&) = delete; - cublas_handles_holder(cublas_handles_holder const&) = delete; - cublas_handles_holder& operator=(cublas_handles_holder&&) = delete; - cublas_handles_holder& operator=(cublas_handles_holder const&) = delete; - - PIKA_EXPORT locked_cublas_handle get_locked_handle( - cuda_stream const& stream, cublasPointerMode_t pointer_mode); - }; - - struct cusolver_handles_holder - { - std::size_t const concurrency; - std::vector unsynchronized_handles; - std::atomic synchronized_handle_index; - std::vector synchronized_handles; - std::vector handle_mutexes; - - PIKA_EXPORT cusolver_handles_holder(); - cusolver_handles_holder(cusolver_handles_holder&&) = delete; - cusolver_handles_holder(cusolver_handles_holder const&) = delete; - cusolver_handles_holder& operator=(cusolver_handles_holder&&) = delete; - cusolver_handles_holder& operator=(cusolver_handles_holder const&) = delete; - - PIKA_EXPORT locked_cusolver_handle get_locked_handle(cuda_stream const& stream); - }; - struct pool_data { int device; streams_holder normal_priority_streams; streams_holder high_priority_streams; - cublas_handles_holder cublas_handles; - cusolver_handles_holder cusolver_handles; PIKA_EXPORT pool_data(int device, std::size_t num_normal_priority_streams_per_thread, std::size_t num_high_priority_streams_per_thread, unsigned int flags); @@ -152,9 +80,6 @@ namespace pika::cuda::experimental { PIKA_EXPORT explicit operator bool() noexcept; PIKA_EXPORT cuda_stream const& get_next_stream( pika::execution::thread_priority priority = pika::execution::thread_priority::normal); - PIKA_EXPORT locked_cublas_handle get_cublas_handle( - cuda_stream const& stream, cublasPointerMode_t pointer_mode); - PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream); /// \cond NOINTERNAL friend bool operator==(cuda_pool const& lhs, cuda_pool const& rhs) noexcept diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp index 4d0c787cc..33fc052a4 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -40,9 +40,6 @@ namespace pika::cuda::experimental { PIKA_EXPORT cuda_pool const& get_pool() const noexcept; PIKA_EXPORT cuda_stream const& get_next_stream(); - PIKA_EXPORT locked_cublas_handle get_cublas_handle( - cuda_stream const& stream, cublasPointerMode_t pointer_mode); - PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream); /// \cond NOINTERNAL friend bool operator==(cuda_scheduler const& lhs, cuda_scheduler const& rhs) diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler_bulk.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler_bulk.hpp index 20cc78950..80c281282 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler_bulk.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler_bulk.hpp @@ -8,8 +8,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_stream.hpp similarity index 100% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_stream.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/cuda_stream.hpp diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_exception.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cusolver_exception.hpp similarity index 94% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_exception.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/cusolver_exception.hpp index 85add9d52..5e7b37d3e 100644 --- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_exception.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cusolver_exception.hpp @@ -8,7 +8,7 @@ #include #if defined(PIKA_HAVE_GPU_SUPPORT) -# include +# include # include # include diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cusolver_handle.hpp similarity index 95% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/cusolver_handle.hpp index 302492501..fe62be58e 100644 --- a/libs/pika/async_cuda_base/include/pika/async_cuda_base/cusolver_handle.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cusolver_handle.hpp @@ -8,8 +8,8 @@ #include #if defined(PIKA_HAVE_GPU_SUPPORT) -# include -# include +# include +# include # include # include diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/custom_blas_api.hpp b/libs/pika/async_cuda/include/pika/async_cuda/custom_blas_api.hpp similarity index 100% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/custom_blas_api.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/custom_blas_api.hpp diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/custom_lapack_api.hpp b/libs/pika/async_cuda/include/pika/async_cuda/custom_lapack_api.hpp similarity index 100% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/custom_lapack_api.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/custom_lapack_api.hpp diff --git a/libs/pika/async_cuda_base/include/pika/async_cuda_base/detail/cuda_debug.hpp b/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_debug.hpp similarity index 100% rename from libs/pika/async_cuda_base/include/pika/async_cuda_base/detail/cuda_debug.hpp rename to libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_debug.hpp diff --git a/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_event_callback.hpp b/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_event_callback.hpp index d7fb0f6e4..7719feace 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_event_callback.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_event_callback.hpp @@ -15,7 +15,7 @@ #pragma once #include -#include +#include #include #include diff --git a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp index 58329a67f..4633a5bb9 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp @@ -8,12 +8,12 @@ #include #include +#include #include +#include +#include +#include #include -#include -#include -#include -#include #include #include #include @@ -35,23 +35,29 @@ #include namespace pika::cuda::experimental::then_with_stream_detail { + PIKA_EXPORT pika::cuda::experimental::cublas_handle const& get_thread_local_cublas_handle( + cuda_stream const& stream, cublasPointerMode_t pointer_mode); + template - auto invoke_with_thread_local_cublas_handle(cuda_scheduler& sched, cuda_stream const& stream, - cublasPointerMode_t pointer_mode, F&& f, Ts&&... ts) + auto invoke_with_thread_local_cublas_handle( + cuda_stream const& stream, cublasPointerMode_t pointer_mode, F&& f, Ts&&... ts) -> decltype(PIKA_INVOKE( PIKA_FORWARD(F, f), std::declval(), PIKA_FORWARD(Ts, ts)...)) { - auto locked_handle = sched.get_cublas_handle(stream, pointer_mode); - return PIKA_INVOKE(PIKA_FORWARD(F, f), locked_handle.get().get(), PIKA_FORWARD(Ts, ts)...); + return PIKA_INVOKE(PIKA_FORWARD(F, f), + get_thread_local_cublas_handle(stream, pointer_mode).get(), PIKA_FORWARD(Ts, ts)...); } + PIKA_EXPORT pika::cuda::experimental::cusolver_handle const& get_thread_local_cusolver_handle( + cuda_stream const& stream); + template - auto invoke_with_thread_local_cusolver_handle(cuda_scheduler& sched, cuda_stream const& stream, - F&& f, Ts&&... ts) -> decltype(PIKA_INVOKE(PIKA_FORWARD(F, f), - std::declval(), PIKA_FORWARD(Ts, ts)...)) + auto invoke_with_thread_local_cusolver_handle(cuda_stream const& stream, F&& f, Ts&&... ts) + -> decltype(PIKA_INVOKE( + PIKA_FORWARD(F, f), std::declval(), PIKA_FORWARD(Ts, ts)...)) { - auto locked_handle = sched.get_cusolver_handle(stream); - return PIKA_INVOKE(PIKA_FORWARD(F, f), locked_handle.get().get(), PIKA_FORWARD(Ts, ts)...); + return PIKA_INVOKE(PIKA_FORWARD(F, f), get_thread_local_cusolver_handle(stream).get(), + PIKA_FORWARD(Ts, ts)...); } template @@ -151,11 +157,11 @@ namespace pika::cuda::experimental::then_with_stream_detail { #if defined(PIKA_HAVE_STDEXEC) template - requires std::is_invocable_v>...> using invoke_result_helper = pika::execution::experimental::completion_signatures>...>>>; using completion_signatures = @@ -171,7 +177,7 @@ namespace pika::cuda::experimental::then_with_stream_detail { template