From 8c72e52f0e8c61acd423a875201d00c47b27a21f Mon Sep 17 00:00:00 2001 From: Mikael Simberg Date: Tue, 6 Feb 2024 18:54:24 +0000 Subject: [PATCH] Merge pull request #1017 from msimberg/fix-cublas-handles-cuda-pool Unrevert #872 --- libs/pika/CMakeLists.txt | 1 + libs/pika/async_cuda/CMakeLists.txt | 31 +---- .../include/pika/async_cuda/cuda_event.hpp | 72 ----------- .../include/pika/async_cuda/cuda_pool.hpp | 77 +++++++++++- .../pika/async_cuda/cuda_scheduler.hpp | 5 +- .../pika/async_cuda/cuda_scheduler_bulk.hpp | 2 +- .../async_cuda/detail/cuda_event_callback.hpp | 2 +- .../pika/async_cuda/then_with_stream.hpp | 72 ++++++----- .../async_cuda/src/cuda_event_callback.cpp | 8 +- libs/pika/async_cuda/src/cuda_pool.cpp | 114 +++++++++++++++++- libs/pika/async_cuda/src/cuda_scheduler.cpp | 11 ++ libs/pika/async_cuda/src/then_with_stream.cpp | 37 ------ .../pika/async_cuda/tests/unit/CMakeLists.txt | 7 +- .../tests/unit/cuda_device_reset.cpp | 81 +++++++++++++ libs/pika/async_cuda_base/CMakeLists.txt | 54 +++++++++ libs/pika/async_cuda_base/README.rst | 13 ++ libs/pika/async_cuda_base/docs/index.rst | 14 +++ .../async_cuda_base/examples/CMakeLists.txt | 18 +++ .../async_cuda_base}/cublas_exception.hpp | 2 +- .../pika/async_cuda_base}/cublas_handle.hpp | 4 +- .../async_cuda_base}/cuda_device_scope.hpp | 0 .../pika/async_cuda_base/cuda_event.hpp | 44 +++++++ .../pika/async_cuda_base}/cuda_stream.hpp | 0 .../async_cuda_base}/cusolver_exception.hpp | 2 +- .../pika/async_cuda_base}/cusolver_handle.hpp | 4 +- .../pika/async_cuda_base}/custom_blas_api.hpp | 0 .../async_cuda_base}/custom_lapack_api.hpp | 0 .../async_cuda_base}/detail/cuda_debug.hpp | 0 .../src/cublas_exception.cpp | 4 +- .../src/cublas_handle.cpp | 10 +- .../src/cuda_device_scope.cpp | 2 +- libs/pika/async_cuda_base/src/cuda_event.cpp | 73 +++++++++++ .../src/cuda_stream.cpp | 4 +- .../src/cusolver_exception.cpp | 4 +- .../src/cusolver_handle.cpp | 10 +- .../pika/async_cuda_base/tests/CMakeLists.txt | 44 +++++++ .../tests/performance/CMakeLists.txt | 5 + .../tests/regressions/CMakeLists.txt | 5 + .../async_cuda_base/tests/unit/CMakeLists.txt | 34 ++++++ .../tests/unit/cublas_handle.cu | 0 .../tests/unit/cuda_stream.cu | 0 .../tests/unit/cusolver_handle.cu | 0 libs/pika/include/include/pika/cuda.hpp | 1 + libs/pika/runtime/CMakeLists.txt | 4 + libs/pika/runtime/src/runtime.cpp | 8 ++ 45 files changed, 674 insertions(+), 209 deletions(-) delete mode 100644 libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp delete mode 100644 libs/pika/async_cuda/src/then_with_stream.cpp create mode 100644 libs/pika/async_cuda/tests/unit/cuda_device_reset.cpp create mode 100644 libs/pika/async_cuda_base/CMakeLists.txt create mode 100644 libs/pika/async_cuda_base/README.rst create mode 100644 libs/pika/async_cuda_base/docs/index.rst create mode 100644 libs/pika/async_cuda_base/examples/CMakeLists.txt rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/cublas_exception.hpp (94%) rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/cublas_handle.hpp (95%) rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/cuda_device_scope.hpp (100%) create mode 100644 libs/pika/async_cuda_base/include/pika/async_cuda_base/cuda_event.hpp rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/cuda_stream.hpp (100%) rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/cusolver_exception.hpp (94%) rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/cusolver_handle.hpp (95%) rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/custom_blas_api.hpp (100%) rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/custom_lapack_api.hpp (100%) rename libs/pika/{async_cuda/include/pika/async_cuda => async_cuda_base/include/pika/async_cuda_base}/detail/cuda_debug.hpp (100%) rename libs/pika/{async_cuda => async_cuda_base}/src/cublas_exception.cpp (97%) rename libs/pika/{async_cuda => async_cuda_base}/src/cublas_handle.cpp (91%) rename libs/pika/{async_cuda => async_cuda_base}/src/cuda_device_scope.cpp (92%) create mode 100644 libs/pika/async_cuda_base/src/cuda_event.cpp rename libs/pika/{async_cuda => async_cuda_base}/src/cuda_stream.cpp (96%) rename libs/pika/{async_cuda => async_cuda_base}/src/cusolver_exception.cpp (97%) rename libs/pika/{async_cuda => async_cuda_base}/src/cusolver_handle.cpp (90%) create mode 100644 libs/pika/async_cuda_base/tests/CMakeLists.txt create mode 100644 libs/pika/async_cuda_base/tests/performance/CMakeLists.txt create mode 100644 libs/pika/async_cuda_base/tests/regressions/CMakeLists.txt create mode 100644 libs/pika/async_cuda_base/tests/unit/CMakeLists.txt rename libs/pika/{async_cuda => async_cuda_base}/tests/unit/cublas_handle.cu (100%) rename libs/pika/{async_cuda => async_cuda_base}/tests/unit/cuda_stream.cu (100%) rename libs/pika/{async_cuda => async_cuda_base}/tests/unit/cusolver_handle.cu (100%) diff --git a/libs/pika/CMakeLists.txt b/libs/pika/CMakeLists.txt index 9db14e4d9..1d16d4a76 100644 --- a/libs/pika/CMakeLists.txt +++ b/libs/pika/CMakeLists.txt @@ -16,6 +16,7 @@ set(_pika_modules assertion async_base async_cuda + async_cuda_base async_mpi command_line_handling concepts diff --git a/libs/pika/async_cuda/CMakeLists.txt b/libs/pika/async_cuda/CMakeLists.txt index e9c349a2d..530080a36 100644 --- a/libs/pika/async_cuda/CMakeLists.txt +++ b/libs/pika/async_cuda/CMakeLists.txt @@ -10,39 +10,17 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -# Default location is $PIKA_ROOT/libs/async_cuda/include set(async_cuda_headers - pika/async_cuda/cublas_exception.hpp - pika/async_cuda/cublas_handle.hpp - pika/async_cuda/cuda_device_scope.hpp - pika/async_cuda/cuda_event.hpp pika/async_cuda/cuda_polling_helper.hpp pika/async_cuda/cuda_pool.hpp pika/async_cuda/cuda_scheduler.hpp pika/async_cuda/cuda_scheduler_bulk.hpp - pika/async_cuda/cuda_stream.hpp - pika/async_cuda/cusolver_exception.hpp - pika/async_cuda/cusolver_handle.hpp - pika/async_cuda/custom_blas_api.hpp - pika/async_cuda/custom_lapack_api.hpp - pika/async_cuda/detail/cuda_debug.hpp pika/async_cuda/detail/cuda_event_callback.hpp pika/async_cuda/then_on_host.hpp pika/async_cuda/then_with_stream.hpp ) -set(async_cuda_sources - cublas_exception.cpp - cublas_handle.cpp - cuda_device_scope.cpp - cuda_event_callback.cpp - cuda_pool.cpp - cuda_scheduler.cpp - cuda_stream.cpp - cusolver_exception.cpp - cusolver_handle.cpp - then_with_stream.cpp -) +set(async_cuda_sources cuda_event_callback.cpp cuda_pool.cpp cuda_scheduler.cpp) if(PIKA_WITH_HIP) set(async_cuda_extra_deps roc::rocblas roc::rocsolver) @@ -59,17 +37,18 @@ pika_add_module( SOURCES ${async_cuda_sources} HEADERS ${async_cuda_headers} MODULE_DEPENDENCIES - pika_allocator_support pika_assertion pika_async_base + pika_async_cuda_base pika_concurrency pika_config - pika_debugging + pika_coroutines pika_errors + pika_execution pika_execution_base - pika_memory pika_runtime pika_threading_base + pika_topology DEPENDENCIES ${async_cuda_extra_deps} CMAKE_SUBDIRS examples tests ) diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp deleted file mode 100644 index 2477f8d51..000000000 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2020 John Biddiscombe -// Copyright (c) 2020 Teodor Nikolov -// -// SPDX-License-Identifier: BSL-1.0 -// Distributed under the Boost Software License, Version 1.0. (See accompanying -// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - -#pragma once - -#include -#include - -namespace pika::cuda::experimental::detail { - - // a pool of cudaEvent_t objects. - // Since allocation of a cuda event passes into the cuda runtime - // it might be an expensive operation, so we pre-allocate a pool - // of them at startup. - struct cuda_event_pool - { - static constexpr int initial_events_in_pool = 128; - - static cuda_event_pool& get_event_pool() - { - static cuda_event_pool event_pool_; - return event_pool_; - } - - // create a bunch of events on initialization - cuda_event_pool() - : free_list_(initial_events_in_pool) - { - for (int i = 0; i < initial_events_in_pool; ++i) { add_event_to_pool(); } - } - - // on destruction, all objects in stack will be freed - ~cuda_event_pool() - { - whip::event_t event; - bool ok = true; - while (ok) - { - ok = free_list_.pop(event); - if (ok) whip::event_destroy(event); - } - } - - inline bool pop(whip::event_t& event) - { - // pop an event off the pool, if that fails, create a new one - while (!free_list_.pop(event)) { add_event_to_pool(); } - return true; - } - - inline bool push(whip::event_t event) { return free_list_.push(event); } - - private: - void add_event_to_pool() - { - whip::event_t event; - // Create an cuda_event to query a CUDA/CUBLAS kernel for completion. - // Timing is disabled for performance. [1] - // - // [1]: CUDA Runtime API, section 5.5 cuda_event Management - whip::event_create_with_flags(&event, whip::event_disable_timing); - free_list_.push(event); - } - - // pool is dynamically sized and can grow if needed - boost::lockfree::stack> free_list_; - }; -} // namespace pika::cuda::experimental::detail diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp index fe8459d2d..86e1cabd8 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp @@ -7,7 +7,9 @@ #pragma once #include -#include +#include +#include +#include #include #include @@ -16,10 +18,43 @@ #include #include #include +#include #include #include namespace pika::cuda::experimental { + class locked_cublas_handle + { + cublas_handle& handle; + std::unique_lock mutex; + + public: + PIKA_EXPORT locked_cublas_handle( + cublas_handle& handle, std::unique_lock&& mutex); + locked_cublas_handle(locked_cublas_handle&&) = delete; + locked_cublas_handle(locked_cublas_handle const&) = delete; + locked_cublas_handle& operator=(locked_cublas_handle&&) = delete; + locked_cublas_handle& operator=(locked_cublas_handle const&) = delete; + + PIKA_EXPORT cublas_handle const& get() noexcept; + }; + + class locked_cusolver_handle + { + cusolver_handle& handle; + std::unique_lock mutex; + + public: + PIKA_EXPORT locked_cusolver_handle( + cusolver_handle& handle, std::unique_lock&& mutex); + locked_cusolver_handle(locked_cusolver_handle&&) = delete; + locked_cusolver_handle(locked_cusolver_handle const&) = delete; + locked_cusolver_handle& operator=(locked_cusolver_handle&&) = delete; + locked_cusolver_handle& operator=(locked_cusolver_handle const&) = delete; + + PIKA_EXPORT cusolver_handle const& get() noexcept; + }; + /// A pool of CUDA streams, used for scheduling work on a CUDA device. /// /// The pool initializes a set of CUDA (thread-local) streams on @@ -47,11 +82,48 @@ namespace pika::cuda::experimental { PIKA_EXPORT cuda_stream const& get_next_stream(); }; + struct cublas_handles_holder + { + std::size_t const concurrency; + std::vector unsynchronized_handles; + std::atomic synchronized_handle_index; + std::vector synchronized_handles; + std::vector handle_mutexes; + + PIKA_EXPORT cublas_handles_holder(); + cublas_handles_holder(cublas_handles_holder&&) = delete; + cublas_handles_holder(cublas_handles_holder const&) = delete; + cublas_handles_holder& operator=(cublas_handles_holder&&) = delete; + cublas_handles_holder& operator=(cublas_handles_holder const&) = delete; + + PIKA_EXPORT locked_cublas_handle get_locked_handle( + cuda_stream const& stream, cublasPointerMode_t pointer_mode); + }; + + struct cusolver_handles_holder + { + std::size_t const concurrency; + std::vector unsynchronized_handles; + std::atomic synchronized_handle_index; + std::vector synchronized_handles; + std::vector handle_mutexes; + + PIKA_EXPORT cusolver_handles_holder(); + cusolver_handles_holder(cusolver_handles_holder&&) = delete; + cusolver_handles_holder(cusolver_handles_holder const&) = delete; + cusolver_handles_holder& operator=(cusolver_handles_holder&&) = delete; + cusolver_handles_holder& operator=(cusolver_handles_holder const&) = delete; + + PIKA_EXPORT locked_cusolver_handle get_locked_handle(cuda_stream const& stream); + }; + struct pool_data { int device; streams_holder normal_priority_streams; streams_holder high_priority_streams; + cublas_handles_holder cublas_handles; + cusolver_handles_holder cusolver_handles; PIKA_EXPORT pool_data(int device, std::size_t num_normal_priority_streams_per_thread, std::size_t num_high_priority_streams_per_thread, unsigned int flags); @@ -80,6 +152,9 @@ namespace pika::cuda::experimental { PIKA_EXPORT explicit operator bool() noexcept; PIKA_EXPORT cuda_stream const& get_next_stream( pika::execution::thread_priority priority = pika::execution::thread_priority::normal); + PIKA_EXPORT locked_cublas_handle get_cublas_handle( + cuda_stream const& stream, cublasPointerMode_t pointer_mode); + PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream); /// \cond NOINTERNAL friend bool operator==(cuda_pool const& lhs, cuda_pool const& rhs) noexcept diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp index 33fc052a4..4d0c787cc 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler.hpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -40,6 +40,9 @@ namespace pika::cuda::experimental { PIKA_EXPORT cuda_pool const& get_pool() const noexcept; PIKA_EXPORT cuda_stream const& get_next_stream(); + PIKA_EXPORT locked_cublas_handle get_cublas_handle( + cuda_stream const& stream, cublasPointerMode_t pointer_mode); + PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream); /// \cond NOINTERNAL friend bool operator==(cuda_scheduler const& lhs, cuda_scheduler const& rhs) diff --git a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler_bulk.hpp b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler_bulk.hpp index 80c281282..20cc78950 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler_bulk.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/cuda_scheduler_bulk.hpp @@ -8,8 +8,8 @@ #include #include -#include #include +#include #include #include #include diff --git a/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_event_callback.hpp b/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_event_callback.hpp index 7719feace..d7fb0f6e4 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_event_callback.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/detail/cuda_event_callback.hpp @@ -15,7 +15,7 @@ #pragma once #include -#include +#include #include #include diff --git a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp index 4633a5bb9..58329a67f 100644 --- a/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp +++ b/libs/pika/async_cuda/include/pika/async_cuda/then_with_stream.hpp @@ -8,12 +8,12 @@ #include #include -#include #include -#include -#include -#include #include +#include +#include +#include +#include #include #include #include @@ -35,29 +35,23 @@ #include namespace pika::cuda::experimental::then_with_stream_detail { - PIKA_EXPORT pika::cuda::experimental::cublas_handle const& get_thread_local_cublas_handle( - cuda_stream const& stream, cublasPointerMode_t pointer_mode); - template - auto invoke_with_thread_local_cublas_handle( - cuda_stream const& stream, cublasPointerMode_t pointer_mode, F&& f, Ts&&... ts) + auto invoke_with_thread_local_cublas_handle(cuda_scheduler& sched, cuda_stream const& stream, + cublasPointerMode_t pointer_mode, F&& f, Ts&&... ts) -> decltype(PIKA_INVOKE( PIKA_FORWARD(F, f), std::declval(), PIKA_FORWARD(Ts, ts)...)) { - return PIKA_INVOKE(PIKA_FORWARD(F, f), - get_thread_local_cublas_handle(stream, pointer_mode).get(), PIKA_FORWARD(Ts, ts)...); + auto locked_handle = sched.get_cublas_handle(stream, pointer_mode); + return PIKA_INVOKE(PIKA_FORWARD(F, f), locked_handle.get().get(), PIKA_FORWARD(Ts, ts)...); } - PIKA_EXPORT pika::cuda::experimental::cusolver_handle const& get_thread_local_cusolver_handle( - cuda_stream const& stream); - template - auto invoke_with_thread_local_cusolver_handle(cuda_stream const& stream, F&& f, Ts&&... ts) - -> decltype(PIKA_INVOKE( - PIKA_FORWARD(F, f), std::declval(), PIKA_FORWARD(Ts, ts)...)) + auto invoke_with_thread_local_cusolver_handle(cuda_scheduler& sched, cuda_stream const& stream, + F&& f, Ts&&... ts) -> decltype(PIKA_INVOKE(PIKA_FORWARD(F, f), + std::declval(), PIKA_FORWARD(Ts, ts)...)) { - return PIKA_INVOKE(PIKA_FORWARD(F, f), get_thread_local_cusolver_handle(stream).get(), - PIKA_FORWARD(Ts, ts)...); + auto locked_handle = sched.get_cusolver_handle(stream); + return PIKA_INVOKE(PIKA_FORWARD(F, f), locked_handle.get().get(), PIKA_FORWARD(Ts, ts)...); } template @@ -157,11 +151,11 @@ namespace pika::cuda::experimental::then_with_stream_detail { #if defined(PIKA_HAVE_STDEXEC) template - requires std::is_invocable_v>...> using invoke_result_helper = pika::execution::experimental::completion_signatures>...>>>; using completion_signatures = @@ -177,7 +171,7 @@ namespace pika::cuda::experimental::then_with_stream_detail { template