Skip to content

Commit

Permalink
Merge pull request #1017 from msimberg/fix-cublas-handles-cuda-pool
Browse files Browse the repository at this point in the history
Unrevert #872
  • Loading branch information
msimberg committed Feb 6, 2024
1 parent ab325f8 commit 8c72e52
Show file tree
Hide file tree
Showing 45 changed files with 674 additions and 209 deletions.
1 change: 1 addition & 0 deletions libs/pika/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ set(_pika_modules
assertion
async_base
async_cuda
async_cuda_base
async_mpi
command_line_handling
concepts
Expand Down
31 changes: 5 additions & 26 deletions libs/pika/async_cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,39 +10,17 @@ endif()

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

# Default location is $PIKA_ROOT/libs/async_cuda/include
set(async_cuda_headers
pika/async_cuda/cublas_exception.hpp
pika/async_cuda/cublas_handle.hpp
pika/async_cuda/cuda_device_scope.hpp
pika/async_cuda/cuda_event.hpp
pika/async_cuda/cuda_polling_helper.hpp
pika/async_cuda/cuda_pool.hpp
pika/async_cuda/cuda_scheduler.hpp
pika/async_cuda/cuda_scheduler_bulk.hpp
pika/async_cuda/cuda_stream.hpp
pika/async_cuda/cusolver_exception.hpp
pika/async_cuda/cusolver_handle.hpp
pika/async_cuda/custom_blas_api.hpp
pika/async_cuda/custom_lapack_api.hpp
pika/async_cuda/detail/cuda_debug.hpp
pika/async_cuda/detail/cuda_event_callback.hpp
pika/async_cuda/then_on_host.hpp
pika/async_cuda/then_with_stream.hpp
)

set(async_cuda_sources
cublas_exception.cpp
cublas_handle.cpp
cuda_device_scope.cpp
cuda_event_callback.cpp
cuda_pool.cpp
cuda_scheduler.cpp
cuda_stream.cpp
cusolver_exception.cpp
cusolver_handle.cpp
then_with_stream.cpp
)
set(async_cuda_sources cuda_event_callback.cpp cuda_pool.cpp cuda_scheduler.cpp)

if(PIKA_WITH_HIP)
set(async_cuda_extra_deps roc::rocblas roc::rocsolver)
Expand All @@ -59,17 +37,18 @@ pika_add_module(
SOURCES ${async_cuda_sources}
HEADERS ${async_cuda_headers}
MODULE_DEPENDENCIES
pika_allocator_support
pika_assertion
pika_async_base
pika_async_cuda_base
pika_concurrency
pika_config
pika_debugging
pika_coroutines
pika_errors
pika_execution
pika_execution_base
pika_memory
pika_runtime
pika_threading_base
pika_topology
DEPENDENCIES ${async_cuda_extra_deps}
CMAKE_SUBDIRS examples tests
)
72 changes: 0 additions & 72 deletions libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp

This file was deleted.

77 changes: 76 additions & 1 deletion libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
#pragma once

#include <pika/assert.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/async_cuda_base/cublas_handle.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/async_cuda_base/cusolver_handle.hpp>
#include <pika/concurrency/cache_line_data.hpp>
#include <pika/coroutines/thread_enums.hpp>

Expand All @@ -16,10 +18,43 @@
#include <atomic>
#include <cstddef>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

namespace pika::cuda::experimental {
class locked_cublas_handle
{
cublas_handle& handle;
std::unique_lock<std::mutex> mutex;

public:
PIKA_EXPORT locked_cublas_handle(
cublas_handle& handle, std::unique_lock<std::mutex>&& mutex);
locked_cublas_handle(locked_cublas_handle&&) = delete;
locked_cublas_handle(locked_cublas_handle const&) = delete;
locked_cublas_handle& operator=(locked_cublas_handle&&) = delete;
locked_cublas_handle& operator=(locked_cublas_handle const&) = delete;

PIKA_EXPORT cublas_handle const& get() noexcept;
};

class locked_cusolver_handle
{
cusolver_handle& handle;
std::unique_lock<std::mutex> mutex;

public:
PIKA_EXPORT locked_cusolver_handle(
cusolver_handle& handle, std::unique_lock<std::mutex>&& mutex);
locked_cusolver_handle(locked_cusolver_handle&&) = delete;
locked_cusolver_handle(locked_cusolver_handle const&) = delete;
locked_cusolver_handle& operator=(locked_cusolver_handle&&) = delete;
locked_cusolver_handle& operator=(locked_cusolver_handle const&) = delete;

PIKA_EXPORT cusolver_handle const& get() noexcept;
};

/// A pool of CUDA streams, used for scheduling work on a CUDA device.
///
/// The pool initializes a set of CUDA (thread-local) streams on
Expand Down Expand Up @@ -47,11 +82,48 @@ namespace pika::cuda::experimental {
PIKA_EXPORT cuda_stream const& get_next_stream();
};

struct cublas_handles_holder
{
std::size_t const concurrency;
std::vector<cublas_handle> unsynchronized_handles;
std::atomic<std::size_t> synchronized_handle_index;
std::vector<cublas_handle> synchronized_handles;
std::vector<std::mutex> handle_mutexes;

PIKA_EXPORT cublas_handles_holder();
cublas_handles_holder(cublas_handles_holder&&) = delete;
cublas_handles_holder(cublas_handles_holder const&) = delete;
cublas_handles_holder& operator=(cublas_handles_holder&&) = delete;
cublas_handles_holder& operator=(cublas_handles_holder const&) = delete;

PIKA_EXPORT locked_cublas_handle get_locked_handle(
cuda_stream const& stream, cublasPointerMode_t pointer_mode);
};

struct cusolver_handles_holder
{
std::size_t const concurrency;
std::vector<cusolver_handle> unsynchronized_handles;
std::atomic<std::size_t> synchronized_handle_index;
std::vector<cusolver_handle> synchronized_handles;
std::vector<std::mutex> handle_mutexes;

PIKA_EXPORT cusolver_handles_holder();
cusolver_handles_holder(cusolver_handles_holder&&) = delete;
cusolver_handles_holder(cusolver_handles_holder const&) = delete;
cusolver_handles_holder& operator=(cusolver_handles_holder&&) = delete;
cusolver_handles_holder& operator=(cusolver_handles_holder const&) = delete;

PIKA_EXPORT locked_cusolver_handle get_locked_handle(cuda_stream const& stream);
};

struct pool_data
{
int device;
streams_holder normal_priority_streams;
streams_holder high_priority_streams;
cublas_handles_holder cublas_handles;
cusolver_handles_holder cusolver_handles;

PIKA_EXPORT pool_data(int device, std::size_t num_normal_priority_streams_per_thread,
std::size_t num_high_priority_streams_per_thread, unsigned int flags);
Expand Down Expand Up @@ -80,6 +152,9 @@ namespace pika::cuda::experimental {
PIKA_EXPORT explicit operator bool() noexcept;
PIKA_EXPORT cuda_stream const& get_next_stream(
pika::execution::thread_priority priority = pika::execution::thread_priority::normal);
PIKA_EXPORT locked_cublas_handle get_cublas_handle(
cuda_stream const& stream, cublasPointerMode_t pointer_mode);
PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream);

/// \cond NOINTERNAL
friend bool operator==(cuda_pool const& lhs, cuda_pool const& rhs) noexcept
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <pika/assert.hpp>
#include <pika/async_base/scheduling_properties.hpp>
#include <pika/async_cuda/cuda_pool.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/concepts/concepts.hpp>
#include <pika/coroutines/thread_enums.hpp>
#include <pika/execution/algorithms/execute.hpp>
Expand Down Expand Up @@ -40,6 +40,9 @@ namespace pika::cuda::experimental {

PIKA_EXPORT cuda_pool const& get_pool() const noexcept;
PIKA_EXPORT cuda_stream const& get_next_stream();
PIKA_EXPORT locked_cublas_handle get_cublas_handle(
cuda_stream const& stream, cublasPointerMode_t pointer_mode);
PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream);

/// \cond NOINTERNAL
friend bool operator==(cuda_scheduler const& lhs, cuda_scheduler const& rhs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

#include <pika/assert.hpp>
#include <pika/async_cuda/cuda_pool.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/async_cuda/then_with_stream.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/concepts/concepts.hpp>
#include <pika/execution/algorithms/bulk.hpp>
#include <pika/execution/algorithms/execute.hpp>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#pragma once

#include <pika/config.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/functional/unique_function.hpp>
#include <pika/threading_base/thread_pool_base.hpp>

Expand Down
Loading

0 comments on commit 8c72e52

Please sign in to comment.