Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert #872 #1009

Merged
merged 1 commit into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion libs/pika/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ set(_pika_modules
assertion
async_base
async_cuda
async_cuda_base
async_mpi
command_line_handling
concepts
Expand Down
31 changes: 26 additions & 5 deletions libs/pika/async_cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,39 @@ endif()

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

# Default location is $PIKA_ROOT/libs/async_cuda/include
set(async_cuda_headers
pika/async_cuda/cublas_exception.hpp
pika/async_cuda/cublas_handle.hpp
pika/async_cuda/cuda_device_scope.hpp
pika/async_cuda/cuda_event.hpp
pika/async_cuda/cuda_polling_helper.hpp
pika/async_cuda/cuda_pool.hpp
pika/async_cuda/cuda_scheduler.hpp
pika/async_cuda/cuda_scheduler_bulk.hpp
pika/async_cuda/cuda_stream.hpp
pika/async_cuda/cusolver_exception.hpp
pika/async_cuda/cusolver_handle.hpp
pika/async_cuda/custom_blas_api.hpp
pika/async_cuda/custom_lapack_api.hpp
pika/async_cuda/detail/cuda_debug.hpp
pika/async_cuda/detail/cuda_event_callback.hpp
pika/async_cuda/then_on_host.hpp
pika/async_cuda/then_with_stream.hpp
)

set(async_cuda_sources cuda_event_callback.cpp cuda_pool.cpp cuda_scheduler.cpp)
set(async_cuda_sources
cublas_exception.cpp
cublas_handle.cpp
cuda_device_scope.cpp
cuda_event_callback.cpp
cuda_pool.cpp
cuda_scheduler.cpp
cuda_stream.cpp
cusolver_exception.cpp
cusolver_handle.cpp
then_with_stream.cpp
)

if(PIKA_WITH_HIP)
set(async_cuda_extra_deps roc::rocblas roc::rocsolver)
Expand All @@ -37,18 +59,17 @@ pika_add_module(
SOURCES ${async_cuda_sources}
HEADERS ${async_cuda_headers}
MODULE_DEPENDENCIES
pika_allocator_support
pika_assertion
pika_async_base
pika_async_cuda_base
pika_concurrency
pika_config
pika_coroutines
pika_debugging
pika_errors
pika_execution
pika_execution_base
pika_memory
pika_runtime
pika_threading_base
pika_topology
DEPENDENCIES ${async_cuda_extra_deps}
CMAKE_SUBDIRS examples tests
)
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#pragma once

#include <pika/config.hpp>
#include <pika/async_cuda_base/custom_blas_api.hpp>
#include <pika/async_cuda/custom_blas_api.hpp>
#include <pika/errors/exception.hpp>

#include <string>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
#pragma once

#include <pika/config.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/async_cuda_base/custom_blas_api.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/async_cuda/custom_blas_api.hpp>

#include <fmt/format.h>
#include <whip.hpp>
Expand Down
72 changes: 72 additions & 0 deletions libs/pika/async_cuda/include/pika/async_cuda/cuda_event.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Copyright (c) 2020 John Biddiscombe
// Copyright (c) 2020 Teodor Nikolov
//
// SPDX-License-Identifier: BSL-1.0
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#pragma once

#include <boost/lockfree/stack.hpp>
#include <whip.hpp>

namespace pika::cuda::experimental::detail {

// a pool of cudaEvent_t objects.
// Since allocation of a cuda event passes into the cuda runtime
// it might be an expensive operation, so we pre-allocate a pool
// of them at startup.
struct cuda_event_pool
{
static constexpr int initial_events_in_pool = 128;

static cuda_event_pool& get_event_pool()
{
static cuda_event_pool event_pool_;
return event_pool_;
}

// create a bunch of events on initialization
cuda_event_pool()
: free_list_(initial_events_in_pool)
{
for (int i = 0; i < initial_events_in_pool; ++i) { add_event_to_pool(); }
}

// on destruction, all objects in stack will be freed
~cuda_event_pool()
{
whip::event_t event;
bool ok = true;
while (ok)
{
ok = free_list_.pop(event);
if (ok) whip::event_destroy(event);
}
}

inline bool pop(whip::event_t& event)
{
// pop an event off the pool, if that fails, create a new one
while (!free_list_.pop(event)) { add_event_to_pool(); }
return true;
}

inline bool push(whip::event_t event) { return free_list_.push(event); }

private:
void add_event_to_pool()
{
whip::event_t event;
// Create an cuda_event to query a CUDA/CUBLAS kernel for completion.
// Timing is disabled for performance. [1]
//
// [1]: CUDA Runtime API, section 5.5 cuda_event Management
whip::event_create_with_flags(&event, whip::event_disable_timing);
free_list_.push(event);
}

// pool is dynamically sized and can grow if needed
boost::lockfree::stack<whip::event_t, boost::lockfree::fixed_sized<false>> free_list_;
};
} // namespace pika::cuda::experimental::detail
77 changes: 1 addition & 76 deletions libs/pika/async_cuda/include/pika/async_cuda/cuda_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
#pragma once

#include <pika/assert.hpp>
#include <pika/async_cuda_base/cublas_handle.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/async_cuda_base/cusolver_handle.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/concurrency/cache_line_data.hpp>
#include <pika/coroutines/thread_enums.hpp>

Expand All @@ -18,43 +16,10 @@
#include <atomic>
#include <cstddef>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

namespace pika::cuda::experimental {
class locked_cublas_handle
{
cublas_handle& handle;
std::unique_lock<std::mutex> mutex;

public:
PIKA_EXPORT locked_cublas_handle(
cublas_handle& handle, std::unique_lock<std::mutex>&& mutex);
locked_cublas_handle(locked_cublas_handle&&) = delete;
locked_cublas_handle(locked_cublas_handle const&) = delete;
locked_cublas_handle& operator=(locked_cublas_handle&&) = delete;
locked_cublas_handle& operator=(locked_cublas_handle const&) = delete;

PIKA_EXPORT cublas_handle const& get() noexcept;
};

class locked_cusolver_handle
{
cusolver_handle& handle;
std::unique_lock<std::mutex> mutex;

public:
PIKA_EXPORT locked_cusolver_handle(
cusolver_handle& handle, std::unique_lock<std::mutex>&& mutex);
locked_cusolver_handle(locked_cusolver_handle&&) = delete;
locked_cusolver_handle(locked_cusolver_handle const&) = delete;
locked_cusolver_handle& operator=(locked_cusolver_handle&&) = delete;
locked_cusolver_handle& operator=(locked_cusolver_handle const&) = delete;

PIKA_EXPORT cusolver_handle const& get() noexcept;
};

/// A pool of CUDA streams, used for scheduling work on a CUDA device.
///
/// The pool initializes a set of CUDA (thread-local) streams on
Expand Down Expand Up @@ -82,48 +47,11 @@ namespace pika::cuda::experimental {
PIKA_EXPORT cuda_stream const& get_next_stream();
};

struct cublas_handles_holder
{
std::size_t const concurrency;
std::vector<cublas_handle> unsynchronized_handles;
std::atomic<std::size_t> synchronized_handle_index;
std::vector<cublas_handle> synchronized_handles;
std::vector<std::mutex> handle_mutexes;

PIKA_EXPORT cublas_handles_holder();
cublas_handles_holder(cublas_handles_holder&&) = delete;
cublas_handles_holder(cublas_handles_holder const&) = delete;
cublas_handles_holder& operator=(cublas_handles_holder&&) = delete;
cublas_handles_holder& operator=(cublas_handles_holder const&) = delete;

PIKA_EXPORT locked_cublas_handle get_locked_handle(
cuda_stream const& stream, cublasPointerMode_t pointer_mode);
};

struct cusolver_handles_holder
{
std::size_t const concurrency;
std::vector<cusolver_handle> unsynchronized_handles;
std::atomic<std::size_t> synchronized_handle_index;
std::vector<cusolver_handle> synchronized_handles;
std::vector<std::mutex> handle_mutexes;

PIKA_EXPORT cusolver_handles_holder();
cusolver_handles_holder(cusolver_handles_holder&&) = delete;
cusolver_handles_holder(cusolver_handles_holder const&) = delete;
cusolver_handles_holder& operator=(cusolver_handles_holder&&) = delete;
cusolver_handles_holder& operator=(cusolver_handles_holder const&) = delete;

PIKA_EXPORT locked_cusolver_handle get_locked_handle(cuda_stream const& stream);
};

struct pool_data
{
int device;
streams_holder normal_priority_streams;
streams_holder high_priority_streams;
cublas_handles_holder cublas_handles;
cusolver_handles_holder cusolver_handles;

PIKA_EXPORT pool_data(int device, std::size_t num_normal_priority_streams_per_thread,
std::size_t num_high_priority_streams_per_thread, unsigned int flags);
Expand Down Expand Up @@ -152,9 +80,6 @@ namespace pika::cuda::experimental {
PIKA_EXPORT explicit operator bool() noexcept;
PIKA_EXPORT cuda_stream const& get_next_stream(
pika::execution::thread_priority priority = pika::execution::thread_priority::normal);
PIKA_EXPORT locked_cublas_handle get_cublas_handle(
cuda_stream const& stream, cublasPointerMode_t pointer_mode);
PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream);

/// \cond NOINTERNAL
friend bool operator==(cuda_pool const& lhs, cuda_pool const& rhs) noexcept
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <pika/assert.hpp>
#include <pika/async_base/scheduling_properties.hpp>
#include <pika/async_cuda/cuda_pool.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/concepts/concepts.hpp>
#include <pika/coroutines/thread_enums.hpp>
#include <pika/execution/algorithms/execute.hpp>
Expand Down Expand Up @@ -40,9 +40,6 @@ namespace pika::cuda::experimental {

PIKA_EXPORT cuda_pool const& get_pool() const noexcept;
PIKA_EXPORT cuda_stream const& get_next_stream();
PIKA_EXPORT locked_cublas_handle get_cublas_handle(
cuda_stream const& stream, cublasPointerMode_t pointer_mode);
PIKA_EXPORT locked_cusolver_handle get_cusolver_handle(cuda_stream const& stream);

/// \cond NOINTERNAL
friend bool operator==(cuda_scheduler const& lhs, cuda_scheduler const& rhs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

#include <pika/assert.hpp>
#include <pika/async_cuda/cuda_pool.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/async_cuda/then_with_stream.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/concepts/concepts.hpp>
#include <pika/execution/algorithms/bulk.hpp>
#include <pika/execution/algorithms/execute.hpp>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#include <pika/config.hpp>
#if defined(PIKA_HAVE_GPU_SUPPORT)
# include <pika/async_cuda_base/custom_lapack_api.hpp>
# include <pika/async_cuda/custom_lapack_api.hpp>
# include <pika/errors/exception.hpp>

# include <string>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

#include <pika/config.hpp>
#if defined(PIKA_HAVE_GPU_SUPPORT)
# include <pika/async_cuda_base/cuda_stream.hpp>
# include <pika/async_cuda_base/custom_lapack_api.hpp>
# include <pika/async_cuda/cuda_stream.hpp>
# include <pika/async_cuda/custom_lapack_api.hpp>

# include <fmt/format.h>
# include <whip.hpp>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#pragma once

#include <pika/config.hpp>
#include <pika/async_cuda_base/cuda_stream.hpp>
#include <pika/async_cuda/cuda_stream.hpp>
#include <pika/functional/unique_function.hpp>
#include <pika/threading_base/thread_pool_base.hpp>

Expand Down
Loading
Loading