[alpaka] Support all alpaka backends at the same time (#357)
Update alpaka to the develop branch as of 2022.04.27 (879b95ffce2).
Use new pinned host memory functionality.
Add forward declaration for alpaka templates and types.
Support serial, TBB, CUDA and ROCm at the same time, with static splitting of event streams across multiple backends.
Autogenerate plugins.txt.
fwyzard authored May 1, 2022
2 parents 13cbf58 + f56ac8b commit 859ffeb
Showing 30 changed files with 788 additions and 457 deletions.
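The static splitting of event streams across backends mentioned in the commit message is implemented in files not excerpted below. Purely as an illustration, here is a minimal hypothetical C++ sketch of a weight-based static split; the struct, function and weighting scheme are assumptions, not code from this commit:

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical helper, not part of this commit: each enabled backend gets a
// fixed share of the event streams up front ("static" splitting), with no
// migration of streams between backends at run time.
// Assumes at least one backend with a positive weight.
struct BackendShare {
  std::string name;  // e.g. "serial", "tbb", "cuda", "rocm"
  float weight;      // relative share of streams for this backend
};

std::vector<std::string> splitStreams(std::size_t numStreams, std::vector<BackendShare> const& backends) {
  std::vector<std::string> owner(numStreams);
  float total = 0.f;
  for (auto const& b : backends)
    total += b.weight;
  std::size_t first = 0;
  for (std::size_t i = 0; i < backends.size(); ++i) {
    // the last backend takes the remainder, so every stream is assigned
    std::size_t count = (i + 1 == backends.size())
                            ? numStreams - first
                            : static_cast<std::size_t>(numStreams * backends[i].weight / total);
    for (std::size_t s = first; s < first + count; ++s)
      owner[s] = backends[i].name;
    first += count;
  }
  return owner;
}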
4 changes: 2 additions & 2 deletions Makefile
@@ -583,8 +583,8 @@ $(HWLOC_BASE):
external_alpaka: $(ALPAKA_BASE)

$(ALPAKA_BASE):
git clone git@github.com:alpaka-group/alpaka.git -b 0.9.0-rc1 $@
cd $@ && git checkout ebc1171feac21f1e21c49bcd9f053e7b01b584d0
git clone git@github.com:alpaka-group/alpaka.git -b develop $@
cd $@ && git checkout 879b95ffce2da499c9cc6e12d4cfd5545effa701

# Kokkos
external_kokkos: $(KOKKOS_LIB)
43 changes: 22 additions & 21 deletions src/alpaka/AlpakaCore/CachingAllocator.h
@@ -90,6 +90,11 @@ namespace cms::alpakatools {
using Event = alpaka::Event<Queue>; // the events used to synchronise the operations
using Buffer = alpaka::Buf<Device, std::byte, alpaka::DimInt<1u>, size_t>;

// The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU.
static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
"The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the "
"host CPU.");

struct CachedBytes {
size_t free = 0; // total bytes freed and cached on this device
size_t live = 0; // total bytes currently in use on this device
@@ -311,11 +316,24 @@ namespace cms::alpakatools {
return false;
}

Buffer allocateBuffer(size_t bytes, Queue const& queue) {
if constexpr (std::is_same_v<Device, alpaka::Dev<Queue>>) {
// allocate device memory
return alpaka::allocBuf<std::byte, size_t>(device_, bytes);
} else if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
// allocate pinned host memory
return alpaka::allocMappedBuf<std::byte, size_t>(device_, alpaka::getDev(queue), bytes);
} else {
// unsupported combination
static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
"The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be "
"the host CPU.");
}
}

void allocateNewBlock(BlockDescriptor& block) {
try {
// FIXME simplify alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes} to block.bytes ?
block.buffer =
alpaka::allocBuf<std::byte, size_t>(device_, alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes});
block.buffer = allocateBuffer(block.bytes, *block.queue);
} catch (std::runtime_error const& e) {
// the allocation attempt failed: free all cached blocks on the device and retry
if (debug_) {
@@ -329,25 +347,8 @@
freeAllCached();

// throw an exception if it fails again
block.buffer =
alpaka::allocBuf<std::byte, size_t>(device_, alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes});
}

// for host memory, pin the newly allocated block
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
if (not cms::alpakatools::devices<alpaka::PltfCudaRt>.empty()) {
// it is possible to initialise the CUDA runtime and call cudaHostRegister
// only if the system has at least one supported GPU
alpaka::prepareForAsyncCopy(*block.buffer);
}
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
if (not cms::alpakatools::devices<alpaka::PltfHipRt>.empty()) {
// it is possible to initialise the ROCm runtime and call hipHostRegister
// only if the system has at least one supported GPU
alpaka::prepareForAsyncCopy(*block.buffer);
block.buffer = allocateBuffer(block.bytes, *block.queue);
}
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

// create a new event associated to the "synchronisation device"
block.event = Event{block.device()};
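To illustrate the two allocation paths that allocateBuffer selects between, a standalone sketch assuming a CUDA backend and the alpaka develop revision pinned by this commit; the allocMappedBuf call mirrors the one in the diff above and its signature has changed in later alpaka versions:

#include <cstddef>

#include <alpaka/alpaka.hpp>

using Idx = std::size_t;

void allocationPaths() {
  auto const host = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
  auto const device = alpaka::getDevByIdx<alpaka::PltfCudaRt>(0u);
  alpaka::QueueCudaRtNonBlocking queue{device};

  // "memory device" same as the "synchronisation device": plain device memory
  auto deviceBuffer = alpaka::allocBuf<std::byte, Idx>(device, Idx{1024});

  // "memory device" is the host CPU: pinned host memory, mapped so that
  // asynchronous copies to/from the device owning `queue` are possible
  auto hostBuffer = alpaka::allocMappedBuf<std::byte, Idx>(host, alpaka::getDev(queue), Idx{1024});
}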
6 changes: 5 additions & 1 deletion src/alpaka/AlpakaCore/HostOnlyTask.h
@@ -24,6 +24,8 @@ namespace alpaka {
//! The CUDA async queue enqueue trait specialization for "safe tasks"
template <>
struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
using TApi = ApiCudaRt;

static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
@@ -42,7 +44,9 @@
//! The HIP async queue enqueue trait specialization for "safe tasks"
template <>
struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
static void HIPRT_CB callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
using TApi = ApiHipRt;

static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
(*pTask)();
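A usage sketch for these trait specialisations, assuming (not shown in this diff) that HostOnlyTask can be constructed from a host-side callable such as a std::function<void()>:

#include <iostream>

#include <alpaka/alpaka.hpp>

#include "AlpakaCore/HostOnlyTask.h"

void notifyWhenDrained(alpaka::QueueCudaRtNonBlocking& queue) {
  // the callable runs on a host thread once all work previously submitted to
  // `queue` has completed, without blocking the caller
  alpaka::enqueue(queue, alpaka::HostOnlyTask([] { std::cout << "stream drained" << std::endl; }));
}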
32 changes: 32 additions & 0 deletions src/alpaka/AlpakaCore/alpaka/initialise.cc
@@ -0,0 +1,32 @@
#include <iostream>

#include <alpaka/alpaka.hpp>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaDevices.h"
#include "AlpakaCore/initialise.h"
#include "Framework/demangle.h"

namespace cms::alpakatools {

template <typename TPlatform>
void initialise() {
constexpr const char* suffix[] = {"devices.", "device:", "devices:"};

if (devices<TPlatform>.empty()) {
devices<TPlatform> = enumerate<TPlatform>();
auto size = devices<TPlatform>.size();
//std::cout << edm::demangle<TPlatform> << " platform succesfully initialised." << std::endl;
std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl;
for (auto const& device : devices<TPlatform>) {
std::cout << " - " << alpaka::getName(device) << std::endl;
}
} else {
//std::cout << edm::demangle<TPlatform> << " platform already initialised." << std::endl;
}
}

// explicit template instantiation definition
template void initialise<ALPAKA_ACCELERATOR_NAMESPACE::Platform>();

} // namespace cms::alpakatools
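A usage sketch, not part of this commit: call initialise() once per backend that the build enables, before creating devices and queues. This assumes each backend namespace defines a Platform alias; those lines are collapsed in the alpakaConfig.h diff below.

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/initialise.h"

void initialiseAllBackends() {
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT
  cms::alpakatools::initialise<alpaka_serial_sync::Platform>();
#endif
#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT
  cms::alpakatools::initialise<alpaka_tbb_async::Platform>();
#endif
#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT
  cms::alpakatools::initialise<alpaka_cuda_async::Platform>();
#endif
#ifdef ALPAKA_ACC_GPU_HIP_PRESENT
  cms::alpakatools::initialise<alpaka_rocm_async::Platform>();
#endif
}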
26 changes: 12 additions & 14 deletions src/alpaka/AlpakaCore/alpakaConfig.h
@@ -1,9 +1,7 @@
#ifndef AlpakaCore_alpakaConfig_h
#define AlpakaCore_alpakaConfig_h

#include <type_traits>

#include <alpaka/alpaka.hpp>
#include "AlpakaCore/alpakaFwd.h"

namespace alpaka_common {

@@ -32,7 +30,7 @@ namespace alpaka_common {

// host types
using DevHost = alpaka::DevCpu;
using PltfHost = alpaka::Pltf<DevHost>;
using PltfHost = alpaka::PltfCpu;

} // namespace alpaka_common

@@ -44,7 +42,7 @@ namespace alpaka_common {
#define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \
DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name)

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT
namespace alpaka_cuda_async {
using namespace alpaka_common;

@@ -61,13 +59,13 @@

} // namespace alpaka_cuda_async

#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#endif // ALPAKA_ACC_GPU_CUDA_PRESENT

#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async
#endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_PRESENT
namespace alpaka_rocm_async {
using namespace alpaka_common;

@@ -84,13 +82,13 @@

} // namespace alpaka_rocm_async

#endif // ALPAKA_ACC_GPU_HIP_ENABLED
#endif // ALPAKA_ACC_GPU_HIP_PRESENT

#ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async
#endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT
namespace alpaka_serial_sync {
using namespace alpaka_common;

@@ -107,13 +105,13 @@

} // namespace alpaka_serial_sync

#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync
#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT
namespace alpaka_tbb_async {
using namespace alpaka_common;

@@ -130,13 +128,13 @@

} // namespace alpaka_tbb_async

#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async
#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND

#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT
namespace alpaka_omp2_async {
using namespace alpaka_common;

@@ -153,7 +151,7 @@ namespace alpaka_omp2_async {

} // namespace alpaka_omp2_async

#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT

#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_omp2_async
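The per-backend namespaces above let one source file be compiled once per enabled backend: the build defines exactly one *_BACKEND macro per compilation, so ALPAKA_ACCELERATOR_NAMESPACE resolves to the matching namespace. A sketch under those assumptions (the members of each namespace are collapsed in this diff):

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/initialise.h"

namespace ALPAKA_ACCELERATOR_NAMESPACE {

  // the same code compiles against alpaka_serial_sync, alpaka_tbb_async,
  // alpaka_cuda_async or alpaka_rocm_async depending on which *_BACKEND
  // macro the build defines for this translation unit
  void backendSpecificSetup() {
    cms::alpakatools::initialise<Platform>();
  }

}  // namespace ALPAKA_ACCELERATOR_NAMESPACE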
96 changes: 96 additions & 0 deletions src/alpaka/AlpakaCore/alpakaFwd.h
@@ -0,0 +1,96 @@
#ifndef AlpakaCore_alpakaFwd_h
#define AlpakaCore_alpakaFwd_h

#include <cstddef>
#include <cstdint>
#include <type_traits>

/**
* This file forward declares specific types defined in Alpaka
* (depending on the backend-enabling macros), so that these types
* are available throughout CMSSW without a direct dependence on
* Alpaka, avoiding the constraints that such a dependence would
* impose (primarily the need for the device compiler).
*
* This is a little bit brittle, but let's see how it goes.
*/
namespace alpaka {

// miscellanea
template <std::size_t N>
using DimInt = std::integral_constant<std::size_t, N>;

template <typename TDim, typename TVal>
class Vec;

template <typename TDim, typename TIdx>
class WorkDivMembers;

// API
struct ApiCudaRt;
struct ApiHipRt;

// Platforms
class PltfCpu;
template <typename TApi>
class PltfUniformCudaHipRt;
using PltfCudaRt = PltfUniformCudaHipRt<ApiCudaRt>;
using PltfHipRt = PltfUniformCudaHipRt<ApiHipRt>;

// Devices
class DevCpu;
template <typename TApi>
class DevUniformCudaHipRt;
using DevCudaRt = DevUniformCudaHipRt<ApiCudaRt>;
using DevHipRt = DevUniformCudaHipRt<ApiHipRt>;

// Queues
template <typename TDev>
class QueueGenericThreadsBlocking;
using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>;

template <typename TDev>
class QueueGenericThreadsNonBlocking;
using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>;

namespace uniform_cuda_hip::detail {
template <typename TApi, bool TBlocking>
class QueueUniformCudaHipRt;
}
using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, true>;
using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, false>;
using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, true>;
using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, false>;

// Events
template <typename TDev>
class EventGenericThreads;
using EventCpu = EventGenericThreads<DevCpu>;

template <typename TApi>
class EventUniformCudaHipRt;
using EventCudaRt = EventUniformCudaHipRt<ApiCudaRt>;
using EventHipRt = EventUniformCudaHipRt<ApiHipRt>;

// Accelerators
template <typename TApi, typename TDim, typename TIdx>
class AccGpuUniformCudaHipRt;

template <typename TDim, typename TIdx>
using AccGpuCudaRt = AccGpuUniformCudaHipRt<ApiCudaRt, TDim, TIdx>;

template <typename TDim, typename TIdx>
using AccGpuHipRt = AccGpuUniformCudaHipRt<ApiHipRt, TDim, TIdx>;

template <typename TDim, typename TIdx>
class AccCpuSerial;

template <typename TDim, typename TIdx>
class AccCpuTbbBlocks;

template <typename TDim, typename TIdx>
class AccCpuOmp2Blocks;

} // namespace alpaka

#endif // AlpakaCore_alpakaFwd_h
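A hypothetical example of what these forward declarations enable: a header that only names alpaka types can include the lightweight alpakaFwd.h instead of <alpaka/alpaka.hpp>, keeping the heavy dependency (and the device compiler) out of its includers. The header and function below are made up for illustration.

#ifndef HypotheticalTransferScheduler_h
#define HypotheticalTransferScheduler_h

#include <cstddef>

#include "AlpakaCore/alpakaFwd.h"

// alpaka::QueueCudaRtNonBlocking is only forward declared here, which is
// sufficient for declaring functions that take it by reference or pointer
void scheduleTransfer(alpaka::QueueCudaRtNonBlocking& queue, std::size_t bytes);

#endif  // HypotheticalTransferScheduler_h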