[alpaka] Support all alpaka backends at the same time (#357)
Update alpaka to the develop branch as of 2022.04.27 (879b95ffce2).
Use new pinned host memory functionality.
Add forward declaration for alpaka templates and types.
Support serial, TBB, CUDA and ROCm at the same time, with static splitting of event streams across multiple backends.
Autogenerate plugins.txt.
fwyzard authored May 1, 2022
2 parents 13cbf58 + f56ac8b commit 859ffeb
Showing 30 changed files with 788 additions and 457 deletions.
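The static splitting of event streams across backends mentioned in the commit message is implemented in files not excerpted below. Purely as an illustration, here is a minimal hypothetical C++ sketch of a weight-based static split; the struct, function and weighting scheme are assumptions, not code from this commit:

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical helper, not part of this commit: each enabled backend gets a
// fixed share of the event streams up front ("static" splitting), with no
// migration of streams between backends at run time.
// Assumes at least one backend with a positive weight.
struct BackendShare {
  std::string name;  // e.g. "serial", "tbb", "cuda", "rocm"
  float weight;      // relative share of streams for this backend
};

std::vector<std::string> splitStreams(std::size_t numStreams, std::vector<BackendShare> const& backends) {
  std::vector<std::string> owner(numStreams);
  float total = 0.f;
  for (auto const& b : backends)
    total += b.weight;
  std::size_t first = 0;
  for (std::size_t i = 0; i < backends.size(); ++i) {
    // the last backend takes the remainder, so every stream is assigned
    std::size_t count = (i + 1 == backends.size())
                            ? numStreams - first
                            : static_cast<std::size_t>(numStreams * backends[i].weight / total);
    for (std::size_t s = first; s < first + count; ++s)
      owner[s] = backends[i].name;
    first += count;
  }
  return owner;
}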
4 changes: 2 additions & 2 deletions Makefile
@@ -583,8 +583,8 @@ $(HWLOC_BASE):
external_alpaka: $(ALPAKA_BASE)

$(ALPAKA_BASE):
git clone git@github.com:alpaka-group/alpaka.git -b 0.9.0-rc1 $@
cd $@ && git checkout ebc1171feac21f1e21c49bcd9f053e7b01b584d0
git clone git@github.com:alpaka-group/alpaka.git -b develop $@
cd $@ && git checkout 879b95ffce2da499c9cc6e12d4cfd5545effa701

# Kokkos
external_kokkos: $(KOKKOS_LIB)
43 changes: 22 additions & 21 deletions src/alpaka/AlpakaCore/CachingAllocator.h
@@ -90,6 +90,11 @@ namespace cms::alpakatools {
using Event = alpaka::Event<Queue>; // the events used to synchronise the operations
using Buffer = alpaka::Buf<Device, std::byte, alpaka::DimInt<1u>, size_t>;

// The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU.
static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
"The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the "
"host CPU.");

struct CachedBytes {
size_t free = 0; // total bytes freed and cached on this device
size_t live = 0; // total bytes currently in use on this device
@@ -311,11 +316,24 @@ namespace cms::alpakatools {
return false;
}

Buffer allocateBuffer(size_t bytes, Queue const& queue) {
if constexpr (std::is_same_v<Device, alpaka::Dev<Queue>>) {
// allocate device memory
return alpaka::allocBuf<std::byte, size_t>(device_, bytes);
} else if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
// allocate pinned host memory
return alpaka::allocMappedBuf<std::byte, size_t>(device_, alpaka::getDev(queue), bytes);
} else {
// unsupported combination
static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
"The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be "
"the host CPU.");
}
}

void allocateNewBlock(BlockDescriptor& block) {
try {
// FIXME simplify alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes} to block.bytes ?
block.buffer =
alpaka::allocBuf<std::byte, size_t>(device_, alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes});
block.buffer = allocateBuffer(block.bytes, *block.queue);
} catch (std::runtime_error const& e) {
// the allocation attempt failed: free all cached blocks on the device and retry
if (debug_) {
@@ -329,25 +347,8 @@
freeAllCached();

// throw an exception if it fails again
block.buffer =
alpaka::allocBuf<std::byte, size_t>(device_, alpaka::Vec<alpaka::DimInt<1u>, size_t>{block.bytes});
}

// for host memory, pin the newly allocated block
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
if (not cms::alpakatools::devices<alpaka::PltfCudaRt>.empty()) {
// it is possible to initialise the CUDA runtime and call cudaHostRegister
// only if the system has at least one supported GPU
alpaka::prepareForAsyncCopy(*block.buffer);
}
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
if (not cms::alpakatools::devices<alpaka::PltfHipRt>.empty()) {
// it is possible to initialise the ROCm runtime and call hipHostRegister
// only if the system has at least one supported GPU
alpaka::prepareForAsyncCopy(*block.buffer);
block.buffer = allocateBuffer(block.bytes, *block.queue);
}
#endif // ALPAKA_ACC_GPU_HIP_ENABLED

// create a new event associated to the "synchronisation device"
block.event = Event{block.device()};
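To illustrate the two allocation paths that allocateBuffer selects between, a standalone sketch assuming a CUDA backend and the alpaka develop revision pinned by this commit; the allocMappedBuf call mirrors the one in the diff above and its signature has changed in later alpaka versions:

#include <cstddef>

#include <alpaka/alpaka.hpp>

using Idx = std::size_t;

void allocationPaths() {
  auto const host = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
  auto const device = alpaka::getDevByIdx<alpaka::PltfCudaRt>(0u);
  alpaka::QueueCudaRtNonBlocking queue{device};

  // "memory device" same as the "synchronisation device": plain device memory
  auto deviceBuffer = alpaka::allocBuf<std::byte, Idx>(device, Idx{1024});

  // "memory device" is the host CPU: pinned host memory, mapped so that
  // asynchronous copies to/from the device owning `queue` are possible
  auto hostBuffer = alpaka::allocMappedBuf<std::byte, Idx>(host, alpaka::getDev(queue), Idx{1024});
}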
6 changes: 5 additions & 1 deletion src/alpaka/AlpakaCore/HostOnlyTask.h
@@ -24,6 +24,8 @@ namespace alpaka {
//! The CUDA async queue enqueue trait specialization for "safe tasks"
template <>
struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
using TApi = ApiCudaRt;

static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
@@ -42,7 +44,9 @@
//! The HIP async queue enqueue trait specialization for "safe tasks"
template <>
struct Enqueue<QueueHipRtNonBlocking, HostOnlyTask> {
static void HIPRT_CB callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
using TApi = ApiHipRt;

static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) {
//ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
(*pTask)();
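A usage sketch for these trait specialisations, assuming (not shown in this diff) that HostOnlyTask can be constructed from a host-side callable such as a std::function<void()>:

#include <iostream>

#include <alpaka/alpaka.hpp>

#include "AlpakaCore/HostOnlyTask.h"

void notifyWhenDrained(alpaka::QueueCudaRtNonBlocking& queue) {
  // the callable runs on a host thread once all work previously submitted to
  // `queue` has completed, without blocking the caller
  alpaka::enqueue(queue, alpaka::HostOnlyTask([] { std::cout << "stream drained" << std::endl; }));
}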
32 changes: 32 additions & 0 deletions src/alpaka/AlpakaCore/alpaka/initialise.cc
@@ -0,0 +1,32 @@
#include <iostream>

#include <alpaka/alpaka.hpp>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/alpakaDevices.h"
#include "AlpakaCore/initialise.h"
#include "Framework/demangle.h"

namespace cms::alpakatools {

template <typename TPlatform>
void initialise() {
constexpr const char* suffix[] = {"devices.", "device:", "devices:"};

if (devices<TPlatform>.empty()) {
devices<TPlatform> = enumerate<TPlatform>();
auto size = devices<TPlatform>.size();
//std::cout << edm::demangle<TPlatform> << " platform succesfully initialised." << std::endl;
std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl;
for (auto const& device : devices<TPlatform>) {
std::cout << " - " << alpaka::getName(device) << std::endl;
}
} else {
//std::cout << edm::demangle<TPlatform> << " platform already initialised." << std::endl;
}
}

// explicit template instantiation definition
template void initialise<ALPAKA_ACCELERATOR_NAMESPACE::Platform>();

} // namespace cms::alpakatools
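A usage sketch, not part of this commit: call initialise() once per backend that the build enables, before creating devices and queues. This assumes each backend namespace defines a Platform alias; those lines are collapsed in the alpakaConfig.h diff below.

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/initialise.h"

void initialiseAllBackends() {
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT
  cms::alpakatools::initialise<alpaka_serial_sync::Platform>();
#endif
#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT
  cms::alpakatools::initialise<alpaka_tbb_async::Platform>();
#endif
#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT
  cms::alpakatools::initialise<alpaka_cuda_async::Platform>();
#endif
#ifdef ALPAKA_ACC_GPU_HIP_PRESENT
  cms::alpakatools::initialise<alpaka_rocm_async::Platform>();
#endif
}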
26 changes: 12 additions & 14 deletions src/alpaka/AlpakaCore/alpakaConfig.h
@@ -1,9 +1,7 @@
#ifndef AlpakaCore_alpakaConfig_h
#define AlpakaCore_alpakaConfig_h

#include <type_traits>

#include <alpaka/alpaka.hpp>
#include "AlpakaCore/alpakaFwd.h"

namespace alpaka_common {

@@ -32,7 +30,7 @@ namespace alpaka_common {

// host types
using DevHost = alpaka::DevCpu;
using PltfHost = alpaka::Pltf<DevHost>;
using PltfHost = alpaka::PltfCpu;

} // namespace alpaka_common

@@ -44,7 +42,7 @@ namespace alpaka_common {
#define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \
DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name)

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT
namespace alpaka_cuda_async {
using namespace alpaka_common;

@@ -61,13 +59,13 @@

} // namespace alpaka_cuda_async

#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#endif // ALPAKA_ACC_GPU_CUDA_PRESENT

#ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async
#endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_PRESENT
namespace alpaka_rocm_async {
using namespace alpaka_common;

@@ -84,13 +82,13 @@

} // namespace alpaka_rocm_async

#endif // ALPAKA_ACC_GPU_HIP_ENABLED
#endif // ALPAKA_ACC_GPU_HIP_PRESENT

#ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async
#endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT
namespace alpaka_serial_sync {
using namespace alpaka_common;

@@ -107,13 +105,13 @@

} // namespace alpaka_serial_sync

#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync
#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT
namespace alpaka_tbb_async {
using namespace alpaka_common;

@@ -130,13 +128,13 @@

} // namespace alpaka_tbb_async

#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT

#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async
#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND

#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT
namespace alpaka_omp2_async {
using namespace alpaka_common;

@@ -153,7 +151,7 @@ namespace alpaka_omp2_async {

} // namespace alpaka_omp2_async

#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED
#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT

#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND
#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_omp2_async
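The per-backend namespaces above let one source file be compiled once per enabled backend: the build defines exactly one *_BACKEND macro per compilation, so ALPAKA_ACCELERATOR_NAMESPACE resolves to the matching namespace. A sketch under those assumptions (the members of each namespace are collapsed in this diff):

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/initialise.h"

namespace ALPAKA_ACCELERATOR_NAMESPACE {

  // the same code compiles against alpaka_serial_sync, alpaka_tbb_async,
  // alpaka_cuda_async or alpaka_rocm_async depending on which *_BACKEND
  // macro the build defines for this translation unit
  void backendSpecificSetup() {
    cms::alpakatools::initialise<Platform>();
  }

}  // namespace ALPAKA_ACCELERATOR_NAMESPACE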
96 changes: 96 additions & 0 deletions src/alpaka/AlpakaCore/alpakaFwd.h
@@ -0,0 +1,96 @@
#ifndef AlpakaCore_alpakaFwd_h
#define AlpakaCore_alpakaFwd_h

#include <cstddef>
#include <cstdint>
#include <type_traits>

/**
* This file forward declares specific types defined in Alpaka
* (depending on the backend-enabling macros), so that these types
* are available throughout CMSSW without a direct dependence on
* Alpaka, avoiding the constraints that such a dependence would
* impose (primarily the need for the device compiler).
*
* This is a little bit brittle, but let's see how it goes.
*/
namespace alpaka {

// miscellanea
template <std::size_t N>
using DimInt = std::integral_constant<std::size_t, N>;

template <typename TDim, typename TVal>
class Vec;

template <typename TDim, typename TIdx>
class WorkDivMembers;

// API
struct ApiCudaRt;
struct ApiHipRt;

// Platforms
class PltfCpu;
template <typename TApi>
class PltfUniformCudaHipRt;
using PltfCudaRt = PltfUniformCudaHipRt<ApiCudaRt>;
using PltfHipRt = PltfUniformCudaHipRt<ApiHipRt>;

// Devices
class DevCpu;
template <typename TApi>
class DevUniformCudaHipRt;
using DevCudaRt = DevUniformCudaHipRt<ApiCudaRt>;
using DevHipRt = DevUniformCudaHipRt<ApiHipRt>;

// Queues
template <typename TDev>
class QueueGenericThreadsBlocking;
using QueueCpuBlocking = QueueGenericThreadsBlocking<DevCpu>;

template <typename TDev>
class QueueGenericThreadsNonBlocking;
using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking<DevCpu>;

namespace uniform_cuda_hip::detail {
template <typename TApi, bool TBlocking>
class QueueUniformCudaHipRt;
}
using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, true>;
using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, false>;
using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, true>;
using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, false>;

// Events
template <typename TDev>
class EventGenericThreads;
using EventCpu = EventGenericThreads<DevCpu>;

template <typename TApi>
class EventUniformCudaHipRt;
using EventCudaRt = EventUniformCudaHipRt<ApiCudaRt>;
using EventHipRt = EventUniformCudaHipRt<ApiHipRt>;

// Accelerators
template <typename TApi, typename TDim, typename TIdx>
class AccGpuUniformCudaHipRt;

template <typename TDim, typename TIdx>
using AccGpuCudaRt = AccGpuUniformCudaHipRt<ApiCudaRt, TDim, TIdx>;

template <typename TDim, typename TIdx>
using AccGpuHipRt = AccGpuUniformCudaHipRt<ApiHipRt, TDim, TIdx>;

template <typename TDim, typename TIdx>
class AccCpuSerial;

template <typename TDim, typename TIdx>
class AccCpuTbbBlocks;

template <typename TDim, typename TIdx>
class AccCpuOmp2Blocks;

} // namespace alpaka

#endif // AlpakaCore_alpakaFwd_h
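A hypothetical example of what these forward declarations enable: a header that only names alpaka types can include the lightweight alpakaFwd.h instead of <alpaka/alpaka.hpp>, keeping the heavy dependency (and the device compiler) out of its includers. The header and function below are made up for illustration.

#ifndef HypotheticalTransferScheduler_h
#define HypotheticalTransferScheduler_h

#include <cstddef>

#include "AlpakaCore/alpakaFwd.h"

// alpaka::QueueCudaRtNonBlocking is only forward declared here, which is
// sufficient for declaring functions that take it by reference or pointer
void scheduleTransfer(alpaka::QueueCudaRtNonBlocking& queue, std::size_t bytes);

#endif  // HypotheticalTransferScheduler_h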