diff --git a/Makefile b/Makefile index 73f8f1f63..41702ae50 100644 --- a/Makefile +++ b/Makefile @@ -583,8 +583,8 @@ $(HWLOC_BASE): external_alpaka: $(ALPAKA_BASE) $(ALPAKA_BASE): - git clone git@github.com:alpaka-group/alpaka.git -b 0.9.0-rc1 $@ - cd $@ && git checkout ebc1171feac21f1e21c49bcd9f053e7b01b584d0 + git clone git@github.com:alpaka-group/alpaka.git -b develop $@ + cd $@ && git checkout 879b95ffce2da499c9cc6e12d4cfd5545effa701 # Kokkos external_kokkos: $(KOKKOS_LIB) diff --git a/src/alpaka/AlpakaCore/CachingAllocator.h b/src/alpaka/AlpakaCore/CachingAllocator.h index 5411ecdc5..20b26ebe0 100644 --- a/src/alpaka/AlpakaCore/CachingAllocator.h +++ b/src/alpaka/AlpakaCore/CachingAllocator.h @@ -90,6 +90,11 @@ namespace cms::alpakatools { using Event = alpaka::Event; // the events used to synchronise the operations using Buffer = alpaka::Buf, size_t>; + // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU. + static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the " + "host CPU."); + struct CachedBytes { size_t free = 0; // total bytes freed and cached on this device size_t live = 0; // total bytes currently in use oin this device @@ -311,11 +316,24 @@ namespace cms::alpakatools { return false; } + Buffer allocateBuffer(size_t bytes, Queue const& queue) { + if constexpr (std::is_same_v>) { + // allocate device memory + return alpaka::allocBuf(device_, bytes); + } else if constexpr (std::is_same_v) { + // allocate pinned host memory + return alpaka::allocMappedBuf(device_, alpaka::getDev(queue), bytes); + } else { + // unsupported combination + static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be " + "the host CPU."); + } + } + void allocateNewBlock(BlockDescriptor& block) { try { - // FIXME simplify alpaka::Vec, size_t>{block.bytes} to block.bytes ? - block.buffer = - alpaka::allocBuf(device_, alpaka::Vec, size_t>{block.bytes}); + block.buffer = allocateBuffer(block.bytes, *block.queue); } catch (std::runtime_error const& e) { // the allocation attempt failed: free all cached blocks on the device and retry if (debug_) { @@ -329,25 +347,8 @@ namespace cms::alpakatools { freeAllCached(); // throw an exception if it fails again - block.buffer = - alpaka::allocBuf(device_, alpaka::Vec, size_t>{block.bytes}); - } - - // for host memory, pin the newly allocated block -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (not cms::alpakatools::devices.empty()) { - // it is possible to initialise the CUDA runtime and call cudaHostRegister - // only if the system has at least one supported GPU - alpaka::prepareForAsyncCopy(*block.buffer); - } -#endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - if (not cms::alpakatools::devices.empty()) { - // it is possible to initialise the ROCm runtime and call hipHostRegister - // only if the system has at least one supported GPU - alpaka::prepareForAsyncCopy(*block.buffer); + block.buffer = allocateBuffer(block.bytes, *block.queue); } -#endif // ALPAKA_ACC_GPU_HIP_ENABLED // create a new event associated to the "synchronisation device" block.event = Event{block.device()}; diff --git a/src/alpaka/AlpakaCore/HostOnlyTask.h b/src/alpaka/AlpakaCore/HostOnlyTask.h index dc9c5b7af..d010dc3eb 100644 --- a/src/alpaka/AlpakaCore/HostOnlyTask.h +++ b/src/alpaka/AlpakaCore/HostOnlyTask.h @@ -24,6 +24,8 @@ namespace alpaka { //! The CUDA async queue enqueue trait specialization for "safe tasks" template <> struct Enqueue { + using TApi = ApiCudaRt; + static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); std::unique_ptr pTask(static_cast(arg)); @@ -42,7 +44,9 @@ namespace alpaka { //! The HIP async queue enqueue trait specialization for "safe tasks" template <> struct Enqueue { - static void HIPRT_CB callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { + using TApi = ApiHipRt; + + static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); std::unique_ptr pTask(static_cast(arg)); (*pTask)(); diff --git a/src/alpaka/AlpakaCore/alpaka/initialise.cc b/src/alpaka/AlpakaCore/alpaka/initialise.cc new file mode 100644 index 000000000..690975af8 --- /dev/null +++ b/src/alpaka/AlpakaCore/alpaka/initialise.cc @@ -0,0 +1,32 @@ +#include + +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaDevices.h" +#include "AlpakaCore/initialise.h" +#include "Framework/demangle.h" + +namespace cms::alpakatools { + + template + void initialise() { + constexpr const char* suffix[] = {"devices.", "device:", "devices:"}; + + if (devices.empty()) { + devices = enumerate(); + auto size = devices.size(); + //std::cout << edm::demangle << " platform succesfully initialised." << std::endl; + std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl; + for (auto const& device : devices) { + std::cout << " - " << alpaka::getName(device) << std::endl; + } + } else { + //std::cout << edm::demangle << " platform already initialised." << std::endl; + } + } + + // explicit template instantiation definition + template void initialise(); + +} // namespace cms::alpakatools diff --git a/src/alpaka/AlpakaCore/alpakaConfig.h b/src/alpaka/AlpakaCore/alpakaConfig.h index 447ced874..ae4d6c058 100644 --- a/src/alpaka/AlpakaCore/alpakaConfig.h +++ b/src/alpaka/AlpakaCore/alpakaConfig.h @@ -1,9 +1,7 @@ #ifndef AlpakaCore_alpakaConfig_h #define AlpakaCore_alpakaConfig_h -#include - -#include +#include "AlpakaCore/alpakaFwd.h" namespace alpaka_common { @@ -32,7 +30,7 @@ namespace alpaka_common { // host types using DevHost = alpaka::DevCpu; - using PltfHost = alpaka::Pltf; + using PltfHost = alpaka::PltfCpu; } // namespace alpaka_common @@ -44,7 +42,7 @@ namespace alpaka_common { #define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \ DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name) -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT namespace alpaka_cuda_async { using namespace alpaka_common; @@ -61,13 +59,13 @@ namespace alpaka_cuda_async { } // namespace alpaka_cuda_async -#endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#endif // ALPAKA_ACC_GPU_CUDA_PRESENT #ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async #endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT namespace alpaka_rocm_async { using namespace alpaka_common; @@ -84,13 +82,13 @@ namespace alpaka_rocm_async { } // namespace alpaka_rocm_async -#endif // ALPAKA_ACC_GPU_HIP_ENABLED +#endif // ALPAKA_ACC_GPU_HIP_PRESENT #ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async #endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT namespace alpaka_serial_sync { using namespace alpaka_common; @@ -107,13 +105,13 @@ namespace alpaka_serial_sync { } // namespace alpaka_serial_sync -#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync #endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT namespace alpaka_tbb_async { using namespace alpaka_common; @@ -130,13 +128,13 @@ namespace alpaka_tbb_async { } // namespace alpaka_tbb_async -#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async #endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND -#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT namespace alpaka_omp2_async { using namespace alpaka_common; @@ -153,7 +151,7 @@ namespace alpaka_omp2_async { } // namespace alpaka_omp2_async -#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED +#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_omp2_async diff --git a/src/alpaka/AlpakaCore/alpakaFwd.h b/src/alpaka/AlpakaCore/alpakaFwd.h new file mode 100644 index 000000000..2234e440b --- /dev/null +++ b/src/alpaka/AlpakaCore/alpakaFwd.h @@ -0,0 +1,96 @@ +#ifndef AlpakaCore_alpakaFwd_h +#define AlpakaCore_alpakaFwd_h + +#include +#include +#include + +/** + * This file forward declares specific types defined in Alpaka + * (depending on the backend-enabling macros) so that these types + * would be available throughout CMSSW without a direct dependence on + * Alpaka in order to avoid the constraints that would impose + * (primarily the device compiler) + * + * This is a little bit brittle, but let's see how it goes. + */ +namespace alpaka { + + // miscellanea + template + using DimInt = std::integral_constant; + + template + class Vec; + + template + class WorkDivMembers; + + // API + struct ApiCudaRt; + struct ApiHipRt; + + // Platforms + class PltfCpu; + template + class PltfUniformCudaHipRt; + using PltfCudaRt = PltfUniformCudaHipRt; + using PltfHipRt = PltfUniformCudaHipRt; + + // Devices + class DevCpu; + template + class DevUniformCudaHipRt; + using DevCudaRt = DevUniformCudaHipRt; + using DevHipRt = DevUniformCudaHipRt; + + // Queues + template + class QueueGenericThreadsBlocking; + using QueueCpuBlocking = QueueGenericThreadsBlocking; + + template + class QueueGenericThreadsNonBlocking; + using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking; + + namespace uniform_cuda_hip::detail { + template + class QueueUniformCudaHipRt; + } + using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + + // Events + template + class EventGenericThreads; + using EventCpu = EventGenericThreads; + + template + class EventUniformCudaHipRt; + using EventCudaRt = EventUniformCudaHipRt; + using EventHipRt = EventUniformCudaHipRt; + + // Accelerators + template + class AccGpuUniformCudaHipRt; + + template + using AccGpuCudaRt = AccGpuUniformCudaHipRt; + + template + using AccGpuHipRt = AccGpuUniformCudaHipRt; + + template + class AccCpuSerial; + + template + class AccCpuTbbBlocks; + + template + class AccCpuOmp2Blocks; + +} // namespace alpaka + +#endif // AlpakaCore_alpakaFwd_h diff --git a/src/alpaka/AlpakaCore/alpakaMemory.h b/src/alpaka/AlpakaCore/alpakaMemory.h index 0ec3167c3..5ab53bd59 100644 --- a/src/alpaka/AlpakaCore/alpakaMemory.h +++ b/src/alpaka/AlpakaCore/alpakaMemory.h @@ -72,86 +72,59 @@ namespace cms::alpakatools { } // namespace detail - // helper function for pinning memory buffers - - template - void pin_buffer(TBuf& buffer) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (not cms::alpakatools::devices.empty()) { - // it is possible to initialise the CUDA runtime and call cudaHostRegister - // only if the system has at least one supported GPU - alpaka::prepareForAsyncCopy(buffer); - } -#endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - if (not cms::alpakatools::devices.empty()) { - // it is possible to initialise the ROCm runtime and call hipHostRegister - // only if the system has at least one supported GPU - alpaka::prepareForAsyncCopy(buffer); - } -#endif // ALPAKA_ACC_GPU_HIP_ENABLED - } - // scalar and 1-dimensional host buffers template using host_buffer = typename detail::buffer_type::type; - // non-cached, scalar and 1-dimensional host buffers - // the memory is pinned explicitly + // non-cached, non-pinned, scalar and 1-dimensional host buffers template std::enable_if_t, host_buffer> make_host_buffer() { - auto buffer = alpaka::allocBuf(host, Scalar{}); - pin_buffer(buffer); - return buffer; + return alpaka::allocBuf(host, Scalar{}); } template std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer(Extent extent) { - auto buffer = alpaka::allocBuf, Idx>(host, Vec1D{extent}); - pin_buffer(buffer); - return buffer; + return alpaka::allocBuf, Idx>(host, Vec1D{extent}); } template std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer() { - auto buffer = alpaka::allocBuf, Idx>(host, Vec1D{std::extent_v}); - pin_buffer(buffer); - return buffer; + return alpaka::allocBuf, Idx>(host, Vec1D{std::extent_v}); } - // potentially cached, scalar and 1-dimensional host buffers, associated to a work queue - // the memory is pinned by the caching allocator, or explicitly if it is not used + // potentially cached, pinned, scalar and 1-dimensional host buffers, associated to a work queue + // the memory is pinned according to the device associated to the queue template - std::enable_if_t, host_buffer> make_host_buffer(TQueue const& queue [[maybe_unused]]) { + std::enable_if_t, host_buffer> make_host_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(host, queue, Scalar{}); } else { - return make_host_buffer(); + return alpaka::allocMappedBuf(host, alpaka::getDev(queue), Scalar{}); } } template std::enable_if_t and not std::is_array_v>, host_buffer> - make_host_buffer(TQueue const& queue [[maybe_unused]], Extent extent) { + make_host_buffer(TQueue const& queue, Extent extent) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf, Idx>(host, queue, Vec1D{extent}); } else { - return make_host_buffer(extent); + return alpaka::allocMappedBuf, Idx>(host, alpaka::getDev(queue), Vec1D{extent}); } } template std::enable_if_t and not std::is_array_v>, host_buffer> - make_host_buffer(TQueue const& queue [[maybe_unused]]) { + make_host_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf, Idx>(host, queue, Vec1D{std::extent_v}); } else { - return make_host_buffer(); + return alpaka::allocMappedBuf, Idx>(host, alpaka::getDev(queue), Vec1D{std::extent_v}); } } diff --git a/src/alpaka/AlpakaCore/backend.h b/src/alpaka/AlpakaCore/backend.h index 387154013..8d58953cc 100644 --- a/src/alpaka/AlpakaCore/backend.h +++ b/src/alpaka/AlpakaCore/backend.h @@ -3,4 +3,15 @@ enum class Backend { SERIAL, TBB, CUDA, HIP }; +inline std::string const& name(Backend backend) { + static const std::string names[] = {"serial_sync", "tbb_async", "cuda_async", "rocm_async"}; + return names[static_cast(backend)]; +} + +template +inline T& operator<<(T& out, Backend backend) { + out << name(backend); + return out; +} + #endif // AlpakaCore_backend_h diff --git a/src/alpaka/AlpakaCore/initialise.h b/src/alpaka/AlpakaCore/initialise.h index 98ba966a1..1114ff5f9 100644 --- a/src/alpaka/AlpakaCore/initialise.h +++ b/src/alpaka/AlpakaCore/initialise.h @@ -1,31 +1,26 @@ #ifndef AlpakaCore_initialise_h #define AlpakaCore_initialise_h -#include - -#include - -#include "AlpakaCore/alpakaDevices.h" -#include "Framework/demangle.h" +#include "AlpakaCore/alpakaConfig.h" namespace cms::alpakatools { template - void initialise() { - constexpr const char* suffix[] = {"devices.", "device:", "devices:"}; - - if (devices.empty()) { - devices = enumerate(); - auto size = devices.size(); - //std::cout << edm::demangle << " platform succesfully initialised." << std::endl; - std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl; - for (auto const& device : devices) { - std::cout << " - " << alpaka::getName(device) << std::endl; - } - } else { - //std::cout << edm::demangle << " platform already initialised." << std::endl; - } - } + void initialise(); + + // explicit template instantiation declaration +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT + extern template void initialise(); +#endif } // namespace cms::alpakatools diff --git a/src/alpaka/Makefile b/src/alpaka/Makefile index bc56cab44..2cc62e515 100644 --- a/src/alpaka/Makefile +++ b/src/alpaka/Makefile @@ -31,12 +31,12 @@ EXE_DEP := $(EXE_OBJ:$.o=$.d) LIBNAMES := $(filter-out plugin-% bin test Makefile% plugins.txt%,$(wildcard *)) PLUGINNAMES := $(patsubst plugin-%,%,$(filter plugin-%,$(wildcard *))) -MY_CXXFLAGS := -I$(TARGET_DIR) -DSRC_DIR=$(TARGET_DIR) -DLIB_DIR=$(LIB_DIR)/$(TARGET_NAME) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_HOST_ONLY +MY_CXXFLAGS := -I$(TARGET_DIR) -DLIB_DIR=$(LIB_DIR)/$(TARGET_NAME) -DALPAKA_HOST_ONLY -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT -DALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT ifdef CUDA_BASE -MY_CXXFLAGS += -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY_MODE +MY_CXXFLAGS += -DALPAKA_ACC_GPU_CUDA_PRESENT -DALPAKA_ACC_GPU_CUDA_ONLY_MODE endif ifdef ROCM_BASE -MY_CXXFLAGS += -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ONLY_MODE +MY_CXXFLAGS += -DALPAKA_ACC_GPU_HIP_PRESENT -DALPAKA_ACC_GPU_HIP_ONLY_MODE endif MY_LDFLAGS := -ldl -Wl,-rpath,$(LIB_DIR)/$(TARGET_NAME) LIB_LDFLAGS := -L$(LIB_DIR)/$(TARGET_NAME) @@ -182,8 +182,12 @@ $(foreach test,$(TESTS_ROCM_EXE),$(eval $(call RUNTEST_template,$(test),amdgpu)) -include $(ALL_DEPENDS) # Build targets -$(TARGET): $(EXE_OBJ) $(LIBS) $(PLUGINS) | $(TESTS_EXE) - $(CXX) $(EXE_OBJ) $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) +$(LIB_DIR)/$(TARGET_NAME)/plugins.txt: $(PLUGINS) + nm -A -C -D -P --defined-only $(PLUGINS) | sed -n -e"s#$(LIB_DIR)/$(TARGET_NAME)/\(plugin\w\+\.so\): typeinfo for edm::\(PluginFactory\|ESPluginFactory\)::impl::Maker<\([A-Za-z0-9_:]\+\)> V .* .*#\3 \1#p" | sort > $@ + +$(TARGET): $(EXE_OBJ) $(LIBS) $(PLUGINS) $(LIB_DIR)/$(TARGET_NAME)/plugins.txt | $(TESTS_EXE) + # Link all libraries, also the "portable" ones + $(CXX) $(EXE_OBJ) $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_SERIAL_LDFLAGS) $($(lib)_TBB_LDFLAGS) $($(lib)_CUDA_LDFLAGS) $($(lib)_ROCM_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) define BUILD_template $(OBJ_DIR)/$(2)/%.cc.o: $(SRC_DIR)/$(2)/%.cc @@ -219,7 +223,7 @@ $$($(1)_ROCM_LIB): $$($(1)_ROCM_OBJ) $$(foreach dep,$(EXTERNAL_DEPENDS_H),$$($$( # Portable code, for serial backend $(OBJ_DIR)/$(2)/alpaka/%.cc.serial.o: $(SRC_DIR)/$(2)/alpaka/%.cc @[ -d $$(@D) ] || mkdir -p $$(@D) - $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD @cp $(OBJ_DIR)/$(2)/alpaka/$$*.cc.serial.d $(OBJ_DIR)/$(2)/alpaka/$$*.cc.serial.d.tmp; \ sed 's#\($(2)/alpaka/$$*\)\.o[ :]*#\1.o \1.d : #g' < $(OBJ_DIR)/$(2)/alpaka/$$*.cc.serial.d.tmp > $(OBJ_DIR)/$(2)/alpaka/$$*.cc.serial.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$$$//' \ @@ -229,7 +233,7 @@ $(OBJ_DIR)/$(2)/alpaka/%.cc.serial.o: $(SRC_DIR)/$(2)/alpaka/%.cc # Portable code, for TBB backend $(OBJ_DIR)/$(2)/alpaka/%.cc.tbb.o: $(SRC_DIR)/$(2)/alpaka/%.cc @[ -d $$(@D) ] || mkdir -p $$(@D) - $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD @cp $(OBJ_DIR)/$(2)/alpaka/$$*.cc.tbb.d $(OBJ_DIR)/$(2)/alpaka/$$*.cc.tbb.d.tmp; \ sed 's#\($(2)/alpaka/$$*\)\.o[ :]*#\1.o \1.d : #g' < $(OBJ_DIR)/$(2)/alpaka/$$*.cc.tbb.d.tmp > $(OBJ_DIR)/$(2)/alpaka/$$*.cc.tbb.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$$$//' \ @@ -240,7 +244,7 @@ $(OBJ_DIR)/$(2)/alpaka/%.cc.tbb.o: $(SRC_DIR)/$(2)/alpaka/%.cc ifdef CUDA_BASE $(OBJ_DIR)/$(2)/alpaka/%.cc.cuda.o: $(SRC_DIR)/$(2)/alpaka/%.cc @[ -d $$(@D) ] || mkdir -p $$(@D) - $(CUDA_NVCC) -x cu $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_NVCC_CXXFLAGS)) -c $$< -o $$@ -MMD + $(CUDA_NVCC) -x cu $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_NVCC_CXXFLAGS)) -c $$< -o $$@ -MMD $$($(1)_CUDADLINK): $$($(1)_CUOBJ) $(CUDA_NVCC) $(CUDA_DLINKFLAGS) $(CUDA_LDFLAGS) $$($(1)_CUOBJ) -o $$@ @@ -250,7 +254,7 @@ endif ifdef ROCM_BASE $(OBJ_DIR)/$(2)/alpaka/%.cc.rocm.o: $(SRC_DIR)/$(2)/alpaka/%.cc @[ -d $$(@D) ] || mkdir -p $$(@D) - $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD + $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD endif endef @@ -271,7 +275,7 @@ $(OBJ_DIR)/$(TARGET_NAME)/bin/%.cc.o: $(SRC_DIR)/$(TARGET_NAME)/bin/%.cc # Serial backend $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.serial.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD @cp $(@D)/$*.cc.serial.d $(@D)/$*.cc.serial.d.tmp; \ sed 's#\($(TARGET_NAME)/$*\)\.o[ :]*#\1.o \1.d : #g' < $(@D)/$*.cc.serial.d.tmp > $(@D)/$*.cc.serial.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ @@ -280,12 +284,12 @@ $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.serial.o: $(SRC_DIR)/$(TARGET_NAME)/t $(TEST_DIR)/$(TARGET_NAME)/%.serial: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.serial.o | $(LIBS) @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_SERIAL_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) # TBB backend $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.tbb.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD @cp $(@D)/$*.cc.tbb.d $(@D)/$*.cc.tbb.d.tmp; \ sed 's#\($(TARGET_NAME)/$*\)\.o[ :]*#\1.o \1.d : #g' < $(@D)/$*.cc.tbb.d.tmp > $(@D)/$*.cc.tbb.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ @@ -294,29 +298,29 @@ $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.tbb.o: $(SRC_DIR)/$(TARGET_NAME)/test $(TEST_DIR)/$(TARGET_NAME)/%.tbb: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.tbb.o | $(LIBS) @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_TBB_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) # CUDA backend ifdef CUDA_BASE $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.cuda.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(CUDA_NVCC) -x cu $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(CUDA_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_NVCC_CXXFLAGS)) -c $< -o $@ -MMD + $(CUDA_NVCC) -x cu $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(CUDA_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_NVCC_CXXFLAGS)) -c $< -o $@ -MMD $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cudadlink.o: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.o $(CUDA_NVCC) $(CUDA_DLINKFLAGS) $(CUDA_LDFLAGS) $< -o $@ $(TEST_DIR)/$(TARGET_NAME)/%.cuda: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.cuda.o $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.cuda.cudadlink.o | $(LIBS) @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_CUDA_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) endif # ROCm backend ifdef ROCM_BASE $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.rocm.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(ROCM_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(ROCM_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD $(TEST_DIR)/$(TARGET_NAME)/%.rocm: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.rocm.o | $(LIBS) @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_ROCM_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) endif diff --git a/src/alpaka/bin/EventProcessor.cc b/src/alpaka/bin/EventProcessor.cc index 3d7a3546f..7e84f5114 100644 --- a/src/alpaka/bin/EventProcessor.cc +++ b/src/alpaka/bin/EventProcessor.cc @@ -1,3 +1,4 @@ +#include #include #include #include @@ -13,7 +14,7 @@ namespace edm { EventProcessor::EventProcessor(int maxEvents, int runForMinutes, int numberOfStreams, - std::vector const& path, + Alternatives alternatives, std::vector const& esproducers, std::filesystem::path const& datadir, bool validation) @@ -24,9 +25,23 @@ namespace edm { esp->produce(eventSetup_); } + // normalise the total weight to the number of streams + float total = 0.; + for (auto const& alternative : alternatives) { + total += alternative.weight; + } //schedules_.reserve(numberOfStreams); - for (int i = 0; i < numberOfStreams; ++i) { - schedules_.emplace_back(registry_, pluginManager_, &source_, &eventSetup_, i, path); + float cumulative = 0.; + int lower_range = 0; + int upper_range = 0; + for (auto& alternative : alternatives) { + cumulative += alternative.weight; + lower_range = upper_range; + upper_range = static_cast(std::round(cumulative * numberOfStreams / total)); + for (int i = lower_range; i < upper_range; ++i) { + schedules_.emplace_back(registry_, pluginManager_, &source_, &eventSetup_, i, alternative.path); + } + streamsPerBackend_.emplace_back(alternative.backend, upper_range - lower_range); } } diff --git a/src/alpaka/bin/EventProcessor.h b/src/alpaka/bin/EventProcessor.h index 5fb20f0b9..e1ca9fe01 100644 --- a/src/alpaka/bin/EventProcessor.h +++ b/src/alpaka/bin/EventProcessor.h @@ -5,6 +5,7 @@ #include #include +#include "AlpakaCore/backend.h" #include "Framework/EventSetup.h" #include "PluginManager.h" @@ -12,18 +13,31 @@ #include "Source.h" namespace edm { + struct Alternative { + Alternative() = default; + Alternative(Backend backend, float weight, std::vector path) + : backend{backend}, weight{weight}, path{std::move(path)} {} + + Backend backend; + float weight; + std::vector path; + }; + + using Alternatives = std::vector; + class EventProcessor { public: explicit EventProcessor(int maxEvents, int runForMinutes, int numberOfStreams, - std::vector const& path, + Alternatives alternatives, std::vector const& esproducers, std::filesystem::path const& datadir, bool validation); int maxEvents() const { return source_.maxEvents(); } int processedEvents() const { return source_.processedEvents(); } + std::vector> const& backends() const { return streamsPerBackend_; } void runToCompletion(); @@ -35,6 +49,7 @@ namespace edm { Source source_; EventSetup eventSetup_; std::vector schedules_; + std::vector> streamsPerBackend_; }; } // namespace edm diff --git a/src/alpaka/bin/PluginManager.cc b/src/alpaka/bin/PluginManager.cc index 0fd46bbff..d1fa1d0f7 100644 --- a/src/alpaka/bin/PluginManager.cc +++ b/src/alpaka/bin/PluginManager.cc @@ -6,9 +6,6 @@ #include "PluginManager.h" -#ifndef SRC_DIR -#error "SRC_DIR undefined" -#endif #ifndef LIB_DIR #error "LIB_DIR undefined" #endif @@ -18,7 +15,7 @@ namespace edmplugin { PluginManager::PluginManager() { - std::ifstream pluginMap(STR(SRC_DIR) "/plugins.txt"); + std::ifstream pluginMap(STR(LIB_DIR) "/plugins.txt"); std::string plugin, library; while (pluginMap >> plugin >> library) { //std::cout << "plugin " << plugin << " in " << library << std::endl; diff --git a/src/alpaka/bin/main.cc b/src/alpaka/bin/main.cc index 6e863d8f3..9c997f4af 100644 --- a/src/alpaka/bin/main.cc +++ b/src/alpaka/bin/main.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -25,31 +26,31 @@ namespace { void print_help(std::string const& name) { std::cout << name << ": " -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT << "[--serial] " #endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT << "[--tbb] " #endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT << "[--cuda] " #endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT << "[--hip] " #endif << "[--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] " - "[--transfer] [--validation]\n\n" + "[--transfer] [--validation] [--histogram]\n\n" << "Options\n" -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT << " --serial Use CPU Serial backend\n" #endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT << " --tbb Use CPU TBB backend\n" #endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT << " --cuda Use CUDA backend\n" #endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT << " --hip Use ROCm/HIP backend\n" #endif << " --numberOfThreads Number of threads to use (default 1, use 0 to use all CPU cores)\n" @@ -64,13 +65,62 @@ namespace { << " --empty Ignore all producers (for testing only)\n" << std::endl; } - } // namespace +bool getOptionalArgument(std::vector const& args, std::vector::iterator& i, int& value) { + auto it = i; + ++it; + if (it == args.end()) { + return false; + } + try { + value = std::stoi(*it); + ++i; + return true; + } catch (...) { + return false; + } +} + +bool getOptionalArgument(std::vector const& args, std::vector::iterator& i, float& value) { + auto it = i; + ++it; + if (it == args.end()) { + return false; + } + try { + value = std::stof(*it); + ++i; + return true; + } catch (...) { + return false; + } +} + +bool getOptionalArgument(std::vector const& args, + std::vector::iterator& i, + std::filesystem::path& value) { + auto it = i; + ++it; + if (it == args.end()) { + return false; + } + value = *it; + return true; +} + +template +void getArgument(std::vector const& args, std::vector::iterator& i, T& value) { + if (not getOptionalArgument(args, i, value)) { + std::cerr << "error: " << *i << " expects an argument" << std::endl; + exit(EXIT_FAILURE); + } +} + int main(int argc, char** argv) { // Parse command line arguments std::vector args(argv, argv + argc); - std::vector backends; + std::unordered_map backends; int numberOfThreads = 1; int numberOfStreams = 0; int maxEvents = -1; @@ -84,37 +134,40 @@ int main(int argc, char** argv) { if (*i == "-h" or *i == "--help") { print_help(args.front()); return EXIT_SUCCESS; -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT } else if (*i == "--serial") { - backends.emplace_back(Backend::SERIAL); + float weight = 1.; + getOptionalArgument(args, i, weight); + backends.insert_or_assign(Backend::SERIAL, weight); #endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT } else if (*i == "--tbb") { - backends.emplace_back(Backend::TBB); + float weight = 1.; + getOptionalArgument(args, i, weight); + backends.insert_or_assign(Backend::TBB, weight); #endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT } else if (*i == "--cuda") { - backends.emplace_back(Backend::CUDA); + float weight = 1.; + getOptionalArgument(args, i, weight); + backends.insert_or_assign(Backend::CUDA, weight); #endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT } else if (*i == "--hip") { - backends.emplace_back(Backend::HIP); + float weight = 1.; + getOptionalArgument(args, i, weight); + backends.insert_or_assign(Backend::HIP, weight); #endif } else if (*i == "--numberOfThreads") { - ++i; - numberOfThreads = std::stoi(*i); + getArgument(args, i, numberOfThreads); } else if (*i == "--numberOfStreams") { - ++i; - numberOfStreams = std::stoi(*i); + getArgument(args, i, numberOfStreams); } else if (*i == "--maxEvents") { - ++i; - maxEvents = std::stoi(*i); + getArgument(args, i, maxEvents); } else if (*i == "--runForMinutes") { - ++i; - runForMinutes = std::stoi(*i); + getArgument(args, i, runForMinutes); } else if (*i == "--data") { - ++i; - datadir = *i; + getArgument(args, i, datadir); } else if (*i == "--transfer") { transfer = true; } else if (*i == "--validation") { @@ -150,76 +203,78 @@ int main(int argc, char** argv) { } // Initialiase the selected backends -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - if (std::find(backends.begin(), backends.end(), Backend::SERIAL) != backends.end()) { +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT + if (backends.find(Backend::SERIAL) != backends.end()) { cms::alpakatools::initialise(); } #endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - if (std::find(backends.begin(), backends.end(), Backend::TBB) != backends.end()) { +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT + if (backends.find(Backend::TBB) != backends.end()) { cms::alpakatools::initialise(); } #endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (std::find(backends.begin(), backends.end(), Backend::CUDA) != backends.end()) { +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT + if (backends.find(Backend::CUDA) != backends.end()) { cms::alpakatools::initialise(); } #endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - if (std::find(backends.begin(), backends.end(), Backend::HIP) != backends.end()) { +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT + if (backends.find(Backend::HIP) != backends.end()) { cms::alpakatools::initialise(); } #endif // Initialize EventProcessor - std::vector edmodules; std::vector esmodules; + edm::Alternatives alternatives; if (not empty) { + // host-only ESModules esmodules = {"BeamSpotESProducer", "SiPixelFedIdsESProducer"}; - auto addModules = [&](std::string const& accelerator_namespace, Backend backend) { - if (std::find(backends.begin(), backends.end(), backend) != backends.end()) { - edmodules.emplace_back(accelerator_namespace + "::" + "BeamSpotToAlpaka"); - edmodules.emplace_back(accelerator_namespace + "::" + "SiPixelRawToCluster"); - edmodules.emplace_back(accelerator_namespace + "::" + "SiPixelRecHitAlpaka"); - edmodules.emplace_back(accelerator_namespace + "::" + "CAHitNtupletAlpaka"); - edmodules.emplace_back(accelerator_namespace + "::" + "PixelVertexProducerAlpaka"); - if (transfer) { - edmodules.emplace_back(accelerator_namespace + "::" + "PixelTrackSoAFromAlpaka"); - edmodules.emplace_back(accelerator_namespace + "::" + "PixelVertexSoAFromAlpaka"); - } - if (validation) { - edmodules.emplace_back(accelerator_namespace + "::" + "CountValidator"); - } - if (histogram) { - edmodules.emplace_back(accelerator_namespace + "::" + "HistoValidator"); - } - esmodules.emplace_back(accelerator_namespace + "::" + "SiPixelFedCablingMapESProducer"); - esmodules.emplace_back(accelerator_namespace + "::" + "SiPixelGainCalibrationForHLTESProducer"); - esmodules.emplace_back(accelerator_namespace + "::" + "PixelCPEFastESProducer"); + for (auto const& [backend, weight] : backends) { + std::string prefix = "alpaka_" + name(backend) + "::"; + // "portable" ESModules + esmodules.emplace_back(prefix + "SiPixelFedCablingMapESProducer"); + esmodules.emplace_back(prefix + "SiPixelGainCalibrationForHLTESProducer"); + esmodules.emplace_back(prefix + "PixelCPEFastESProducer"); + // "portable" EDModules + std::vector edmodules; + edmodules.emplace_back(prefix + "BeamSpotToAlpaka"); + edmodules.emplace_back(prefix + "SiPixelRawToCluster"); + edmodules.emplace_back(prefix + "SiPixelRecHitAlpaka"); + edmodules.emplace_back(prefix + "CAHitNtupletAlpaka"); + edmodules.emplace_back(prefix + "PixelVertexProducerAlpaka"); + if (transfer) { + edmodules.emplace_back(prefix + "PixelTrackSoAFromAlpaka"); + edmodules.emplace_back(prefix + "PixelVertexSoAFromAlpaka"); } - }; -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - addModules("alpaka_serial_sync", Backend::SERIAL); -#endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - addModules("alpaka_tbb_async", Backend::TBB); -#endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - addModules("alpaka_cuda_async", Backend::CUDA); -#endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - addModules("alpaka_rocm_async", Backend::HIP); -#endif + if (validation) { + edmodules.emplace_back(prefix + "CountValidator"); + } + if (histogram) { + edmodules.emplace_back(prefix + "HistoValidator"); + } + alternatives.emplace_back(backend, weight, std::move(edmodules)); + } } edm::EventProcessor processor( - maxEvents, runForMinutes, numberOfStreams, std::move(edmodules), std::move(esmodules), datadir, validation); + maxEvents, runForMinutes, numberOfStreams, std::move(alternatives), std::move(esmodules), datadir, validation); if (runForMinutes < 0) { - std::cout << "Processing " << processor.maxEvents() << " events, of which " << numberOfStreams - << " concurrently, with " << numberOfThreads << " threads." << std::endl; + std::cout << "Processing " << processor.maxEvents() << " events,"; } else { - std::cout << "Processing for about " << runForMinutes << " minutes with " << numberOfStreams - << " concurrent events and " << numberOfThreads << " threads." << std::endl; + std::cout << "Processing for about " << runForMinutes << " minutes,"; + } + { + std::cout << " with " << numberOfStreams << " concurrent events ("; + bool need_comma = false; + for (auto const& [backend, streams] : processor.backends()) { + if (need_comma) { + std::cout << ", "; + } + std::cout << streams << " on " << backend; + need_comma = true; + } + std::cout << ") and " << numberOfThreads << " threads." << std::endl; } // Initialize the TBB thread pool diff --git a/src/alpaka/plugins.txt b/src/alpaka/plugins.txt deleted file mode 100644 index 35ed33a40..000000000 --- a/src/alpaka/plugins.txt +++ /dev/null @@ -1,50 +0,0 @@ -BeamSpotESProducer pluginBeamSpotProducer.so -alpaka_cuda_async::BeamSpotToAlpaka pluginBeamSpotProducer_cuda.so -alpaka_rocm_async::BeamSpotToAlpaka pluginBeamSpotProducer_rocm.so -alpaka_tbb_async::BeamSpotToAlpaka pluginBeamSpotProducer_tbb.so -alpaka_serial_sync::BeamSpotToAlpaka pluginBeamSpotProducer_serial.so -alpaka_cuda_async::CAHitNtupletAlpaka pluginPixelTriplets_cuda.so -alpaka_rocm_async::CAHitNtupletAlpaka pluginPixelTriplets_rocm.so -alpaka_tbb_async::CAHitNtupletAlpaka pluginPixelTriplets_tbb.so -alpaka_serial_sync::CAHitNtupletAlpaka pluginPixelTriplets_serial.so -alpaka_cuda_async::PixelTrackSoAFromAlpaka pluginPixelTrackFitting_cuda.so -alpaka_rocm_async::PixelTrackSoAFromAlpaka pluginPixelTrackFitting_rocm.so -alpaka_tbb_async::PixelTrackSoAFromAlpaka pluginPixelTrackFitting_tbb.so -alpaka_serial_sync::PixelTrackSoAFromAlpaka pluginPixelTrackFitting_serial.so -alpaka_cuda_async::PixelVertexProducerAlpaka pluginPixelVertexFinding_cuda.so -alpaka_rocm_async::PixelVertexProducerAlpaka pluginPixelVertexFinding_rocm.so -alpaka_tbb_async::PixelVertexProducerAlpaka pluginPixelVertexFinding_tbb.so -alpaka_serial_sync::PixelVertexProducerAlpaka pluginPixelVertexFinding_serial.so -alpaka_cuda_async::PixelVertexSoAFromAlpaka pluginPixelVertexFinding_cuda.so -alpaka_rocm_async::PixelVertexSoAFromAlpaka pluginPixelVertexFinding_rocm.so -alpaka_tbb_async::PixelVertexSoAFromAlpaka pluginPixelVertexFinding_tbb.so -alpaka_serial_sync::PixelVertexSoAFromAlpaka pluginPixelVertexFinding_serial.so -alpaka_cuda_async::SiPixelRawToCluster pluginSiPixelClusterizer_cuda.so -alpaka_rocm_async::SiPixelRawToCluster pluginSiPixelClusterizer_rocm.so -alpaka_tbb_async::SiPixelRawToCluster pluginSiPixelClusterizer_tbb.so -alpaka_serial_sync::SiPixelRawToCluster pluginSiPixelClusterizer_serial.so -SiPixelFedIdsESProducer pluginSiPixelClusterizer.so -alpaka_cuda_async::SiPixelFedCablingMapESProducer pluginSiPixelClusterizer_cuda.so -alpaka_rocm_async::SiPixelFedCablingMapESProducer pluginSiPixelClusterizer_rocm.so -alpaka_tbb_async::SiPixelFedCablingMapESProducer pluginSiPixelClusterizer_tbb.so -alpaka_serial_sync::SiPixelFedCablingMapESProducer pluginSiPixelClusterizer_serial.so -alpaka_cuda_async::SiPixelGainCalibrationForHLTESProducer pluginSiPixelClusterizer_cuda.so -alpaka_rocm_async::SiPixelGainCalibrationForHLTESProducer pluginSiPixelClusterizer_rocm.so -alpaka_tbb_async::SiPixelGainCalibrationForHLTESProducer pluginSiPixelClusterizer_tbb.so -alpaka_serial_sync::SiPixelGainCalibrationForHLTESProducer pluginSiPixelClusterizer_serial.so -alpaka_cuda_async::PixelCPEFastESProducer pluginSiPixelRecHits_cuda.so -alpaka_rocm_async::PixelCPEFastESProducer pluginSiPixelRecHits_rocm.so -alpaka_tbb_async::PixelCPEFastESProducer pluginSiPixelRecHits_tbb.so -alpaka_serial_sync::PixelCPEFastESProducer pluginSiPixelRecHits_serial.so -alpaka_cuda_async::SiPixelRecHitAlpaka pluginSiPixelRecHits_cuda.so -alpaka_rocm_async::SiPixelRecHitAlpaka pluginSiPixelRecHits_rocm.so -alpaka_tbb_async::SiPixelRecHitAlpaka pluginSiPixelRecHits_tbb.so -alpaka_serial_sync::SiPixelRecHitAlpaka pluginSiPixelRecHits_serial.so -alpaka_cuda_async::CountValidator pluginValidation_cuda.so -alpaka_rocm_async::CountValidator pluginValidation_rocm.so -alpaka_tbb_async::CountValidator pluginValidation_tbb.so -alpaka_serial_sync::CountValidator pluginValidation_serial.so -alpaka_cuda_async::HistoValidator pluginValidation_cuda.so -alpaka_rocm_async::HistoValidator pluginValidation_rocm.so -alpaka_tbb_async::HistoValidator pluginValidation_tbb.so -alpaka_serial_sync::HistoValidator pluginValidation_serial.so diff --git a/src/alpakatest/AlpakaCore/CachingAllocator.h b/src/alpakatest/AlpakaCore/CachingAllocator.h index 5411ecdc5..20b26ebe0 100644 --- a/src/alpakatest/AlpakaCore/CachingAllocator.h +++ b/src/alpakatest/AlpakaCore/CachingAllocator.h @@ -90,6 +90,11 @@ namespace cms::alpakatools { using Event = alpaka::Event; // the events used to synchronise the operations using Buffer = alpaka::Buf, size_t>; + // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU. + static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the " + "host CPU."); + struct CachedBytes { size_t free = 0; // total bytes freed and cached on this device size_t live = 0; // total bytes currently in use oin this device @@ -311,11 +316,24 @@ namespace cms::alpakatools { return false; } + Buffer allocateBuffer(size_t bytes, Queue const& queue) { + if constexpr (std::is_same_v>) { + // allocate device memory + return alpaka::allocBuf(device_, bytes); + } else if constexpr (std::is_same_v) { + // allocate pinned host memory + return alpaka::allocMappedBuf(device_, alpaka::getDev(queue), bytes); + } else { + // unsupported combination + static_assert(std::is_same_v> or std::is_same_v, + "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be " + "the host CPU."); + } + } + void allocateNewBlock(BlockDescriptor& block) { try { - // FIXME simplify alpaka::Vec, size_t>{block.bytes} to block.bytes ? - block.buffer = - alpaka::allocBuf(device_, alpaka::Vec, size_t>{block.bytes}); + block.buffer = allocateBuffer(block.bytes, *block.queue); } catch (std::runtime_error const& e) { // the allocation attempt failed: free all cached blocks on the device and retry if (debug_) { @@ -329,25 +347,8 @@ namespace cms::alpakatools { freeAllCached(); // throw an exception if it fails again - block.buffer = - alpaka::allocBuf(device_, alpaka::Vec, size_t>{block.bytes}); - } - - // for host memory, pin the newly allocated block -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (not cms::alpakatools::devices.empty()) { - // it is possible to initialise the CUDA runtime and call cudaHostRegister - // only if the system has at least one supported GPU - alpaka::prepareForAsyncCopy(*block.buffer); - } -#endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - if (not cms::alpakatools::devices.empty()) { - // it is possible to initialise the ROCm runtime and call hipHostRegister - // only if the system has at least one supported GPU - alpaka::prepareForAsyncCopy(*block.buffer); + block.buffer = allocateBuffer(block.bytes, *block.queue); } -#endif // ALPAKA_ACC_GPU_HIP_ENABLED // create a new event associated to the "synchronisation device" block.event = Event{block.device()}; diff --git a/src/alpakatest/AlpakaCore/HostOnlyTask.h b/src/alpakatest/AlpakaCore/HostOnlyTask.h index dc9c5b7af..d010dc3eb 100644 --- a/src/alpakatest/AlpakaCore/HostOnlyTask.h +++ b/src/alpakatest/AlpakaCore/HostOnlyTask.h @@ -24,6 +24,8 @@ namespace alpaka { //! The CUDA async queue enqueue trait specialization for "safe tasks" template <> struct Enqueue { + using TApi = ApiCudaRt; + static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); std::unique_ptr pTask(static_cast(arg)); @@ -42,7 +44,9 @@ namespace alpaka { //! The HIP async queue enqueue trait specialization for "safe tasks" template <> struct Enqueue { - static void HIPRT_CB callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { + using TApi = ApiHipRt; + + static void callback(hipStream_t /*queue*/, hipError_t /*status*/, void* arg) { //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); std::unique_ptr pTask(static_cast(arg)); (*pTask)(); diff --git a/src/alpakatest/AlpakaCore/alpaka/initialise.cc b/src/alpakatest/AlpakaCore/alpaka/initialise.cc new file mode 100644 index 000000000..690975af8 --- /dev/null +++ b/src/alpakatest/AlpakaCore/alpaka/initialise.cc @@ -0,0 +1,32 @@ +#include + +#include + +#include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/alpakaDevices.h" +#include "AlpakaCore/initialise.h" +#include "Framework/demangle.h" + +namespace cms::alpakatools { + + template + void initialise() { + constexpr const char* suffix[] = {"devices.", "device:", "devices:"}; + + if (devices.empty()) { + devices = enumerate(); + auto size = devices.size(); + //std::cout << edm::demangle << " platform succesfully initialised." << std::endl; + std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl; + for (auto const& device : devices) { + std::cout << " - " << alpaka::getName(device) << std::endl; + } + } else { + //std::cout << edm::demangle << " platform already initialised." << std::endl; + } + } + + // explicit template instantiation definition + template void initialise(); + +} // namespace cms::alpakatools diff --git a/src/alpakatest/AlpakaCore/alpakaConfig.h b/src/alpakatest/AlpakaCore/alpakaConfig.h index 447ced874..ae4d6c058 100644 --- a/src/alpakatest/AlpakaCore/alpakaConfig.h +++ b/src/alpakatest/AlpakaCore/alpakaConfig.h @@ -1,9 +1,7 @@ #ifndef AlpakaCore_alpakaConfig_h #define AlpakaCore_alpakaConfig_h -#include - -#include +#include "AlpakaCore/alpakaFwd.h" namespace alpaka_common { @@ -32,7 +30,7 @@ namespace alpaka_common { // host types using DevHost = alpaka::DevCpu; - using PltfHost = alpaka::Pltf; + using PltfHost = alpaka::PltfCpu; } // namespace alpaka_common @@ -44,7 +42,7 @@ namespace alpaka_common { #define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \ DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name) -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT namespace alpaka_cuda_async { using namespace alpaka_common; @@ -61,13 +59,13 @@ namespace alpaka_cuda_async { } // namespace alpaka_cuda_async -#endif // ALPAKA_ACC_GPU_CUDA_ENABLED +#endif // ALPAKA_ACC_GPU_CUDA_PRESENT #ifdef ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_cuda_async #endif // ALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT namespace alpaka_rocm_async { using namespace alpaka_common; @@ -84,13 +82,13 @@ namespace alpaka_rocm_async { } // namespace alpaka_rocm_async -#endif // ALPAKA_ACC_GPU_HIP_ENABLED +#endif // ALPAKA_ACC_GPU_HIP_PRESENT #ifdef ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async #endif // ALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT namespace alpaka_serial_sync { using namespace alpaka_common; @@ -107,13 +105,13 @@ namespace alpaka_serial_sync { } // namespace alpaka_serial_sync -#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_serial_sync #endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT namespace alpaka_tbb_async { using namespace alpaka_common; @@ -130,13 +128,13 @@ namespace alpaka_tbb_async { } // namespace alpaka_tbb_async -#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_tbb_async #endif // ALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND -#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT namespace alpaka_omp2_async { using namespace alpaka_common; @@ -153,7 +151,7 @@ namespace alpaka_omp2_async { } // namespace alpaka_omp2_async -#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED +#endif // ALPAKA_ACC_CPU_B_OMP2_T_SEQ_PRESENT #ifdef ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ASYNC_BACKEND #define ALPAKA_ACCELERATOR_NAMESPACE alpaka_omp2_async diff --git a/src/alpakatest/AlpakaCore/alpakaFwd.h b/src/alpakatest/AlpakaCore/alpakaFwd.h new file mode 100644 index 000000000..2234e440b --- /dev/null +++ b/src/alpakatest/AlpakaCore/alpakaFwd.h @@ -0,0 +1,96 @@ +#ifndef AlpakaCore_alpakaFwd_h +#define AlpakaCore_alpakaFwd_h + +#include +#include +#include + +/** + * This file forward declares specific types defined in Alpaka + * (depending on the backend-enabling macros) so that these types + * would be available throughout CMSSW without a direct dependence on + * Alpaka in order to avoid the constraints that would impose + * (primarily the device compiler) + * + * This is a little bit brittle, but let's see how it goes. + */ +namespace alpaka { + + // miscellanea + template + using DimInt = std::integral_constant; + + template + class Vec; + + template + class WorkDivMembers; + + // API + struct ApiCudaRt; + struct ApiHipRt; + + // Platforms + class PltfCpu; + template + class PltfUniformCudaHipRt; + using PltfCudaRt = PltfUniformCudaHipRt; + using PltfHipRt = PltfUniformCudaHipRt; + + // Devices + class DevCpu; + template + class DevUniformCudaHipRt; + using DevCudaRt = DevUniformCudaHipRt; + using DevHipRt = DevUniformCudaHipRt; + + // Queues + template + class QueueGenericThreadsBlocking; + using QueueCpuBlocking = QueueGenericThreadsBlocking; + + template + class QueueGenericThreadsNonBlocking; + using QueueCpuNonBlocking = QueueGenericThreadsNonBlocking; + + namespace uniform_cuda_hip::detail { + template + class QueueUniformCudaHipRt; + } + using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + + // Events + template + class EventGenericThreads; + using EventCpu = EventGenericThreads; + + template + class EventUniformCudaHipRt; + using EventCudaRt = EventUniformCudaHipRt; + using EventHipRt = EventUniformCudaHipRt; + + // Accelerators + template + class AccGpuUniformCudaHipRt; + + template + using AccGpuCudaRt = AccGpuUniformCudaHipRt; + + template + using AccGpuHipRt = AccGpuUniformCudaHipRt; + + template + class AccCpuSerial; + + template + class AccCpuTbbBlocks; + + template + class AccCpuOmp2Blocks; + +} // namespace alpaka + +#endif // AlpakaCore_alpakaFwd_h diff --git a/src/alpakatest/AlpakaCore/alpakaMemory.h b/src/alpakatest/AlpakaCore/alpakaMemory.h index 0ec3167c3..5ab53bd59 100644 --- a/src/alpakatest/AlpakaCore/alpakaMemory.h +++ b/src/alpakatest/AlpakaCore/alpakaMemory.h @@ -72,86 +72,59 @@ namespace cms::alpakatools { } // namespace detail - // helper function for pinning memory buffers - - template - void pin_buffer(TBuf& buffer) { -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (not cms::alpakatools::devices.empty()) { - // it is possible to initialise the CUDA runtime and call cudaHostRegister - // only if the system has at least one supported GPU - alpaka::prepareForAsyncCopy(buffer); - } -#endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - if (not cms::alpakatools::devices.empty()) { - // it is possible to initialise the ROCm runtime and call hipHostRegister - // only if the system has at least one supported GPU - alpaka::prepareForAsyncCopy(buffer); - } -#endif // ALPAKA_ACC_GPU_HIP_ENABLED - } - // scalar and 1-dimensional host buffers template using host_buffer = typename detail::buffer_type::type; - // non-cached, scalar and 1-dimensional host buffers - // the memory is pinned explicitly + // non-cached, non-pinned, scalar and 1-dimensional host buffers template std::enable_if_t, host_buffer> make_host_buffer() { - auto buffer = alpaka::allocBuf(host, Scalar{}); - pin_buffer(buffer); - return buffer; + return alpaka::allocBuf(host, Scalar{}); } template std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer(Extent extent) { - auto buffer = alpaka::allocBuf, Idx>(host, Vec1D{extent}); - pin_buffer(buffer); - return buffer; + return alpaka::allocBuf, Idx>(host, Vec1D{extent}); } template std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer() { - auto buffer = alpaka::allocBuf, Idx>(host, Vec1D{std::extent_v}); - pin_buffer(buffer); - return buffer; + return alpaka::allocBuf, Idx>(host, Vec1D{std::extent_v}); } - // potentially cached, scalar and 1-dimensional host buffers, associated to a work queue - // the memory is pinned by the caching allocator, or explicitly if it is not used + // potentially cached, pinned, scalar and 1-dimensional host buffers, associated to a work queue + // the memory is pinned according to the device associated to the queue template - std::enable_if_t, host_buffer> make_host_buffer(TQueue const& queue [[maybe_unused]]) { + std::enable_if_t, host_buffer> make_host_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(host, queue, Scalar{}); } else { - return make_host_buffer(); + return alpaka::allocMappedBuf(host, alpaka::getDev(queue), Scalar{}); } } template std::enable_if_t and not std::is_array_v>, host_buffer> - make_host_buffer(TQueue const& queue [[maybe_unused]], Extent extent) { + make_host_buffer(TQueue const& queue, Extent extent) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf, Idx>(host, queue, Vec1D{extent}); } else { - return make_host_buffer(extent); + return alpaka::allocMappedBuf, Idx>(host, alpaka::getDev(queue), Vec1D{extent}); } } template std::enable_if_t and not std::is_array_v>, host_buffer> - make_host_buffer(TQueue const& queue [[maybe_unused]]) { + make_host_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf, Idx>(host, queue, Vec1D{std::extent_v}); } else { - return make_host_buffer(); + return alpaka::allocMappedBuf, Idx>(host, alpaka::getDev(queue), Vec1D{std::extent_v}); } } diff --git a/src/alpakatest/AlpakaCore/backend.h b/src/alpakatest/AlpakaCore/backend.h index 387154013..8d58953cc 100644 --- a/src/alpakatest/AlpakaCore/backend.h +++ b/src/alpakatest/AlpakaCore/backend.h @@ -3,4 +3,15 @@ enum class Backend { SERIAL, TBB, CUDA, HIP }; +inline std::string const& name(Backend backend) { + static const std::string names[] = {"serial_sync", "tbb_async", "cuda_async", "rocm_async"}; + return names[static_cast(backend)]; +} + +template +inline T& operator<<(T& out, Backend backend) { + out << name(backend); + return out; +} + #endif // AlpakaCore_backend_h diff --git a/src/alpakatest/AlpakaCore/initialise.h b/src/alpakatest/AlpakaCore/initialise.h index 98ba966a1..1114ff5f9 100644 --- a/src/alpakatest/AlpakaCore/initialise.h +++ b/src/alpakatest/AlpakaCore/initialise.h @@ -1,31 +1,26 @@ #ifndef AlpakaCore_initialise_h #define AlpakaCore_initialise_h -#include - -#include - -#include "AlpakaCore/alpakaDevices.h" -#include "Framework/demangle.h" +#include "AlpakaCore/alpakaConfig.h" namespace cms::alpakatools { template - void initialise() { - constexpr const char* suffix[] = {"devices.", "device:", "devices:"}; - - if (devices.empty()) { - devices = enumerate(); - auto size = devices.size(); - //std::cout << edm::demangle << " platform succesfully initialised." << std::endl; - std::cout << "Found " << size << " " << suffix[size < 2 ? size : 2] << std::endl; - for (auto const& device : devices) { - std::cout << " - " << alpaka::getName(device) << std::endl; - } - } else { - //std::cout << edm::demangle << " platform already initialised." << std::endl; - } - } + void initialise(); + + // explicit template instantiation declaration +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT + extern template void initialise(); +#endif +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT + extern template void initialise(); +#endif } // namespace cms::alpakatools diff --git a/src/alpakatest/Makefile b/src/alpakatest/Makefile index 281871c13..c39793a16 100644 --- a/src/alpakatest/Makefile +++ b/src/alpakatest/Makefile @@ -31,12 +31,12 @@ EXE_DEP := $(EXE_OBJ:$.o=$.d) LIBNAMES := $(filter-out plugin-% bin test Makefile% plugins.txt%,$(wildcard *)) PLUGINNAMES := $(patsubst plugin-%,%,$(filter plugin-%,$(wildcard *))) -MY_CXXFLAGS := -I$(TARGET_DIR) -DSRC_DIR=$(TARGET_DIR) -DLIB_DIR=$(LIB_DIR)/$(TARGET_NAME) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_HOST_ONLY +MY_CXXFLAGS := -I$(TARGET_DIR) -DLIB_DIR=$(LIB_DIR)/$(TARGET_NAME) -DALPAKA_HOST_ONLY -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT -DALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT ifdef CUDA_BASE -MY_CXXFLAGS += -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY_MODE +MY_CXXFLAGS += -DALPAKA_ACC_GPU_CUDA_PRESENT -DALPAKA_ACC_GPU_CUDA_ONLY_MODE endif ifdef ROCM_BASE -MY_CXXFLAGS += -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ONLY_MODE +MY_CXXFLAGS += -DALPAKA_ACC_GPU_HIP_PRESENT -DALPAKA_ACC_GPU_HIP_ONLY_MODE endif MY_LDFLAGS := -ldl -Wl,-rpath,$(LIB_DIR)/$(TARGET_NAME) LIB_LDFLAGS := -L$(LIB_DIR)/$(TARGET_NAME) @@ -182,8 +182,12 @@ $(foreach test,$(TESTS_ROCM_EXE),$(eval $(call RUNTEST_template,$(test),amdgpu)) -include $(ALL_DEPENDS) # Build targets -$(TARGET): $(EXE_OBJ) $(LIBS) $(PLUGINS) | $(TESTS_EXE) - $(CXX) $(EXE_OBJ) $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) +$(LIB_DIR)/$(TARGET_NAME)/plugins.txt: $(PLUGINS) + nm -A -C -D -P --defined-only $(PLUGINS) | sed -n -e"s#$(LIB_DIR)/$(TARGET_NAME)/\(plugin\w\+\.so\): typeinfo for edm::\(PluginFactory\|ESPluginFactory\)::impl::Maker<\([A-Za-z0-9_:]\+\)> V .* .*#\3 \1#p" | sort > $@ + +$(TARGET): $(EXE_OBJ) $(LIBS) $(PLUGINS) $(LIB_DIR)/$(TARGET_NAME)/plugins.txt | $(TESTS_EXE) + # Link all libraries, also the "portable" ones + $(CXX) $(EXE_OBJ) $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_SERIAL_LDFLAGS) $($(lib)_TBB_LDFLAGS) $($(lib)_CUDA_LDFLAGS) $($(lib)_ROCM_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) define BUILD_template $(OBJ_DIR)/$(2)/%.cc.o: $(SRC_DIR)/$(2)/%.cc @@ -219,7 +223,7 @@ $$($(1)_ROCM_LIB): $$($(1)_ROCM_OBJ) $$(foreach dep,$(EXTERNAL_DEPENDS_H),$$($$( # Portable code, for serial backend $(OBJ_DIR)/$(2)/alpaka/%.cc.serial.o: $(SRC_DIR)/$(2)/alpaka/%.cc @[ -d $$(@D) ] || mkdir -p $$(@D) - $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD @cp $(OBJ_DIR)/$(2)/alpaka/$$*.cc.serial.d $(OBJ_DIR)/$(2)/alpaka/$$*.cc.serial.d.tmp; \ sed 's#\($(2)/alpaka/$$*\)\.o[ :]*#\1.o \1.d : #g' < $(OBJ_DIR)/$(2)/alpaka/$$*.cc.serial.d.tmp > $(OBJ_DIR)/$(2)/alpaka/$$*.cc.serial.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$$$//' \ @@ -229,7 +233,7 @@ $(OBJ_DIR)/$(2)/alpaka/%.cc.serial.o: $(SRC_DIR)/$(2)/alpaka/%.cc # Portable code, for TBB backend $(OBJ_DIR)/$(2)/alpaka/%.cc.tbb.o: $(SRC_DIR)/$(2)/alpaka/%.cc @[ -d $$(@D) ] || mkdir -p $$(@D) - $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD @cp $(OBJ_DIR)/$(2)/alpaka/$$*.cc.tbb.d $(OBJ_DIR)/$(2)/alpaka/$$*.cc.tbb.d.tmp; \ sed 's#\($(2)/alpaka/$$*\)\.o[ :]*#\1.o \1.d : #g' < $(OBJ_DIR)/$(2)/alpaka/$$*.cc.tbb.d.tmp > $(OBJ_DIR)/$(2)/alpaka/$$*.cc.tbb.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$$$//' \ @@ -240,7 +244,7 @@ $(OBJ_DIR)/$(2)/alpaka/%.cc.tbb.o: $(SRC_DIR)/$(2)/alpaka/%.cc ifdef CUDA_BASE $(OBJ_DIR)/$(2)/alpaka/%.cc.cuda.o: $(SRC_DIR)/$(2)/alpaka/%.cc @[ -d $$(@D) ] || mkdir -p $$(@D) - $(CUDA_NVCC) -x cu $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_NVCC_CXXFLAGS)) -c $$< -o $$@ -MMD + $(CUDA_NVCC) -x cu $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_NVCC_CXXFLAGS)) -c $$< -o $$@ -MMD $$($(1)_CUDADLINK): $$($(1)_CUOBJ) $(CUDA_NVCC) $(CUDA_DLINKFLAGS) $(CUDA_LDFLAGS) $$($(1)_CUOBJ) -o $$@ @@ -250,7 +254,7 @@ endif ifdef ROCM_BASE $(OBJ_DIR)/$(2)/alpaka/%.cc.rocm.o: $(SRC_DIR)/$(2)/alpaka/%.cc @[ -d $$(@D) ] || mkdir -p $$(@D) - $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD + $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $$(foreach dep,$(EXTERNAL_DEPENDS),$$($$(dep)_CXXFLAGS)) -c $$< -o $$@ -MMD endif endef @@ -271,7 +275,7 @@ $(OBJ_DIR)/$(TARGET_NAME)/bin/%.cc.o: $(SRC_DIR)/$(TARGET_NAME)/bin/%.cc # Serial backend $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.serial.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD @cp $(@D)/$*.cc.serial.d $(@D)/$*.cc.serial.d.tmp; \ sed 's#\($(TARGET_NAME)/$*\)\.o[ :]*#\1.o \1.d : #g' < $(@D)/$*.cc.serial.d.tmp > $(@D)/$*.cc.serial.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ @@ -280,12 +284,12 @@ $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.serial.o: $(SRC_DIR)/$(TARGET_NAME)/t $(TEST_DIR)/$(TARGET_NAME)/%.serial: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.serial.o | $(LIBS) @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_SERIAL_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) # TBB backend $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.tbb.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + $(CXX) $(CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED -DALPAKA_ACC_CPU_B_TBB_T_SEQ_ASYNC_BACKEND $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD @cp $(@D)/$*.cc.tbb.d $(@D)/$*.cc.tbb.d.tmp; \ sed 's#\($(TARGET_NAME)/$*\)\.o[ :]*#\1.o \1.d : #g' < $(@D)/$*.cc.tbb.d.tmp > $(@D)/$*.cc.tbb.d; \ sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ @@ -294,29 +298,29 @@ $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.tbb.o: $(SRC_DIR)/$(TARGET_NAME)/test $(TEST_DIR)/$(TARGET_NAME)/%.tbb: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.tbb.o | $(LIBS) @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_TBB_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) # CUDA backend ifdef CUDA_BASE $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.cuda.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(CUDA_NVCC) -x cu $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(CUDA_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_NVCC_CXXFLAGS)) -c $< -o $@ -MMD + $(CUDA_NVCC) -x cu $(CUDA_CUFLAGS) $(CUDA_CXXFLAGS) $(CUDA_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_NVCC_CXXFLAGS)) -c $< -o $@ -MMD $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cudadlink.o: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.o $(CUDA_NVCC) $(CUDA_DLINKFLAGS) $(CUDA_LDFLAGS) $< -o $@ $(TEST_DIR)/$(TARGET_NAME)/%.cuda: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.cuda.o $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.cuda.cudadlink.o | $(LIBS) @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_CUDA_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) endif # ROCm backend ifdef ROCM_BASE $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.rocm.o: $(SRC_DIR)/$(TARGET_NAME)/test/alpaka/%.cc @[ -d $(@D) ] || mkdir -p $(@D) - $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(ROCM_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD + $(ROCM_HIPCC) $(HIPCC_CXXFLAGS) $(ROCM_TEST_CXXFLAGS) $(MY_CXXFLAGS) -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ASYNC_BACKEND -UALPAKA_HOST_ONLY $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_CXXFLAGS)) -c $< -o $@ -MMD $(TEST_DIR)/$(TARGET_NAME)/%.rocm: $(OBJ_DIR)/$(TARGET_NAME)/test/alpaka/%.cc.rocm.o | $(LIBS) @[ -d $(@D) ] || mkdir -p $(@D) - $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(patsubst %,-l%,$(LIBNAMES)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) + $(CXX) $^ $(LDFLAGS) $(MY_LDFLAGS) -o $@ -L$(LIB_DIR)/$(TARGET_NAME) $(foreach lib,$(LIBNAMES),$($(lib)_LDFLAGS) $($(lib)_ROCM_LDFLAGS)) $(foreach dep,$(EXTERNAL_DEPENDS),$($(dep)_LDFLAGS)) endif diff --git a/src/alpakatest/bin/EventProcessor.cc b/src/alpakatest/bin/EventProcessor.cc index 3d7a3546f..7e84f5114 100644 --- a/src/alpakatest/bin/EventProcessor.cc +++ b/src/alpakatest/bin/EventProcessor.cc @@ -1,3 +1,4 @@ +#include #include #include #include @@ -13,7 +14,7 @@ namespace edm { EventProcessor::EventProcessor(int maxEvents, int runForMinutes, int numberOfStreams, - std::vector const& path, + Alternatives alternatives, std::vector const& esproducers, std::filesystem::path const& datadir, bool validation) @@ -24,9 +25,23 @@ namespace edm { esp->produce(eventSetup_); } + // normalise the total weight to the number of streams + float total = 0.; + for (auto const& alternative : alternatives) { + total += alternative.weight; + } //schedules_.reserve(numberOfStreams); - for (int i = 0; i < numberOfStreams; ++i) { - schedules_.emplace_back(registry_, pluginManager_, &source_, &eventSetup_, i, path); + float cumulative = 0.; + int lower_range = 0; + int upper_range = 0; + for (auto& alternative : alternatives) { + cumulative += alternative.weight; + lower_range = upper_range; + upper_range = static_cast(std::round(cumulative * numberOfStreams / total)); + for (int i = lower_range; i < upper_range; ++i) { + schedules_.emplace_back(registry_, pluginManager_, &source_, &eventSetup_, i, alternative.path); + } + streamsPerBackend_.emplace_back(alternative.backend, upper_range - lower_range); } } diff --git a/src/alpakatest/bin/EventProcessor.h b/src/alpakatest/bin/EventProcessor.h index 5fb20f0b9..e1ca9fe01 100644 --- a/src/alpakatest/bin/EventProcessor.h +++ b/src/alpakatest/bin/EventProcessor.h @@ -5,6 +5,7 @@ #include #include +#include "AlpakaCore/backend.h" #include "Framework/EventSetup.h" #include "PluginManager.h" @@ -12,18 +13,31 @@ #include "Source.h" namespace edm { + struct Alternative { + Alternative() = default; + Alternative(Backend backend, float weight, std::vector path) + : backend{backend}, weight{weight}, path{std::move(path)} {} + + Backend backend; + float weight; + std::vector path; + }; + + using Alternatives = std::vector; + class EventProcessor { public: explicit EventProcessor(int maxEvents, int runForMinutes, int numberOfStreams, - std::vector const& path, + Alternatives alternatives, std::vector const& esproducers, std::filesystem::path const& datadir, bool validation); int maxEvents() const { return source_.maxEvents(); } int processedEvents() const { return source_.processedEvents(); } + std::vector> const& backends() const { return streamsPerBackend_; } void runToCompletion(); @@ -35,6 +49,7 @@ namespace edm { Source source_; EventSetup eventSetup_; std::vector schedules_; + std::vector> streamsPerBackend_; }; } // namespace edm diff --git a/src/alpakatest/bin/PluginManager.cc b/src/alpakatest/bin/PluginManager.cc index 0fd46bbff..d1fa1d0f7 100644 --- a/src/alpakatest/bin/PluginManager.cc +++ b/src/alpakatest/bin/PluginManager.cc @@ -6,9 +6,6 @@ #include "PluginManager.h" -#ifndef SRC_DIR -#error "SRC_DIR undefined" -#endif #ifndef LIB_DIR #error "LIB_DIR undefined" #endif @@ -18,7 +15,7 @@ namespace edmplugin { PluginManager::PluginManager() { - std::ifstream pluginMap(STR(SRC_DIR) "/plugins.txt"); + std::ifstream pluginMap(STR(LIB_DIR) "/plugins.txt"); std::string plugin, library; while (pluginMap >> plugin >> library) { //std::cout << "plugin " << plugin << " in " << library << std::endl; diff --git a/src/alpakatest/bin/main.cc b/src/alpakatest/bin/main.cc index ba18f372e..8e7a36c6c 100644 --- a/src/alpakatest/bin/main.cc +++ b/src/alpakatest/bin/main.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -25,37 +26,37 @@ namespace { void print_help(std::string const& name) { std::cout << name << ": " -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT << "[--serial] " #endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT << "[--tbb] " #endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT << "[--cuda] " #endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT << "[--hip] " #endif << "[--numberOfThreads NT] [--numberOfStreams NS] [--maxEvents ME] [--data PATH] " "[--transfer]\n\n" << "Options\n" -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT << " --serial Use CPU Serial backend\n" #endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT << " --tbb Use CPU TBB backend\n" #endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT << " --cuda Use CUDA backend\n" #endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT << " --hip Use ROCm/HIP backend\n" #endif << " --numberOfThreads Number of threads to use (default 1, use 0 to use all CPU cores)\n" << " --numberOfStreams Number of concurrent events (default 0 = numberOfThreads)\n" << " --maxEvents Number of events to process (default -1 for all events in the input file)\n" - << " --runForMinutes Continue processing the set of 1000 events until this many minutes have passed" + << " --runForMinutes Continue processing the set of 1000 events until this many minutes have passed " "(default -1 for disabled; conflicts with --maxEvents)\n" << " --data Path to the 'data' directory (default 'data' in the directory of the executable)\n" << " --transfer Transfer results from GPU to CPU (default is to leave them on GPU)\n" @@ -64,10 +65,60 @@ namespace { } } // namespace +bool getOptionalArgument(std::vector const& args, std::vector::iterator& i, int& value) { + auto it = i; + ++it; + if (it == args.end()) { + return false; + } + try { + value = std::stoi(*it); + ++i; + return true; + } catch (...) { + return false; + } +} + +bool getOptionalArgument(std::vector const& args, std::vector::iterator& i, float& value) { + auto it = i; + ++it; + if (it == args.end()) { + return false; + } + try { + value = std::stof(*it); + ++i; + return true; + } catch (...) { + return false; + } +} + +bool getOptionalArgument(std::vector const& args, + std::vector::iterator& i, + std::filesystem::path& value) { + auto it = i; + ++it; + if (it == args.end()) { + return false; + } + value = *it; + return true; +} + +template +void getArgument(std::vector const& args, std::vector::iterator& i, T& value) { + if (not getOptionalArgument(args, i, value)) { + std::cerr << "error: " << *i << " expects an argument" << std::endl; + exit(EXIT_FAILURE); + } +} + int main(int argc, char** argv) { // Parse command line arguments std::vector args(argv, argv + argc); - std::vector backends; + std::unordered_map backends; int numberOfThreads = 1; int numberOfStreams = 0; int maxEvents = -1; @@ -79,37 +130,40 @@ int main(int argc, char** argv) { if (*i == "-h" or *i == "--help") { print_help(args.front()); return EXIT_SUCCESS; -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT } else if (*i == "--serial") { - backends.emplace_back(Backend::SERIAL); + float weight = 1.; + getOptionalArgument(args, i, weight); + backends.insert_or_assign(Backend::SERIAL, weight); #endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT } else if (*i == "--tbb") { - backends.emplace_back(Backend::TBB); + float weight = 1.; + getOptionalArgument(args, i, weight); + backends.insert_or_assign(Backend::TBB, weight); #endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT } else if (*i == "--cuda") { - backends.emplace_back(Backend::CUDA); + float weight = 1.; + getOptionalArgument(args, i, weight); + backends.insert_or_assign(Backend::CUDA, weight); #endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT } else if (*i == "--hip") { - backends.emplace_back(Backend::HIP); + float weight = 1.; + getOptionalArgument(args, i, weight); + backends.insert_or_assign(Backend::HIP, weight); #endif } else if (*i == "--numberOfThreads") { - ++i; - numberOfThreads = std::stoi(*i); + getArgument(args, i, numberOfThreads); } else if (*i == "--numberOfStreams") { - ++i; - numberOfStreams = std::stoi(*i); + getArgument(args, i, numberOfStreams); } else if (*i == "--maxEvents") { - ++i; - maxEvents = std::stoi(*i); + getArgument(args, i, maxEvents); } else if (*i == "--runForMinutes") { - ++i; - runForMinutes = std::stoi(*i); + getArgument(args, i, runForMinutes); } else if (*i == "--data") { - ++i; - datadir = *i; + getArgument(args, i, datadir); } else if (*i == "--transfer") { transfer = true; } else if (*i == "--empty") { @@ -139,65 +193,65 @@ int main(int argc, char** argv) { } // Initialiase the selected backends -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - if (std::find(backends.begin(), backends.end(), Backend::SERIAL) != backends.end()) { +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_PRESENT + if (backends.find(Backend::SERIAL) != backends.end()) { cms::alpakatools::initialise(); } #endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - if (std::find(backends.begin(), backends.end(), Backend::TBB) != backends.end()) { +#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_PRESENT + if (backends.find(Backend::TBB) != backends.end()) { cms::alpakatools::initialise(); } #endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if (std::find(backends.begin(), backends.end(), Backend::CUDA) != backends.end()) { +#ifdef ALPAKA_ACC_GPU_CUDA_PRESENT + if (backends.find(Backend::CUDA) != backends.end()) { cms::alpakatools::initialise(); } #endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - if (std::find(backends.begin(), backends.end(), Backend::HIP) != backends.end()) { +#ifdef ALPAKA_ACC_GPU_HIP_PRESENT + if (backends.find(Backend::HIP) != backends.end()) { cms::alpakatools::initialise(); } #endif // Initialize EventProcessor - std::vector edmodules; std::vector esmodules; + edm::Alternatives alternatives; if (not empty) { - auto addModules = [&](std::string const& accelerator_namespace, Backend backend) { - if (std::find(backends.begin(), backends.end(), backend) != backends.end()) { - edmodules.emplace_back(accelerator_namespace + "::" + "TestProducer"); - edmodules.emplace_back(accelerator_namespace + "::" + "TestProducer3"); - edmodules.emplace_back(accelerator_namespace + "::" + "TestProducer2"); - } - }; - -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED - addModules("alpaka_serial_sync", Backend::SERIAL); -#endif -#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - addModules("alpaka_tbb_async", Backend::TBB); -#endif -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - addModules("alpaka_cuda_async", Backend::CUDA); -#endif -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - addModules("alpaka_rocm_async", Backend::HIP); -#endif + // host-only ESModules esmodules = {"IntESProducer"}; - if (transfer) { - // add modules for transfer + for (auto const& [backend, weight] : backends) { + std::string prefix = "alpaka_" + name(backend) + "::"; + // "portable" EDModules + std::vector edmodules; + edmodules.emplace_back(prefix + "TestProducer"); + edmodules.emplace_back(prefix + "TestProducer3"); + edmodules.emplace_back(prefix + "TestProducer2"); + if (transfer) { + // add modules for transfer + } + alternatives.emplace_back(backend, weight, std::move(edmodules)); } } edm::EventProcessor processor( - maxEvents, runForMinutes, numberOfStreams, std::move(edmodules), std::move(esmodules), datadir, false); + maxEvents, runForMinutes, numberOfStreams, std::move(alternatives), std::move(esmodules), datadir, false); if (runForMinutes < 0) { - std::cout << "Processing " << processor.maxEvents() << " events, of which " << numberOfStreams - << " concurrently, with " << numberOfThreads << " threads." << std::endl; + std::cout << "Processing " << processor.maxEvents() << " events,"; } else { - std::cout << "Processing for about " << runForMinutes << " minutes with " << numberOfStreams - << " concurrent events and " << numberOfThreads << " threads." << std::endl; + std::cout << "Processing for about " << runForMinutes << " minutes,"; + } + { + std::cout << " with " << numberOfStreams << " concurrent events ("; + bool need_comma = false; + for (auto const& [backend, streams] : processor.backends()) { + if (need_comma) { + std::cout << ", "; + } + std::cout << streams << " on " << backend; + need_comma = true; + } + std::cout << ") and " << numberOfThreads << " threads." << std::endl; } // Initialize the TBB thread pool diff --git a/src/alpakatest/plugins.txt b/src/alpakatest/plugins.txt deleted file mode 100644 index 6c4d0c386..000000000 --- a/src/alpakatest/plugins.txt +++ /dev/null @@ -1,13 +0,0 @@ -IntESProducer pluginTest1.so -alpaka_serial_sync::TestProducer pluginTest1_serial.so -alpaka_tbb_async::TestProducer pluginTest1_tbb.so -alpaka_cuda_async::TestProducer pluginTest1_cuda.so -alpaka_rocm_async::TestProducer pluginTest1_rocm.so -alpaka_serial_sync::TestProducer2 pluginTest2_serial.so -alpaka_tbb_async::TestProducer2 pluginTest2_tbb.so -alpaka_cuda_async::TestProducer2 pluginTest2_cuda.so -alpaka_rocm_async::TestProducer2 pluginTest2_rocm.so -alpaka_serial_sync::TestProducer3 pluginTest2_serial.so -alpaka_tbb_async::TestProducer3 pluginTest2_tbb.so -alpaka_cuda_async::TestProducer3 pluginTest2_cuda.so -alpaka_rocm_async::TestProducer3 pluginTest2_rocm.so diff --git a/src/alpakatest/test/alpaka/hello.cc b/src/alpakatest/test/alpaka/hello.cc index 7d8d134ae..fe5fd4201 100644 --- a/src/alpakatest/test/alpaka/hello.cc +++ b/src/alpakatest/test/alpaka/hello.cc @@ -1,16 +1,19 @@ #include -#include "AlpakaCore/alpakaConfig.h" - int main() { - std::cout << "Hello from " -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_SYNC_BACKEND - << "CPU serial" -#elif defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED - << "CPU TBB" -#elif defined ALPAKA_ACC_GPU_CUDA_ENABLED - << "CUDA" + std::cout << "Hello from the " +#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + << "CPU serial " +#endif +#if defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED + << "CPU TBB " +#endif +#if defined ALPAKA_ACC_GPU_CUDA_ENABLED + << "CUDA " +#endif +#if defined ALPAKA_ACC_GPU_HIP_ENABLED + << "HIP/ROCm " #endif - << " backend" << std::endl; + << "backend" << std::endl; return 0; }