diff --git a/includes/Milhoja_MoverUnpacker.h b/includes/Milhoja_MoverUnpacker.h index d5a44424..50f3e808 100644 --- a/includes/Milhoja_MoverUnpacker.h +++ b/includes/Milhoja_MoverUnpacker.h @@ -46,6 +46,7 @@ #include +#include "Milhoja_DataItem.h" #include "Milhoja_TileWrapper.h" #include "Milhoja_RuntimeElement.h" @@ -63,7 +64,6 @@ class MoverUnpacker : public RuntimeElement { MoverUnpacker& operator=(const MoverUnpacker&) = delete; MoverUnpacker& operator=(MoverUnpacker&&) = delete; - void setReceiverProto(TileWrapper const *); void startCycle(void); void increaseThreadCount(const unsigned int nThreads) override; void enqueue(std::shared_ptr&& dataItem) override; @@ -72,6 +72,7 @@ class MoverUnpacker : public RuntimeElement { void wait(void); RuntimeElement* dataReceiver(void) const { return dataReceiver_; } + const DataItem* receiverPrototype(void) const { return receiverPrototype_; } private: enum class State {Idle, Open, Closed}; diff --git a/includes/Milhoja_Runtime.h b/includes/Milhoja_Runtime.h index 314c57bd..ec10fd4f 100644 --- a/includes/Milhoja_Runtime.h +++ b/includes/Milhoja_Runtime.h @@ -165,10 +165,30 @@ class Runtime { void executeExtendedCpuGpuSplitTasks(const std::string& bundleName, const unsigned int nDistributorThreads, const RuntimeAction& actionA_cpu, + const TileWrapper& tilePrototype, const RuntimeAction& actionA_gpu, - const RuntimeAction& postActionB_cpu, const DataPacket& packetPrototype, + const RuntimeAction& postActionB_cpu, + const TileWrapper& postTilePrototype, const unsigned int nTilesPerCpuTurn); +# ifndef RUNTIME_MUST_USE_TILEITER + void setupPipelineForExtCpuGpuSplitTasks(const std::string& bundleName, + const RuntimeAction& actionA_cpu, + const TileWrapper& tilePrototype, + const RuntimeAction& actionA_gpu, + const DataPacket& packetPrototype, + const RuntimeAction& postActionB_cpu, + const TileWrapper& postTilePrototype, + const unsigned int nTilesPerCpuTurn); + void pushTileToExtCpuGpuSplitPipeline(const std::string& bundleName, + const TileWrapper& tilePrototype, + const DataPacket& packetPrototype, + const TileWrapper& postTilePrototype, + const FlashxrTileRawPtrs& tP, + const FlashxTileRawInts& tI, + const FlashxTileRawReals& tR); + void teardownPipelineForExtCpuGpuSplitTasks(const std::string& bundleName); +# endif void executeCpuGpuWowzaTasks(const std::string& bundleName, const RuntimeAction& actionA_cpu, const TileWrapper& tilePrototype, diff --git a/includes/Milhoja_RuntimeElement.h b/includes/Milhoja_RuntimeElement.h index 476d0a81..0ff0bd6e 100644 --- a/includes/Milhoja_RuntimeElement.h +++ b/includes/Milhoja_RuntimeElement.h @@ -46,6 +46,8 @@ class RuntimeElement { virtual std::string attachDataReceiver(RuntimeElement* receiver); virtual std::string detachDataReceiver(void); + virtual std::string setReceiverPrototype(const DataItem* prototype); + protected: RuntimeElement(void); virtual ~RuntimeElement(void); @@ -58,6 +60,7 @@ class RuntimeElement { to once this team's action has already been applied to the items. */ + const DataItem* receiverPrototype_; std::map calledCloseQueue_; /*!< The keys in this map serve as a list of data publishers attached to the object. 
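Taken together, the Runtime declarations above define the lazy, push-driven sequence for the extended CPU/GPU split configuration: set the pipeline up once, push tiles one at a time while the application walks its own iterator, then tear the pipeline down. Below is a minimal C++ sketch of that sequence, assuming application-supplied prototype objects and task routines; every name other than the Milhoja API itself is a placeholder, and the header list is an assumption, not part of this patch.

    // Hedged sketch (not part of the patch): intended call order for the new
    // ExtCpuGpuSplit pipeline API.  tileProto/packetProto/postTileProto, the
    // task routines, thread counts, and TileRaw are application placeholders.
    #include <vector>

    #include "Milhoja_Runtime.h"        // assumed to pull in RuntimeAction, TileWrapper, DataPacket
    #include "Milhoja_RuntimeAction.h"

    struct TileRaw {                    // per-tile raw Flash-X handles (placeholder bundle)
        milhoja::FlashxrTileRawPtrs  tP;
        milhoja::FlashxTileRawInts   tI;
        milhoja::FlashxTileRawReals  tR;
    };

    void driveExtCpuGpuSplit(const milhoja::TileWrapper& tileProto,
                             const milhoja::DataPacket&  packetProto,
                             const milhoja::TileWrapper& postTileProto,
                             milhoja::ACTION_ROUTINE     cpuTask,
                             milhoja::ACTION_ROUTINE     gpuTask,
                             milhoja::ACTION_ROUTINE     postTask,
                             const std::vector<TileRaw>& tiles) {
        using namespace milhoja;

        RuntimeAction   cpuAction;                // action A on individual tiles (CPU share)
        cpuAction.name             = "cpuAction";
        cpuAction.nInitialThreads  = 2;
        cpuAction.teamType         = ThreadTeamDataType::BLOCK;
        cpuAction.nTilesPerPacket  = 0;
        cpuAction.routine          = cpuTask;

        RuntimeAction   gpuAction;                // action A on packets of tiles (GPU share)
        gpuAction.name             = "gpuAction";
        gpuAction.nInitialThreads  = 2;
        gpuAction.teamType         = ThreadTeamDataType::SET_OF_BLOCKS;
        gpuAction.nTilesPerPacket  = 16;
        gpuAction.routine          = gpuTask;

        RuntimeAction   postAction;               // post-action B applied to every tile on the CPU
        postAction.name            = "postAction";
        postAction.nInitialThreads = 2;
        postAction.teamType        = ThreadTeamDataType::BLOCK;
        postAction.nTilesPerPacket = 0;
        postAction.routine         = postTask;

        Runtime&   runtime = Runtime::instance();
        runtime.setupPipelineForExtCpuGpuSplitTasks("Bundle", cpuAction, tileProto,
                                                    gpuAction, packetProto,
                                                    postAction, postTileProto,
                                                    /* nTilesPerCpuTurn = */ 2);
        for (const auto& t : tiles) {             // one push per leaf tile
            runtime.pushTileToExtCpuGpuSplitPipeline("Bundle", tileProto, packetProto,
                                                     postTileProto, t.tP, t.tI, t.tR);
        }
        runtime.teardownPipelineForExtCpuGpuSplitTasks("Bundle");
    }

The C interface and Fortran module additions below funnel into this same sequence, one C-callable function per step.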
diff --git a/includes/Milhoja_ThreadTeam.h b/includes/Milhoja_ThreadTeam.h index 57bbce56..a594624d 100644 --- a/includes/Milhoja_ThreadTeam.h +++ b/includes/Milhoja_ThreadTeam.h @@ -54,6 +54,7 @@ #include +#include "Milhoja_TileWrapper.h" #include "Milhoja_actionRoutine.h" #include "Milhoja_RuntimeAction.h" #include "Milhoja_ThreadTeamMode.h" @@ -113,6 +114,7 @@ class ThreadTeam : public RuntimeElement { // into thread team configurations. std::string attachDataReceiver(RuntimeElement* receiver) override; std::string detachDataReceiver(void) override; + std::string setReceiverPrototype(const DataItem* prototype) override; protected: constexpr static unsigned int THREAD_START_STOP_TIMEOUT_SEC = 1; diff --git a/interfaces/Milhoja_runtime_C_interface.cpp b/interfaces/Milhoja_runtime_C_interface.cpp index ca60edb8..d11ff6bb 100644 --- a/interfaces/Milhoja_runtime_C_interface.cpp +++ b/interfaces/Milhoja_runtime_C_interface.cpp @@ -568,6 +568,77 @@ extern "C" { return MILHOJA_SUCCESS; } + int milhoja_runtime_setup_pipeline_extcpugpusplit_c(milhoja::ACTION_ROUTINE cpuTaskFunction, + milhoja::ACTION_ROUTINE gpuTaskFunction, + milhoja::ACTION_ROUTINE postTaskFunction, + const int nThreads, + const int nTilesPerPacket, + const int nTilesPerCpuTurn, + void* packet, + void* tileWrapper, + void* postTileWrapper) { + if (nThreads < 0) { + std::cerr + << "[milhoja_runtime_setup_pipeline_extcpugpusplit_c] nThreads is negative" + << std::endl; + return MILHOJA_ERROR_N_THREADS_NEGATIVE; + } else if (nTilesPerPacket < 0) { + std::cerr + << "[milhoja_runtime_setup_pipeline_extcpugpusplit_c] nTilesPerPacket is negative" + << std::endl; + return MILHOJA_ERROR_N_TILES_NEGATIVE; + } + + unsigned int nDistributorThreads_ui = 0; + unsigned int nThreads_ui = static_cast(nThreads); + unsigned int nTilesPerPacket_ui = static_cast(nTilesPerPacket); + unsigned int nTilesPerCpuTurn_ui = static_cast(nTilesPerCpuTurn); + + milhoja::TileWrapper* tilePrototype = static_cast(tileWrapper); + milhoja::TileWrapper* postTilePrototype = static_cast(postTileWrapper); + milhoja::DataPacket* pktPrototype = static_cast(packet); + + milhoja::RuntimeAction pktAction; + pktAction.name = "Lazy GPU setup Action Name"; + pktAction.nInitialThreads = nThreads_ui; + pktAction.teamType = milhoja::ThreadTeamDataType::SET_OF_BLOCKS; + pktAction.nTilesPerPacket = nTilesPerPacket_ui; + pktAction.routine = gpuTaskFunction; + + milhoja::RuntimeAction cpuAction; + cpuAction.name = "Lazy CPU setup Action Name"; + cpuAction.nInitialThreads = nThreads_ui; + cpuAction.teamType = milhoja::ThreadTeamDataType::BLOCK; + cpuAction.nTilesPerPacket = 0; + cpuAction.routine = cpuTaskFunction; + + milhoja::RuntimeAction postAction; + postAction.name = "Lazy post CPU setup Action Name"; + postAction.nInitialThreads = nThreads_ui; + postAction.teamType = milhoja::ThreadTeamDataType::BLOCK; + postAction.nTilesPerPacket = 0; + postAction.routine = postTaskFunction; + + try { + milhoja::Runtime::instance().setupPipelineForExtCpuGpuSplitTasks("EXT CPUGPU Split Bundle Name", + cpuAction, + *tilePrototype, + pktAction, + *pktPrototype, + postAction, + *postTilePrototype, + nTilesPerCpuTurn_ui); + } catch (const std::exception& exc) { + std::cerr << exc.what() << std::endl; + return MILHOJA_ERROR_UNABLE_TO_SETUP_PIPELINE; + } catch (...) 
{ + std::cerr << "[milhoja_runtime_setup_pipeline_extcpugpusplit_c] Unknown error caught" << std::endl; + return MILHOJA_ERROR_UNABLE_TO_SETUP_PIPELINE; + } + + return MILHOJA_SUCCESS; + } + int milhoja_runtime_teardown_pipeline_gpu_c(const int nThreads, const int nTilesPerPacket) { if (nThreads < 0) { // nThreads: only use in this function @@ -679,6 +750,34 @@ extern "C" { return MILHOJA_SUCCESS; } + + int milhoja_runtime_teardown_pipeline_extcpugpusplit_c(const int nThreads, + const int nTilesPerPacket) { + if (nThreads < 0) { // nThreads: only use in this function + std::cerr + << "[milhoja_runtime_teardown_pipeline_extcpugpusplit_c] nThreads is negative" + << std::endl; + return MILHOJA_ERROR_N_THREADS_NEGATIVE; + } else if (nTilesPerPacket < 0) { // nTilesPerPacket: only use here + std::cerr + << "[milhoja_runtime_teardown_pipeline_extcpugpusplit_c] nTilesPerPacket is negative" + << std::endl; + return MILHOJA_ERROR_N_TILES_NEGATIVE; + } + + try { + milhoja::Runtime::instance().teardownPipelineForExtCpuGpuSplitTasks( + "Lazy EXT CPUGPU Split setup Bundle Name"); + } catch (const std::exception& exc) { + std::cerr << exc.what() << std::endl; + return MILHOJA_ERROR_UNABLE_TO_TEARDOWN_PIPELINE; + } catch (...) { + std::cerr << "[milhoja_runtime_teardown_pipeline_extcpugpusplit_c] Unknown error caught" << std::endl; + return MILHOJA_ERROR_UNABLE_TO_TEARDOWN_PIPELINE; + } + + return MILHOJA_SUCCESS; + } /** * Push one tile to the prepared pipeline so that the thread team will * eventually execute the task. @@ -797,6 +896,37 @@ extern "C" { return MILHOJA_SUCCESS; } + + int milhoja_runtime_push_pipeline_extcpugpusplit_c(void* tileWrapper, + void* packet, + void* postTileWrapper, + const int nThreads, + FlashxTileRaw* tileInfo) { + if (nThreads < 0) { + std::cerr << "[milhoja_runtime_push_pipeline_extcpugpusplit_c] nThreads is negative" << std::endl; + return MILHOJA_ERROR_N_THREADS_NEGATIVE; + } + + milhoja::TileWrapper* tilePrototype = static_cast(tileWrapper); + milhoja::TileWrapper* postTilePrototype = static_cast(postTileWrapper); + milhoja::DataPacket* pktPrototype = static_cast(packet); + + try { + milhoja::Runtime::instance().pushTileToExtCpuGpuSplitPipeline("Lazy Bundle Name", + *tilePrototype, + *pktPrototype, + *postTilePrototype, + tileInfo->sP, tileInfo->sI, tileInfo->sR); + } catch (const std::exception& exc) { + std::cerr << exc.what() << std::endl; + return MILHOJA_ERROR_UNABLE_TO_EXECUTE_TASKS; + } catch (...) 
{ + std::cerr << "[milhoja_runtime_push_pipeline_extcpugpusplit_c] Unknown error caught" << std::endl; + return MILHOJA_ERROR_UNABLE_TO_EXECUTE_TASKS; + } + + return MILHOJA_SUCCESS; + } # endif # ifdef RUNTIME_CAN_USE_TILEITER @@ -1141,6 +1271,84 @@ extern "C" { return MILHOJA_SUCCESS; } + + int milhoja_runtime_execute_tasks_extcpugpusplit_c(milhoja::ACTION_ROUTINE cpuTaskFunction, + milhoja::ACTION_ROUTINE gpuTaskFunction, + milhoja::ACTION_ROUTINE postTaskFunction, + const int nDistributorThreads, + const int nThreads, + const int nTilesPerPacket, + const int nTilesPerCpuTurn, + void* packet, + void* tileWrapper, + void* postTileWrapper) { + if (nDistributorThreads < 0) { + std::cerr + << "[milhoja_runtime_execute_tasks_extcpugpusplit_c] nDistributorThreads is negative" + << std::endl; + return MILHOJA_ERROR_N_THREADS_NEGATIVE; + } else if (nThreads < 0) { + std::cerr + << "[milhoja_runtime_execute_tasks_extcpugpusplit_c] nThreads is negative" + << std::endl; + return MILHOJA_ERROR_N_THREADS_NEGATIVE; + } else if (nTilesPerPacket < 0) { + std::cerr + << "[milhoja_runtime_execute_tasks_extcpugpusplit_c] nTilesPerPacket is negative" + << std::endl; + return MILHOJA_ERROR_N_TILES_NEGATIVE; + } + + unsigned int nDistributorThreads_ui = static_cast(nDistributorThreads); + unsigned int nThreads_ui = static_cast(nThreads); + unsigned int nTilesPerPacket_ui = static_cast(nTilesPerPacket); + unsigned int nTilesPerCpuTurn_ui = static_cast(nTilesPerCpuTurn); + + milhoja::TileWrapper* tilePrototype = static_cast(tileWrapper); + milhoja::TileWrapper* postTilePrototype = static_cast(postTileWrapper); + milhoja::DataPacket* pktPrototype = static_cast(packet); + + milhoja::RuntimeAction pktAction; + pktAction.name = "Lazy GPU Action Name"; + pktAction.nInitialThreads = nThreads_ui; + pktAction.teamType = milhoja::ThreadTeamDataType::SET_OF_BLOCKS; + pktAction.nTilesPerPacket = nTilesPerPacket_ui; + pktAction.routine = gpuTaskFunction; + + milhoja::RuntimeAction cpuAction; + cpuAction.name = "Lazy CPU Action Name"; + cpuAction.nInitialThreads = nThreads_ui; + cpuAction.teamType = milhoja::ThreadTeamDataType::BLOCK; + cpuAction.nTilesPerPacket = 0; + cpuAction.routine = cpuTaskFunction; + + milhoja::RuntimeAction postAction; + postAction.name = "Lazy CPU Action Name"; + postAction.nInitialThreads = nThreads_ui; + postAction.teamType = milhoja::ThreadTeamDataType::BLOCK; + postAction.nTilesPerPacket = 0; + postAction.routine = postTaskFunction; + + try { + milhoja::Runtime::instance().executeExtendedCpuGpuSplitTasks("Lazy GPU Bundle Name", + nDistributorThreads_ui, + cpuAction, + *tilePrototype, + pktAction, + *pktPrototype, + postAction, + *postTilePrototype, + nTilesPerCpuTurn_ui); + } catch (const std::exception& exc) { + std::cerr << exc.what() << std::endl; + return MILHOJA_ERROR_UNABLE_TO_EXECUTE_TASKS; + } catch (...) 
{ + std::cerr << "[milhoja_runtime_execute_tasks_extcpugpusplit_c] Unknown error caught" << std::endl; + return MILHOJA_ERROR_UNABLE_TO_EXECUTE_TASKS; + } + + return MILHOJA_SUCCESS; + } # endif #endif // #ifdef RUNTIME_SUPPORT_DATAPACKETS } diff --git a/interfaces/Milhoja_runtime_mod.F90 b/interfaces/Milhoja_runtime_mod.F90 index 91503979..6f05cdd2 100644 --- a/interfaces/Milhoja_runtime_mod.F90 +++ b/interfaces/Milhoja_runtime_mod.F90 @@ -33,6 +33,9 @@ module milhoja_runtime_mod public :: milhoja_runtime_setupPipelineForExtGpuTasks public :: milhoja_runtime_pushTileToExtGpuPipeline public :: milhoja_runtime_teardownPipelineForExtGpuTasks + public :: milhoja_runtime_setupPipelineForExtCpuGpuSplitTasks + public :: milhoja_runtime_pushTileToExtCpuGpuSplitPipeline + public :: milhoja_runtime_teardownPipelineForExtCpuGpuSplitTasks # endif #endif #ifdef RUNTIME_SUPPORT_EXECUTE @@ -42,6 +45,7 @@ module milhoja_runtime_mod public :: milhoja_runtime_executeTasks_CpuGpu public :: milhoja_runtime_executeTasks_CpuGpuSplit public :: milhoja_runtime_executeTasks_ExtGpu + public :: milhoja_runtime_executeTasks_ExtCpuGpuSplit # endif #endif @@ -135,7 +139,7 @@ end function milhoja_runtime_teardown_pipeline_cpu_c !> Fortran interface on routine in C interface of same name. function milhoja_runtime_push_pipeline_cpu_c(C_tileWrapperPrototype, & C_nThreads, & - tileCINfo) result(C_ierr) & + tileCInfo) result(C_ierr) & bind(c) use iso_c_binding, ONLY : C_PTR use milhoja_types_mod, ONLY : MILHOJA_INT @@ -214,6 +218,30 @@ function milhoja_runtime_setup_pipeline_extgpu_c(C_taskFunction, & integer(MILHOJA_INT), intent(IN), value :: C_nTilesPerPacket integer(MILHOJA_INT) :: C_ierr end function milhoja_runtime_setup_pipeline_extgpu_c + function milhoja_runtime_setup_pipeline_extcpugpusplit_c(C_cpuTaskFunction, & + C_gpuTaskFunction, & + C_postTaskFunction, & + C_nThreads, & + C_nTilesPerPacket, & + C_nTilesPerCpuTurn, & + C_packetPrototype, & + C_tilePrototype, & + C_postTilePrototype) result(C_ierr) & + bind(c) + use iso_c_binding, ONLY : C_PTR, C_FUNPTR + use milhoja_types_mod, ONLY : MILHOJA_INT + implicit none + type(C_FUNPTR), intent(IN), value :: C_cpuTaskFunction + type(C_FUNPTR), intent(IN), value :: C_gpuTaskFunction + type(C_FUNPTR), intent(IN), value :: C_postTaskFunction + type(C_PTR), intent(IN), value :: C_packetPrototype + type(C_PTR), intent(IN), value :: C_tilePrototype + type(C_PTR), intent(IN), value :: C_postTilePrototype + integer(MILHOJA_INT), intent(IN), value :: C_nThreads + integer(MILHOJA_INT), intent(IN), value :: C_nTilesPerPacket + integer(MILHOJA_INT), intent(IN), value :: C_nTilesPerCpuTurn + integer(MILHOJA_INT) :: C_ierr + end function milhoja_runtime_setup_pipeline_extcpugpusplit_c !> Fortran interface for the function in C interface of the same name. 
function milhoja_runtime_teardown_pipeline_gpu_c(C_nThreads, & @@ -256,11 +284,21 @@ function milhoja_runtime_teardown_pipeline_extgpu_c(C_nThreads, & integer(MILHOJA_INT), intent(IN), value :: C_nTilesPerPacket integer(MILHOJA_INT) :: C_ierr end function milhoja_runtime_teardown_pipeline_extgpu_c + function milhoja_runtime_teardown_pipeline_extcpugpusplit_c(C_nThreads, & + C_nTilesPerPacket) & + result(C_ierr) & + bind(c) + use milhoja_types_mod, ONLY : MILHOJA_INT + implicit none + integer(MILHOJA_INT), intent(IN), value :: C_nThreads + integer(MILHOJA_INT), intent(IN), value :: C_nTilesPerPacket + integer(MILHOJA_INT) :: C_ierr + end function milhoja_runtime_teardown_pipeline_extcpugpusplit_c !> Fortran interface for the function in C interface of the same name. function milhoja_runtime_push_pipeline_gpu_c(C_packetPrototype, & C_nThreads, & - tileCINfo) result(C_ierr) & + tileCInfo) result(C_ierr) & bind(c) use iso_c_binding, ONLY : C_PTR use milhoja_types_mod, ONLY : MILHOJA_INT @@ -273,7 +311,7 @@ end function milhoja_runtime_push_pipeline_gpu_c function milhoja_runtime_push_pipeline_cpugpu_c(C_tilePrototype, & C_packetPrototype, & C_nThreads, & - tileCINfo) result(C_ierr) & + tileCInfo) result(C_ierr) & bind(c) use iso_c_binding, ONLY : C_PTR use milhoja_types_mod, ONLY : MILHOJA_INT @@ -287,7 +325,7 @@ end function milhoja_runtime_push_pipeline_cpugpu_c function milhoja_runtime_push_pipeline_cpugpusplit_c(C_tilePrototype, & C_packetPrototype, & C_nThreads, & - tileCINfo) result(C_ierr) & + tileCInfo) result(C_ierr) & bind(c) use iso_c_binding, ONLY : C_PTR use milhoja_types_mod, ONLY : MILHOJA_INT @@ -300,7 +338,7 @@ function milhoja_runtime_push_pipeline_cpugpusplit_c(C_tilePrototype, & end function milhoja_runtime_push_pipeline_cpugpusplit_c function milhoja_runtime_push_pipeline_extgpu_c(C_packetPrototype, & C_nThreads, & - tileCINfo) result(C_ierr) & + tileCInfo) result(C_ierr) & bind(c) use iso_c_binding, ONLY : C_PTR use milhoja_types_mod, ONLY : MILHOJA_INT @@ -310,6 +348,22 @@ function milhoja_runtime_push_pipeline_extgpu_c(C_packetPrototype, & type(C_PTR), intent(IN), value :: tileCInfo integer(MILHOJA_INT) :: C_ierr end function milhoja_runtime_push_pipeline_extgpu_c + function milhoja_runtime_push_pipeline_extcpugpusplit_c(C_tilePrototype, & + C_packetPrototype, & + C_postTilePrototype, & + C_nThreads, & + tileCInfo) result(C_ierr) & + bind(c) + use iso_c_binding, ONLY : C_PTR + use milhoja_types_mod, ONLY : MILHOJA_INT + implicit none + type(C_PTR), intent(IN), value :: C_tilePrototype + type(C_PTR), intent(IN), value :: C_packetPrototype + type(C_PTR), intent(IN), value :: C_postTilePrototype + integer(MILHOJA_INT), intent(IN), value :: C_nThreads + type(C_PTR), intent(IN), value :: tileCInfo + integer(MILHOJA_INT) :: C_ierr + end function milhoja_runtime_push_pipeline_extcpugpusplit_c # ifdef RUNTIME_SUPPORT_EXECUTE !> Fortran interface for the function in C interface of the same name. 
@@ -391,6 +445,32 @@ function milhoja_runtime_execute_tasks_extgpu_c(C_taskFunction, & type(C_PTR), intent(IN), value :: C_tilePrototype integer(MILHOJA_INT) :: C_ierr end function milhoja_runtime_execute_tasks_extgpu_c + function milhoja_runtime_execute_tasks_extcpugpusplit_c(C_cpuTaskFunction, & + C_gpuTaskFunction, & + C_postTaskFunction, & + C_nDistributorThreads, & + C_nThreads, & + C_nTilesPerPacket, & + C_nTilesPerCpuTurn, & + C_packetPrototype, & + C_tilePrototype, & + C_postTilePrototype) & + result(C_ierr) bind(c) + use iso_c_binding, ONLY : C_PTR, C_FUNPTR + use milhoja_types_mod, ONLY : MILHOJA_INT + implicit none + type(C_FUNPTR), intent(IN), value :: C_cpuTaskFunction + type(C_FUNPTR), intent(IN), value :: C_gpuTaskFunction + type(C_FUNPTR), intent(IN), value :: C_postTaskFunction + integer(MILHOJA_INT), intent(IN), value :: C_nDistributorThreads + integer(MILHOJA_INT), intent(IN), value :: C_nThreads + integer(MILHOJA_INT), intent(IN), value :: C_nTilesPerPacket + integer(MILHOJA_INT), intent(IN), value :: C_nTilesPerCpuTurn + type(C_PTR), intent(IN), value :: C_packetPrototype + type(C_PTR), intent(IN), value :: C_tilePrototype + type(C_PTR), intent(IN), value :: C_postTilePrototype + integer(MILHOJA_INT) :: C_ierr + end function milhoja_runtime_execute_tasks_extcpugpusplit_c # endif #endif end interface @@ -631,6 +711,49 @@ subroutine milhoja_runtime_setupPipelineForExtGpuTasks(taskFunction, & packetPrototype_Cptr, & tilePrototype_Cptr) end subroutine milhoja_runtime_setupPipelineForExtGpuTasks + subroutine milhoja_runtime_setupPipelineForExtCpuGpuSplitTasks(cpuTaskFunction, & + gpuTaskFunction, & + postTaskFunction, & + nThreads, & + nTilesPerPacket, & + nTilesPerCpuTurn, & + packetPrototype_Cptr, & + tilePrototype_Cptr, & + postTilePrototype_Cptr, & + ierr) + use iso_c_binding, ONLY : C_PTR, & + C_FUNPTR, & + C_FUNLOC + + procedure(milhoja_runtime_taskFunction) :: cpuTaskFunction + procedure(milhoja_runtime_taskFunction) :: gpuTaskFunction + procedure(milhoja_runtime_taskFunction) :: postTaskFunction + type(C_PTR), intent(IN) :: packetPrototype_Cptr + type(C_PTR), intent(IN) :: tilePrototype_Cptr + type(C_PTR), intent(IN) :: postTilePrototype_Cptr + integer(MILHOJA_INT), intent(IN) :: nThreads + integer(MILHOJA_INT), intent(IN) :: nTilesPerPacket + integer(MILHOJA_INT), intent(IN) :: nTilesPerCpuTurn + integer(MILHOJA_INT), intent(OUT) :: ierr + + type(C_FUNPTR) :: cpuTaskFunction_Cptr + type(C_FUNPTR) :: gpuTaskFunction_Cptr + type(C_FUNPTR) :: postTaskFunction_Cptr + + cpuTaskFunction_Cptr = C_FUNLOC(cpuTaskFunction) + gpuTaskFunction_Cptr = C_FUNLOC(gpuTaskFunction) + postTaskFunction_Cptr = C_FUNLOC(postTaskFunction) + + ierr = milhoja_runtime_setup_pipeline_extcpugpusplit_c(cpuTaskFunction_Cptr, & + gpuTaskFunction_Cptr, & + postTaskFunction_Cptr, & + nThreads, & + nTilesPerPacket, & + nTilesPerCpuTurn, & + packetPrototype_Cptr, & + tilePrototype_Cptr, & + postTilePrototype_Cptr) + end subroutine milhoja_runtime_setupPipelineForExtCpuGpuSplitTasks !> Instruct the runtime to tear down the GPU-only thread team pipeline. !! 
@@ -687,6 +810,14 @@ subroutine milhoja_runtime_teardownPipelineForExtGpuTasks(nThreads, nTilesPerPac ierr = milhoja_runtime_teardown_pipeline_extgpu_c(nThreads, nTilesPerPacket) end subroutine milhoja_runtime_teardownPipelineForExtGpuTasks + subroutine milhoja_runtime_teardownPipelineForExtCpuGpuSplitTasks(nThreads, nTilesPerPacket, & + ierr) + integer(MILHOJA_INT), intent(IN) :: nThreads + integer(MILHOJA_INT), intent(IN) :: nTilesPerPacket + integer(MILHOJA_INT), intent(OUT) :: ierr + + ierr = milhoja_runtime_teardown_pipeline_extcpugpusplit_c(nThreads, nTilesPerPacket) + end subroutine milhoja_runtime_teardownPipelineForExtCpuGpuSplitTasks !> Push one tile to the prepared pipeline for task execution. !! @@ -754,6 +885,26 @@ subroutine milhoja_runtime_pushTileToExtGpuPipeline(prototype_Cptr, & nThreads, & tileCInfo_Cp) end subroutine milhoja_runtime_pushTileToExtGpuPipeline + subroutine milhoja_runtime_pushTileToExtCpuGpuSplitPipeline(tilePrototype_Cptr, & + pktPrototype_Cptr, & + postTilePrototype_Cptr, & + nThreads, & + tileCInfo_Cp, & + ierr) + use iso_c_binding, ONLY : C_PTR + + type(C_PTR), intent(IN) :: tilePrototype_Cptr + type(C_PTR), intent(IN) :: pktPrototype_Cptr + type(C_PTR), intent(IN) :: postTilePrototype_Cptr + integer(MILHOJA_INT), intent(IN) :: nThreads + type(C_PTR), intent(IN) :: tileCInfo_Cp + integer(MILHOJA_INT), intent(OUT) :: ierr + ierr = milhoja_runtime_push_pipeline_extcpugpusplit_c(tilePrototype_Cptr, & + pktPrototype_Cptr, & + postTilePrototype_Cptr, & + nThreads, & + tileCInfo_Cp) + end subroutine milhoja_runtime_pushTileToExtCpuGpuSplitPipeline # endif #endif @@ -1010,6 +1161,54 @@ subroutine milhoja_runtime_executeTasks_ExtGpu(taskFunction, & packetPrototype_Cptr, & tilePrototype_Cptr) end subroutine milhoja_runtime_executeTasks_ExtGpu + + + subroutine milhoja_runtime_executeTasks_ExtCpuGpuSplit(tileTaskFunction, & + pktTaskFunction, & + postTaskFunction, & + nDistributorThreads, & + nThreads, & + nTilesPerPacket, & + nTilesPerCpuTurn, & + packetPrototype_Cptr, & + tilePrototype_Cptr, & + postTilePrototype_Cptr, & + ierr) + use iso_c_binding, ONLY : C_FUNPTR, & + C_PTR, & + C_FUNLOC + + procedure(milhoja_runtime_taskFunction) :: tileTaskFunction + procedure(milhoja_runtime_taskFunction) :: pktTaskFunction + procedure(milhoja_runtime_taskFunction) :: postTaskFunction + integer(MILHOJA_INT), intent(IN) :: nDistributorThreads + integer(MILHOJA_INT), intent(IN) :: nThreads + integer(MILHOJA_INT), intent(IN) :: nTilesPerPacket + integer(MILHOJA_INT), intent(IN) :: nTilesPerCpuTurn + type(C_PTR), intent(IN) :: packetPrototype_Cptr + type(C_PTR), intent(IN) :: tilePrototype_Cptr + type(C_PTR), intent(IN) :: postTilePrototype_Cptr + integer(MILHOJA_INT), intent(OUT) :: ierr + + type(C_FUNPTR) :: tileTaskFunction_Cptr + type(C_FUNPTR) :: pktTaskFunction_Cptr + type(C_FUNPTR) :: postTaskFunction_Cptr + + tileTaskFunction_Cptr = C_FUNLOC(tileTaskFunction) + pktTaskFunction_Cptr = C_FUNLOC(pktTaskFunction) + postTaskFunction_Cptr = C_FUNLOC(postTaskFunction) + + ierr = milhoja_runtime_execute_tasks_extcpugpusplit_c(tileTaskFunction_Cptr, & + pktTaskFunction_Cptr, & + postTaskFunction_Cptr, & + nDistributorThreads, & + nThreads, & + nTilesPerPacket, & + nTilesPerCpuTurn, & + packetPrototype_Cptr, & + tilePrototype_Cptr, & + postTilePrototype_Cptr) + end subroutine milhoja_runtime_executeTasks_ExtCpuGpuSplit # endif #endif diff --git a/src/Milhoja_MoverUnpacker.cpp b/src/Milhoja_MoverUnpacker.cpp index 2c350f71..39c0d5dc 100644 --- a/src/Milhoja_MoverUnpacker.cpp +++ 
b/src/Milhoja_MoverUnpacker.cpp
@@ -193,18 +193,17 @@ void MoverUnpacker::handleTransferFinished(void* userData) {
     // Transfer the ownership of the data items in the packet to the next team
     if (dataReceiver) {
-        while (packet->nTiles() > 0) {
-#if(0)
-            std::shared_ptr curTile = std::move(packet->popTile());
-            std::shared_ptr wrappedTile =
-                unpacker->tileProto_->clone( std::move(curTile) );
-            dataReceiver->enqueue( std::move(wrappedTile) );
-#endif
-            dataReceiver->enqueue(
-                unpacker->tileProto_->clone(packet->popTile())
-            );
+        auto receiverPrototype = unpacker->receiverPrototype();
+        if (receiverPrototype) {
+            const TileWrapper* tileWrapperPrototype =
+                dynamic_cast<const TileWrapper*>(receiverPrototype);
+            while (packet->nTiles() > 0) {
+                dataReceiver->enqueue(
+                    tileWrapperPrototype->clone(packet->popTile())
+                );
+            }
+            dataReceiver = nullptr;
+        }
-        }
-        dataReceiver = nullptr;
     }
     packet = nullptr;
@@ -312,15 +311,5 @@ void MoverUnpacker::wait(void) {
     pthread_mutex_unlock(&mutex_);
 }
-void MoverUnpacker::setReceiverProto(TileWrapper const * w) {
-
-    if (state_ != State::Idle) {
-        throw std::logic_error("[MoverUnpacker::setReceiverProto] "
-                               "This setter should only be called in Idle state");
-    }
-    tileProto_ = w;
-
-}
-
 }
diff --git a/src/Milhoja_Runtime.cpp b/src/Milhoja_Runtime.cpp
index d9cf0e71..bb9fc23a 100644
--- a/src/Milhoja_Runtime.cpp
+++ b/src/Milhoja_Runtime.cpp
@@ -1145,7 +1145,7 @@ void Runtime::executeExtendedGpuTasks(const std::string& bundleName,
     gpuTeam->attachThreadReceiver(postGpuTeam);
     gpuTeam->attachDataReceiver(&gpuToHost1_);
     gpuToHost1_.attachDataReceiver(postGpuTeam);
-    gpuToHost1_.setReceiverProto(&tilePrototype);
+    gpuToHost1_.setReceiverPrototype(&tilePrototype);
     unsigned int nTotalThreads = gpuAction.nInitialThreads
                                  + postGpuAction.nInitialThreads
@@ -1252,7 +1252,7 @@ void Runtime::setupPipelineForExtGpuTasks(const std::string& bundleName,
     gpuTeam->attachThreadReceiver(postGpuTeam);
     gpuTeam->attachDataReceiver(&gpuToHost1_);
     gpuToHost1_.attachDataReceiver(postGpuTeam);
-    gpuToHost1_.setReceiverProto(&tilePrototype);
+    gpuToHost1_.setReceiverPrototype(&tilePrototype);
     unsigned int nTotalThreads = gpuAction.nInitialThreads
                                  + postGpuAction.nInitialThreads
@@ -2029,9 +2029,11 @@ void Runtime::executeCpuGpuSplitTasks_timed(const std::string& bundleName,
 void Runtime::executeExtendedCpuGpuSplitTasks(const std::string& bundleName,
                                               const unsigned int nDistributorThreads,
                                               const RuntimeAction& actionA_cpu,
+                                              const TileWrapper& tilePrototype,
                                               const RuntimeAction& actionA_gpu,
-                                              const RuntimeAction& postActionB_cpu,
                                               const DataPacket& packetPrototype,
+                                              const RuntimeAction& postActionB_cpu,
+                                              const TileWrapper& postTilePrototype,
                                               const unsigned int nTilesPerCpuTurn) {
#ifdef USE_THREADED_DISTRIBUTOR
     const unsigned int nDistThreads = nDistributorThreads;
@@ -2097,8 +2099,10 @@ void Runtime::executeExtendedCpuGpuSplitTasks(const std::string& bundleName,
     teamA_cpu->attachThreadReceiver(teamB_cpu);
     teamA_cpu->attachDataReceiver(teamB_cpu);
+    teamA_cpu->setReceiverPrototype(&postTilePrototype);
     teamA_gpu->attachDataReceiver(&gpuToHost1_);
     gpuToHost1_.attachDataReceiver(teamB_cpu);
+    gpuToHost1_.setReceiverPrototype(&postTilePrototype);
     // The action parallel distributor's thread resource is used
     // once the distributor starts to wait
@@ -2148,7 +2152,7 @@ void Runtime::executeExtendedCpuGpuSplitTasks(const std::string& bundleName,
             tileDesc = ti->buildCurrentTile();
             if (isCpuTurn) {
-                teamA_cpu->enqueue( std::move(tileDesc) );
+                teamA_cpu->enqueue( tilePrototype.clone( std::move(tileDesc) ) );
++nInCpuTurn; if (nInCpuTurn >= nTilesPerCpuTurn) { @@ -2195,7 +2199,229 @@ void Runtime::executeExtendedCpuGpuSplitTasks(const std::string& bundleName, Logger::instance().log("[Runtime] End Extended CPU/GPU shared action"); } # endif +# ifndef RUNTIME_MUST_USE_TILEITER +void Runtime::setupPipelineForExtCpuGpuSplitTasks(const std::string& bundleName, + const RuntimeAction& actionA_cpu, + const TileWrapper& tilePrototype, + const RuntimeAction& actionA_gpu, + const DataPacket& packetPrototype, + const RuntimeAction& postActionB_cpu, + const TileWrapper& postTilePrototype, + const unsigned int nTilesPerCpuTurn) { + + const unsigned int nDistThreads = 1; + + Logger::instance().log("[Runtime] Start extended CPU/GPU shared action"); + std::string msg = "[Runtime] " + + std::to_string(nDistThreads) + + " distributor threads"; + Logger::instance().log(msg); + msg = "[Runtime] " + + std::to_string(nTilesPerCpuTurn) + + " tiles sent to CPU for every packet of " + + std::to_string(actionA_gpu.nTilesPerPacket) + + " tiles sent to GPU"; + Logger::instance().log(msg); + + if (nDistThreads <= 0) { + throw std::invalid_argument("[Runtime::setupPipelineForExtCpuGpuSplitTasks] " + "nDistributorThreads must be positive"); + } else if (actionA_cpu.teamType != ThreadTeamDataType::BLOCK) { + throw std::logic_error("[Runtime::setupPipelineForExtCpuGpuSplitTasks] " + "Given CPU action A should run on tiles, " + "which is not in configuration"); + } else if (actionA_cpu.nTilesPerPacket != 0) { + throw std::invalid_argument("[Runtime::setupPipelineForExtCpuGpuSplitTasks] " + "CPU A tiles/packet should be zero since it is tile-based"); + } else if (actionA_gpu.teamType != ThreadTeamDataType::SET_OF_BLOCKS) { + throw std::logic_error("[Runtime::setupPipelineForExtCpuGpuSplitTasks] " + "Given GPU action should run on packet of blocks, " + "which is not in configuration"); + } else if (actionA_gpu.nTilesPerPacket <= 0) { + throw std::invalid_argument("[Runtime::setupPipelineForExtCpuGpuSplitTasks] " + "Need at least one tile per GPU packet"); + } else if (postActionB_cpu.teamType != actionA_cpu.teamType) { + throw std::logic_error("[Runtime::setupPipelineForExtCpuGpuSplitTasks] " + "Given post action data type must match that " + "of CPU action A"); + } else if (postActionB_cpu.nTilesPerPacket != actionA_cpu.nTilesPerPacket) { + throw std::invalid_argument("[Runtime::setupPipelineForExtCpuGpuSplitTasks] " + "Given post action tiles/packet must match that " + "of CPU action A"); + } else if (nTeams_ < 3) { + throw std::logic_error("[Runtime::setupPipelineForExtCpuGpuSplitTasks] " + "Need at least three ThreadTeams in runtime"); + } + nTilesPerPacket_ = actionA_gpu.nTilesPerPacket; + nTilesPerCpuTurn_ = nTilesPerCpuTurn; + isCpuTurn_ = true; + nInCpuTurn_ = 0; + + //***** ASSEMBLE THREAD TEAM CONFIGURATION + // CPU/GPU action parallel pipeline + // 1) Action Parallel Distributor will send one fraction of data items + // to CPU for computation and each of these is enqueued directly with the post + // action thread team. + // 2) For the remaining data items, + // a) Asynchronous transfer of Packets of Blocks to GPU by distributor, + // b) GPU action applied to blocks in packet by GPU team + // c) Mover/Unpacker transfers packet back to CPU, + // copies results to Grid data structures, + // and enqueues with post action thread team. 
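+    //
+    //    For illustration only (hypothetical values): with nTilesPerCpuTurn = 2 and
+    //    actionA_gpu.nTilesPerPacket = 4, tiles 1-2 go to the CPU team, tiles 3-6 are
+    //    staged into one GPU packet, tiles 7-8 return to the CPU, and so on.  Every
+    //    tile, whichever path it takes, is ultimately enqueued with the post action
+    //    team wrapped via postTilePrototype.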
+    ThreadTeam*   teamA_cpu = teams_[0];
+    ThreadTeam*   teamA_gpu = teams_[1];
+    ThreadTeam*   teamB_cpu = teams_[2];
+
+    teamA_cpu->attachThreadReceiver(teamB_cpu);
+    teamA_cpu->attachDataReceiver(teamB_cpu);
+    teamA_cpu->setReceiverPrototype(&postTilePrototype);
+    teamA_gpu->attachDataReceiver(&gpuToHost1_);
+    gpuToHost1_.attachDataReceiver(teamB_cpu);
+    gpuToHost1_.setReceiverPrototype(&postTilePrototype);
+
+    // The action parallel distributor's thread resource is used
+    // once the distributor starts to wait
+    unsigned int nTotalThreads =   actionA_cpu.nInitialThreads
+                                 + nDistThreads;
+    if (nTotalThreads > teamA_cpu->nMaximumThreads()) {
+        throw std::logic_error("[Runtime::setupPipelineForExtCpuGpuSplitTasks] "
+                               "CPU team could receive too many thread "
+                               "activation calls");
+    }
+    nTotalThreads =   actionA_cpu.nInitialThreads
+                    + postActionB_cpu.nInitialThreads
+                    + nDistThreads;
+    if (nTotalThreads > teamB_cpu->nMaximumThreads()) {
+        throw std::logic_error("[Runtime::setupPipelineForExtCpuGpuSplitTasks] "
+                               "Post could receive too many thread "
+                               "activation calls");
+    }
+
+    //***** START EXECUTION CYCLE
+    teamA_cpu->startCycle(actionA_cpu, "ActionSharing_CPU_Block_Team");
+    teamA_gpu->startCycle(actionA_gpu, "ActionSharing_GPU_Packet_Team");
+    teamB_cpu->startCycle(postActionB_cpu, "PostAction_CPU_Block_Team");
+    gpuToHost1_.startCycle();
+
+    packet_gpu_ = packetPrototype.clone();
+
+    Logger::instance().log("[Runtime] End setting up extended CPU/GPU shared action");
+}
+
+void Runtime::pushTileToExtCpuGpuSplitPipeline(const std::string& bundleName,
+                                               const TileWrapper& tilePrototype,
+                                               const DataPacket& packetPrototype,
+                                               const TileWrapper& postTilePrototype,
+                                               const FlashxrTileRawPtrs& tP,
+                                               const FlashxTileRawInts& tI,
+                                               const FlashxTileRawReals& tR) {
+#ifdef RUNTIME_PERTILE_LOG
+    Logger::instance().log("[Runtime] Push single tile task to EXT CPU/GPU split pipeline");
+#endif
+    if (nTilesPerPacket_ <= 0) {
+        throw std::invalid_argument("[Runtime:pushTileToExtCpuGpuSplitPipeline] "
+                                    "Need at least one block per packet");
+    } else if (nTeams_ < 3) {
+        throw std::logic_error("[Runtime:pushTileToExtCpuGpuSplitPipeline] "
+                               "Need at least three ThreadTeams in runtime");
+    }
+
+    ThreadTeam*   teamA_cpu = teams_[0];
+    ThreadTeam*   teamA_gpu = teams_[1];
+    ThreadTeam*   teamB_cpu = teams_[2];
+
+    RuntimeBackend& backend = RuntimeBackend::instance();
+    std::shared_ptr<Tile> tileDesc{};
+    {
+
+        tileDesc = static_cast<std::shared_ptr<Tile>>(std::make_unique<TileFlashxr>(tP, tI, tR));
+        if (isCpuTurn_) {
+            teamA_cpu->enqueue( tilePrototype.clone( std::move(tileDesc) ) );
+            if ((tileDesc != nullptr) || (tileDesc.use_count() != 0)) {
+                throw std::logic_error("[Runtime::pushTileToExtCpuGpuSplitPipeline] tileDesc ownership not transferred");
+            }
+
+            ++nInCpuTurn_;
+            if (nInCpuTurn_ >= nTilesPerCpuTurn_) {
+                isCpuTurn_  = false;
+                nInCpuTurn_ = 0;
+            }
+        } else {
+            packet_gpu_->addTile( std::move(tileDesc) );
+            if ((tileDesc != nullptr) || (tileDesc.use_count() != 0)) {
+                throw std::logic_error("[Runtime::pushTileToExtCpuGpuSplitPipeline] tileDesc ownership not transferred");
+            }
+
+            if (packet_gpu_->nTiles() >= nTilesPerPacket_) {
+                packet_gpu_->pack();
+#ifdef RUNTIME_PERTILE_LOG
+                Logger::instance().log("[Runtime] Shipping off packet with "
+                                       + std::to_string(packet_gpu_->nTiles())
+                                       + " tiles...");
+#endif
+                backend.initiateHostToGpuTransfer(*(packet_gpu_.get()));
+                teamA_gpu->enqueue( std::move(packet_gpu_) );
+
+                packet_gpu_ = packetPrototype.clone();
+                isCpuTurn_ = true;
+            }
+        }
+    }
+#ifdef RUNTIME_PERTILE_LOG
+    Logger::instance().log("[Runtime] Single tile task was pushed to EXT CPU/GPU split pipeline");
+#endif
+}
+
+void Runtime::teardownPipelineForExtCpuGpuSplitTasks(const std::string& bundleName) {
+
+    Logger::instance().log("[Runtime] Tear Down extended CPU/GPU shared action");
+
+    if (nTilesPerPacket_ <= 0) {
+        throw std::invalid_argument("[Runtime:teardownPipelineForExtCpuGpuSplitTasks] "
+                                    "Need at least one block per packet");
+    } else if (nTeams_ < 3) {
+        throw std::logic_error("[Runtime:teardownPipelineForExtCpuGpuSplitTasks] "
+                               "Need at least three ThreadTeams in runtime");
+    }
+    ThreadTeam*   teamA_cpu = teams_[0];
+    ThreadTeam*   teamA_gpu = teams_[1];
+    ThreadTeam*   teamB_cpu = teams_[2];
+
+    RuntimeBackend& backend = RuntimeBackend::instance();
+    {
+        if (packet_gpu_->nTiles() > 0) {
+            packet_gpu_->pack();
+#ifdef RUNTIME_PERTILE_LOG
+            Logger::instance().log("[Runtime] Shipping off packet with "
+                                   + std::to_string(packet_gpu_->nTiles())
+                                   + " final tiles...");
+#endif
+            backend.initiateHostToGpuTransfer(*(packet_gpu_.get()));
+            teamA_gpu->enqueue( std::move(packet_gpu_) );
+        } else {
+            packet_gpu_.reset();
+        }
+
+        teamA_cpu->increaseThreadCount(1);
+    } // implied barrier
+
+    teamA_gpu->closeQueue(nullptr);
+    teamA_cpu->closeQueue(nullptr);
+
+    // All data flowing through the Action B/Post-A team
+    teamB_cpu->wait();
+
+    //***** BREAK APART THREAD TEAM CONFIGURATION
+    teamA_cpu->detachThreadReceiver();
+    teamA_cpu->detachDataReceiver();
+    teamA_gpu->detachDataReceiver();
+    gpuToHost1_.detachDataReceiver();
+
+    Logger::instance().log("[Runtime:teardownPipelineForExtCpuGpuSplitTasks] End extended CPU/GPU shared action");
+
+}
+# endif // ifndef RUNTIME_MUST_USE_TILEITER
+#endif // ifdef RUNTIME_SUPPORT_DATAPACKETS
 /**
  *
diff --git a/src/Milhoja_RuntimeElement.cpp b/src/Milhoja_RuntimeElement.cpp
index 9cc0980d..7d768131 100644
--- a/src/Milhoja_RuntimeElement.cpp
+++ b/src/Milhoja_RuntimeElement.cpp
@@ -7,6 +7,7 @@ using namespace milhoja;
 RuntimeElement::RuntimeElement(void)
     : threadReceiver_{nullptr},
       dataReceiver_{nullptr},
+      receiverPrototype_{nullptr},
       calledCloseQueue_{}
 { }
@@ -17,6 +18,9 @@ RuntimeElement::~RuntimeElement(void) {
     if (dataReceiver_) {
         std::cerr << "[RuntimeElement::~RuntimeElement] Data Subscriber still attached\n";
     }
+    if (receiverPrototype_) {
+        std::cerr << "[RuntimeElement::~RuntimeElement] Receiver Prototype still set\n";
+    }
     if (!calledCloseQueue_.empty()) {
         std::cerr << "[RuntimeElement::~RuntimeElement] Data publishers still attached\n";
         // FIXME: Does this help prevent valgrind from finding potential pointer
@@ -105,7 +109,10 @@ std::string RuntimeElement::detachDataReceiver(void) {
     }
     dataReceiver_ = nullptr;
-
+
+    // If a receiver prototype was set, release it as well
+    receiverPrototype_ = nullptr;
+
     return "";
 }
@@ -155,3 +162,25 @@ std::string RuntimeElement::detachDataPublisher(const RuntimeElement* publisher)
     return "";
 }
+/**
+ * Set the data receiver's prototype for later use when a DataItem is passed
+ * on to the data receiver, so that the item can be re-wrapped using the proper
+ * constructor.  At present the prototype is only needed when TileWrapper items
+ * are forwarded; setting it for a receiver that consumes DataPackets has no effect.
+ * The receiverPrototype_ is reset to nullptr when
+ * RuntimeElement::detachDataReceiver is called.
+ *
+ * \param prototype - A prototype of a DataItem to be passed to the DataReceiver.
+ * \return An empty string on success; otherwise an error message.
+ */
+std::string RuntimeElement::setReceiverPrototype(const DataItem* prototype) {
+
+    if (!prototype) {
+        return "Null receiver prototype is given";
+    } else if (receiverPrototype_) {
+        return "A receiver prototype is already given";
+    }
+
+    receiverPrototype_ = prototype;
+
+    return "";
+}
+
diff --git a/src/Milhoja_ThreadTeam.cpp b/src/Milhoja_ThreadTeam.cpp
index 8673fc5d..5f4a5e59 100644
--- a/src/Milhoja_ThreadTeam.cpp
+++ b/src/Milhoja_ThreadTeam.cpp
@@ -4,6 +4,7 @@
 #include "Milhoja_ThreadTeam.h"
+#include <memory>
 #include
 #include
 #include
@@ -15,6 +16,7 @@
 #include "Milhoja_ThreadTeamRunningOpen.h"
 #include "Milhoja_ThreadTeamRunningClosed.h"
 #include "Milhoja_ThreadTeamRunningNoMoreWork.h"
+#include "Milhoja_TileWrapper.h"
 namespace milhoja {
@@ -907,6 +909,44 @@ std::string ThreadTeam::attachDataReceiver(RuntimeElement* receiver) {
     return "";
 }
+
+/**
+ * Set the data receiver's prototype for this team.  The team must be in the
+ * Idle mode when this is called.
+ */
+std::string ThreadTeam::setReceiverPrototype(const DataItem* prototype) {
+    pthread_mutex_lock(&teamMutex_);
+
+    std::string errMsg("");
+    if (!state_) {
+        errMsg = printState_NotThreadsafe("setReceiverPrototype", 0,
+                                          "state_ is NULL");
+        pthread_mutex_unlock(&teamMutex_);
+        throw std::runtime_error(errMsg);
+    }
+    std::string msg = state_->isStateValid_NotThreadSafe();
+    if (msg != "") {
+        errMsg = printState_NotThreadsafe("setReceiverPrototype", 0, msg);
+        pthread_mutex_unlock(&teamMutex_);
+        throw std::runtime_error(errMsg);
+    } else if (state_->mode() != ThreadTeamMode::IDLE) {
+        errMsg = printState_NotThreadsafe("setReceiverPrototype", 0,
+                                          "A receiver prototype may only be set in the Idle mode");
+        pthread_mutex_unlock(&teamMutex_);
+        throw std::logic_error(errMsg);
+    }
+
+    errMsg = RuntimeElement::setReceiverPrototype(prototype);
+    if (errMsg != "") {
+        errMsg = printState_NotThreadsafe("setReceiverPrototype", 0, errMsg);
+        pthread_mutex_unlock(&teamMutex_);
+        throw std::logic_error(errMsg);
+    }
+
+    pthread_mutex_unlock(&teamMutex_);
+
+    return "";
+}
+
 /**
  * Detach the data subscriber so that the calling object is no longer a data
  * publisher.
@@ -1473,7 +1513,29 @@ void* ThreadTeam::threadRoutine(void* varg) {
                 if (team->dataReceiver_) {
                     // Move the data item along so that dataItem is null
-                    team->dataReceiver_->enqueue(std::move(dataItem));
+                    if (auto tileWrapper = std::dynamic_pointer_cast<TileWrapper>(dataItem)) {
+                        if (auto tileWrapperPrototype =
+                                dynamic_cast<const TileWrapper*>(team->receiverPrototype_)) {
+                            // NOTE: this is the case where dataItem is a TileWrapper
+                            // and team->receiverPrototype_ is another TileWrapper.
+                            // The underlying Tile must be re-wrapped with the data
+                            // receiver's prototype, which may differ from this team's.
+                            // TODO: very dirty ownership transfers
+                            std::unique_ptr<TileWrapper> clonedTileWrapper =
+                                tileWrapperPrototype->clone(std::move(tileWrapper->tile_));
+                            // Release ownership, assuming clonedTileWrapper has new ownership
+                            dataItem.reset();
+                            team->dataReceiver_->enqueue(std::move(clonedTileWrapper));
+                        }
+                        else {
+                            // Receiver prototype is not a TileWrapper; enqueue as usual
+                            team->dataReceiver_->enqueue(std::move(dataItem));
+                        }
+                    }
+                    else {
+                        // The data item is not a TileWrapper (e.g., the data receiver
+                        // is a mover/unpacker); enqueue as usual
+                        team->dataReceiver_->enqueue(std::move(dataItem));
+                    }
                 } else {
                     // The data item is done.
Null dataItem so that the current // data item's resources can be released if this was the last diff --git a/tools/milhoja_pypkg/src/milhoja/TaskFunctionGenerator_OpenACC_F.py b/tools/milhoja_pypkg/src/milhoja/TaskFunctionGenerator_OpenACC_F.py index f436d2ec..40a3d092 100644 --- a/tools/milhoja_pypkg/src/milhoja/TaskFunctionGenerator_OpenACC_F.py +++ b/tools/milhoja_pypkg/src/milhoja/TaskFunctionGenerator_OpenACC_F.py @@ -147,7 +147,9 @@ def generate_source_code(self, destination, overwrite): assert interface.endswith(".F90") interface = interface.rstrip(".F90") fptr.write(f"{INDENT*2}use {interface}, ONLY : {subroutine}\n") - offloading.append(f"{INDENT*2}!$acc routine ({subroutine}) vector\n") + offloading.append( + f"{INDENT*2}!$acc routine ({self._get_wrapper_name(subroutine)}) vector\n" + ) fptr.writelines(["\n", *offloading, "\n"]) # No implicit variables fptr.write(f"{INDENT*2}implicit none\n\n") @@ -243,6 +245,7 @@ def generate_source_code(self, destination, overwrite): # Data packet sent on dataQ_h current_queues = ["dataQ_h"] + subroutine_wrappers = {} for node in self._tf_spec.internal_subroutine_graph: # Insert waits if needed before next round of kernel launches extras = [f"queue{i}_h" for i in range(2, len(node) + 1)] @@ -286,33 +289,25 @@ def generate_source_code(self, destination, overwrite): current_queues = next_queues.copy() assert len(current_queues) == len(node) for subroutine, queue in zip(node, current_queues): + # subroutine wrapper + # to prevent passing a slice of array + # which may introduce unnecessary device to host maps + wrapper_name, wrapper_lines = self._generate_subroutine_wrapper(INDENT, subroutine) + subroutine_wrappers[wrapper_name] = wrapper_lines + fptr.write(f"{INDENT*2}!$acc parallel loop gang default(none) &\n") fptr.write(f"{INDENT*2}!$acc& async({queue})\n") fptr.write(f"{INDENT*2}do n = 1, nTiles_d\n") - fptr.write(f"{INDENT*3}CALL {subroutine}( &\n") + fptr.write(f"{INDENT*3}CALL {wrapper_name}( &\n") actual_args = \ self._tf_spec.subroutine_actual_arguments(subroutine) - arg_list = [] + arg_list = [f"{INDENT*5}n"] for argument in actual_args: spec = self._tf_spec.argument_specification(argument) - extents = "" offs = "" - if spec["source"] in points: - extents = "(:, n)" - elif spec["source"] == TILE_DELTAS_ARGUMENT: - extents = "(:, n)" - elif spec["source"] == TILE_LEVEL_ARGUMENT: - extents = "(1, n)" + if spec["source"] == TILE_LEVEL_ARGUMENT: offs = " + 1" - elif spec["source"] in bounds: - extents = "(:, :, n)" - elif spec["source"] == GRID_DATA_ARGUMENT: - extents = "(:, :, :, :, n)" - elif spec["source"] == SCRATCH_ARGUMENT: - dimension = len(parse_extents(spec["extents"])) - tmp = [":" for _ in range(dimension)] - extents = "(" + ", ".join(tmp) + ", n)" - arg_list.append(f"{INDENT*5}{argument}_d{extents}{offs}") + arg_list.append(f"{INDENT*5}{argument}_d{offs}") fptr.write(", &\n".join(arg_list) + " &\n") fptr.write(f"{INDENT*5})\n") fptr.write(f"{INDENT*2}end do\n") @@ -342,5 +337,156 @@ def generate_source_code(self, destination, overwrite): # End subroutine declaration fptr.write(f"{INDENT}end subroutine {self._tf_spec.function_name}\n") fptr.write("\n") + + # Write subroutine wrappers + for wrapper, lines in subroutine_wrappers.items(): + for line in lines: + fptr.write(line + "\n") + # End module declaration fptr.write(f"end module {module}\n\n") + + def _get_wrapper_name(self, subroutine): + """ + A helper function to determine the name of subroutine wrapper, consisntently + """ + return "wrapper_" + subroutine + + def 
_generate_subroutine_wrapper(self, indent, subroutine): + """ + A helper function to generate a subroutine wrapper + """ + subroutine_wrapper = self._get_wrapper_name(subroutine) + lines = [] + + actual_args = self._tf_spec.subroutine_actual_arguments(subroutine) + dummy_args = ["nblk"] + [f"{arg}_d" for arg in actual_args] + + lines.append(f"{indent*1}subroutine {subroutine_wrapper} ( &") + dummy_arg_str = f"{indent*5}" + f", &\n{indent*5}".join(dummy_args) + f" &\n{indent*3})\n" + dummy_arg_str = "()\n" if len(dummy_args) == 0 else dummy_arg_str + lines.append(dummy_arg_str) + + interface = self._tf_spec.subroutine_interface_file(subroutine).strip() + interface = interface.rstrip(".F90") + lines.append(f"{indent*2}use {interface}, ONLY: {subroutine}") + lines.append("") + + lines.append(f"{indent*2}!$acc routine vector") + lines.append(f"{indent*2}!$acc routine ({subroutine}) vector") + lines.append("") + + lines.append(f"{indent*2}implicit none") + lines.append("") + + lines.append(f"{indent*2}! Arguments") + lines.append(f"{indent*2}integer, intent(IN) :: nblk") + + points = { + TILE_LO_ARGUMENT, TILE_HI_ARGUMENT, TILE_LBOUND_ARGUMENT, + TILE_UBOUND_ARGUMENT, LBOUND_ARGUMENT + } + bounds = {TILE_INTERIOR_ARGUMENT, TILE_ARRAY_BOUNDS_ARGUMENT} + pointer_extents = {} + pointer_types = {} + for arg in actual_args: + spec = self._tf_spec.argument_specification(arg) + src = spec["source"] + if src == EXTERNAL_ARGUMENT: + extents = spec["extents"] + if extents != "()": + msg = "No test case for non-scalar externals" + raise NotImplementedError(msg) + + # is this okay? Should we fail if there is no type mapping? + arg_type = C2F_TYPE_MAPPING.get(spec["type"], spec["type"]) + pointer_extents[arg] = 0 + pointer_types[arg] = arg_type + lines.append(f"{indent*2}{arg_type}, target, intent(IN) :: {arg}_d") + + elif src in points: + pointer_extents[arg] = 1 + pointer_types[arg] = "integer" + lines.append(f"{indent*2}integer, target, intent(IN) :: {arg}_d(:, :)") + + elif src == TILE_DELTAS_ARGUMENT: + pointer_extents[arg] = 1 + pointer_types[arg] = "real" + lines.append(f"{indent*2}real, target, intent(IN) :: {arg}_d(:, :)") + + elif src in bounds: + pointer_extents[arg] = 2 + pointer_types[arg] = "integer" + lines.append(f"{indent*2}integer, target, intent(IN) :: {arg}_d(:, :, :)") + + elif src == TILE_LEVEL_ARGUMENT: + pointer_extents[arg] = 1 + pointer_types[arg] = "integer" + lines.append(f"{indent*2}integer, target, intent(IN) :: {arg}_d(:, :)") + + elif src == GRID_DATA_ARGUMENT: + if arg in self._tf_spec.tile_in_arguments: + intent = "IN" + elif arg in self._tf_spec.tile_in_out_arguments: + intent = "INOUT" + elif arg in self._tf_spec.tile_out_arguments: + intent = "OUT" + else: + raise LogicError("Unknown grid data variable class") + + pointer_extents[arg] = 4 + pointer_types[arg] = "real" + lines.append(f"{indent*2}real, target, intent({intent}) :: {arg}_d(:, :, :, :, :)") + + elif src == SCRATCH_ARGUMENT: + arg_type = spec["type"] + dimension = len(parse_extents(spec["extents"])) + assert dimension > 0 + tmp = [":" for _ in range(dimension + 1)] + array = "(" + ", ".join(tmp) + ")" + pointer_extents[arg] = len(tmp) - 1 + pointer_types[arg] = arg_type + lines.append(f"{indent*2}{arg_type}, target, intent(INOUT) :: {arg}_d{array}") + + else: + raise LogicError(f"{arg} of unknown argument class") + lines.append("") + + lines.append(f"{indent*2}! 
Local variables") + pointer_mapping = {} + for arg in actual_args: + spec = self._tf_spec.argument_specification(arg) + arg_p = f"{arg}_d_p" + + ptr_type = pointer_types[arg] + ptr_extents = pointer_extents[arg] + + if ptr_extents > 0: + pointer_mapping[arg] = arg_p + _ext_str = ", ".join([":"] * ptr_extents) + _line = f"{indent*2}{ptr_type}, pointer :: {arg_p}({_ext_str})" + lines.append(_line) + lines.append("") + + lines.append(f"{indent*2}! Attach pointers") + for arg, ptr in pointer_mapping.items(): + ptr_extents = pointer_extents[arg] + _ext_str = ", ".join([":"] * ptr_extents) + ", nblk" + _line = f"{indent*2}{ptr} => {arg}_d({_ext_str})" + lines.append(_line) + lines.append("") + + lines.append(f"{indent*2}! Call subroutine") + lines.append(f"{indent*2}CALL {subroutine}( &") + arg_list = [] + for arg in actual_args: + _arg = pointer_mapping[arg] if arg in pointer_mapping else f"{arg}_d" + arg_list.append(_arg) + lines.append(f"{indent*5}" + f", &\n{indent*5}".join(arg_list) + " &") + lines.append(f"{indent*4})") + + lines.append("") + lines.append(f"{indent*1}end subroutine {subroutine_wrapper}") + lines.append("") + + return subroutine_wrapper, lines diff --git a/tools/milhoja_pypkg/src/milhoja/tests/TestCodeGenerators.py b/tools/milhoja_pypkg/src/milhoja/tests/TestCodeGenerators.py index a4698268..55a3a5ed 100644 --- a/tools/milhoja_pypkg/src/milhoja/tests/TestCodeGenerators.py +++ b/tools/milhoja_pypkg/src/milhoja/tests/TestCodeGenerators.py @@ -74,7 +74,7 @@ def run_tests(self, tests_all, dims_all, create_generator): ref = self.__load_code(ref_hdr_fname) generated = self.__load_code(header_filename) - self.assertEqual(len(ref), len(generated)) + self.assertEqual(len(ref), len(generated), f"generated != {ref_hdr_fname}") for gen_line, ref_line in zip(generated, ref): self.assertEqual(gen_line, ref_line, f"generated != {ref_hdr_fname}") @@ -95,7 +95,7 @@ def run_tests(self, tests_all, dims_all, create_generator): ref = self.__load_code(ref_src_fname) generated = self.__load_code(source_filename) - self.assertEqual(len(ref), len(generated)) + self.assertEqual(len(ref), len(generated), f"generated != {ref_src_fname}") for gen_line, ref_line in zip(generated, ref): self.assertEqual(gen_line, ref_line, f"generated != {ref_src_fname}") diff --git a/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydroFC_2D.F90 b/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydroFC_2D.F90 index c3149757..18d36260 100644 --- a/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydroFC_2D.F90 +++ b/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydroFC_2D.F90 @@ -40,10 +40,10 @@ subroutine gpu_tf_hydroFC_Fortran( & use dr_cg_hydroAdvance_mod, ONLY : Hydro_computeFluxesHll_Y_gpu_oacc use dr_cg_hydroAdvance_mod, ONLY : Hydro_updateSolutionHll_gpu_oacc - !$acc routine (Hydro_computeSoundSpeedHll_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector - !$acc routine (Hydro_updateSolutionHll_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeSoundSpeedHll_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_X_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_Y_gpu_oacc) vector + !$acc routine (wrapper_Hydro_updateSolutionHll_gpu_oacc) vector implicit none @@ -79,56 +79,60 @@ subroutine gpu_tf_hydroFC_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeSoundSpeedHll_gpu_oacc( & - 
tile_lo_d(:, n), & - tile_hi_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n) & - ) + CALL wrapper_Hydro_computeSoundSpeedHll_gpu_oacc( & + n, & + tile_lo_d, & + tile_hi_d, & + CC_1_d, & + hydro_op1_auxc_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_X_gpu_oacc( & - dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - FLX_1_d(:, :, :, :, n) & - ) + CALL wrapper_Hydro_computeFluxesHll_X_gpu_oacc( & + n, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + FLX_1_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_Y_gpu_oacc( & - dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - FLY_1_d(:, :, :, :, n) & - ) + CALL wrapper_Hydro_computeFluxesHll_Y_gpu_oacc( & + n, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + FLY_1_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_updateSolutionHll_gpu_oacc( & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - FLX_1_d(:, :, :, :, n), & - FLY_1_d(:, :, :, :, n), & - FLZ_1_d(:, :, :, :, n), & - CC_1_d(:, :, :, :, n) & - ) + CALL wrapper_Hydro_updateSolutionHll_gpu_oacc( & + n, & + tile_lo_d, & + tile_hi_d, & + FLX_1_d, & + FLY_1_d, & + FLZ_1_d, & + CC_1_d & + ) end do !$acc end parallel loop @@ -139,4 +143,216 @@ subroutine gpu_tf_hydroFC_Fortran( & !$acc end data end subroutine gpu_tf_hydroFC_Fortran + subroutine wrapper_Hydro_computeSoundSpeedHll_gpu_oacc ( & + nblk, & + tile_lo_d, & + tile_hi_d, & + CC_1_d, & + hydro_op1_auxc_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeSoundSpeedHll_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeSoundSpeedHll_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeSoundSpeedHll_gpu_oacc( & + tile_lo_d_p, & + tile_hi_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p & + ) + + end subroutine wrapper_Hydro_computeSoundSpeedHll_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + FLX_1_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_X_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector + + implicit none + + ! 
Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(OUT) :: FLX_1_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: FLX_1_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + FLX_1_d_p => FLX_1_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_X_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p, & + FLX_1_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + FLY_1_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_Y_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(OUT) :: FLY_1_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: FLY_1_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + FLY_1_d_p => FLY_1_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_Y_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p, & + FLY_1_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc + + subroutine wrapper_Hydro_updateSolutionHll_gpu_oacc ( & + nblk, & + tile_lo_d, & + tile_hi_d, & + FLX_1_d, & + FLY_1_d, & + FLZ_1_d, & + CC_1_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_updateSolutionHll_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_updateSolutionHll_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(OUT) :: FLX_1_d(:, :, :, :, :) + real, target, intent(OUT) :: FLY_1_d(:, :, :, :, :) + real, target, intent(OUT) :: FLZ_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + + ! 
Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: FLX_1_d_p(:, :, :, :) + real, pointer :: FLY_1_d_p(:, :, :, :) + real, pointer :: FLZ_1_d_p(:, :, :, :) + real, pointer :: CC_1_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + FLX_1_d_p => FLX_1_d(:, :, :, :, nblk) + FLY_1_d_p => FLY_1_d(:, :, :, :, nblk) + FLZ_1_d_p => FLZ_1_d(:, :, :, :, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_updateSolutionHll_gpu_oacc( & + tile_lo_d_p, & + tile_hi_d_p, & + FLX_1_d_p, & + FLY_1_d_p, & + FLZ_1_d_p, & + CC_1_d_p & + ) + + end subroutine wrapper_Hydro_updateSolutionHll_gpu_oacc + end module gpu_tf_hydroFC_mod diff --git a/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydro_2D.F90 b/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydro_2D.F90 index 73fd8e1c..39ee372d 100644 --- a/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydro_2D.F90 +++ b/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydro_2D.F90 @@ -40,10 +40,10 @@ subroutine gpu_tf_hydro_Fortran( & use dr_cg_hydroAdvance_mod, ONLY : Hydro_computeFluxesHll_Y_gpu_oacc use dr_cg_hydroAdvance_mod, ONLY : Hydro_updateSolutionHll_gpu_oacc - !$acc routine (Hydro_computeSoundSpeedHll_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector - !$acc routine (Hydro_updateSolutionHll_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeSoundSpeedHll_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_X_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_Y_gpu_oacc) vector + !$acc routine (wrapper_Hydro_updateSolutionHll_gpu_oacc) vector implicit none @@ -79,56 +79,60 @@ subroutine gpu_tf_hydro_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeSoundSpeedHll_gpu_oacc( & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - U_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n) & - ) + CALL wrapper_Hydro_computeSoundSpeedHll_gpu_oacc( & + n, & + tile_lo_d, & + tile_hi_d, & + U_d, & + hydro_op1_auxc_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_X_gpu_oacc( & - dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - U_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - hydro_op1_flX_d(:, :, :, :, n) & - ) + CALL wrapper_Hydro_computeFluxesHll_X_gpu_oacc( & + n, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + U_d, & + hydro_op1_auxc_d, & + hydro_op1_flX_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_Y_gpu_oacc( & + CALL wrapper_Hydro_computeFluxesHll_Y_gpu_oacc( & + n, & dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - U_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - hydro_op1_flY_d(:, :, :, :, n) & - ) + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + U_d, & + hydro_op1_auxc_d, & + hydro_op1_flY_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_updateSolutionHll_gpu_oacc( & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - hydro_op1_flX_d(:, :, :, :, n), & - hydro_op1_flY_d(:, :, :, :, n), & - hydro_op1_flZ_d(:, :, :, :, n), & - U_d(:, :, 
:, :, n) & - ) + CALL wrapper_Hydro_updateSolutionHll_gpu_oacc( & + n, & + tile_lo_d, & + tile_hi_d, & + hydro_op1_flX_d, & + hydro_op1_flY_d, & + hydro_op1_flZ_d, & + U_d & + ) end do !$acc end parallel loop @@ -139,5 +143,217 @@ subroutine gpu_tf_hydro_Fortran( & !$acc end data end subroutine gpu_tf_hydro_Fortran + subroutine wrapper_Hydro_computeSoundSpeedHll_gpu_oacc ( & + nblk, & + tile_lo_d, & + tile_hi_d, & + U_d, & + hydro_op1_auxc_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeSoundSpeedHll_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeSoundSpeedHll_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(INOUT) :: U_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: U_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + U_d_p => U_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeSoundSpeedHll_gpu_oacc( & + tile_lo_d_p, & + tile_hi_d_p, & + U_d_p, & + hydro_op1_auxc_d_p & + ) + + end subroutine wrapper_Hydro_computeSoundSpeedHll_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + U_d, & + hydro_op1_auxc_d, & + hydro_op1_flX_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_X_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: U_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flX_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: U_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: hydro_op1_flX_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + U_d_p => U_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + hydro_op1_flX_d_p => hydro_op1_flX_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_X_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + U_d_p, & + hydro_op1_auxc_d_p, & + hydro_op1_flX_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + U_d, & + hydro_op1_auxc_d, & + hydro_op1_flY_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_Y_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector + + implicit none + + ! 
Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: U_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flY_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: U_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: hydro_op1_flY_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + U_d_p => U_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + hydro_op1_flY_d_p => hydro_op1_flY_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_Y_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + U_d_p, & + hydro_op1_auxc_d_p, & + hydro_op1_flY_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc + + subroutine wrapper_Hydro_updateSolutionHll_gpu_oacc ( & + nblk, & + tile_lo_d, & + tile_hi_d, & + hydro_op1_flX_d, & + hydro_op1_flY_d, & + hydro_op1_flZ_d, & + U_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_updateSolutionHll_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_updateSolutionHll_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(INOUT) :: hydro_op1_flX_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flY_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flZ_d(:, :, :, :, :) + real, target, intent(INOUT) :: U_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: hydro_op1_flX_d_p(:, :, :, :) + real, pointer :: hydro_op1_flY_d_p(:, :, :, :) + real, pointer :: hydro_op1_flZ_d_p(:, :, :, :) + real, pointer :: U_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + hydro_op1_flX_d_p => hydro_op1_flX_d(:, :, :, :, nblk) + hydro_op1_flY_d_p => hydro_op1_flY_d(:, :, :, :, nblk) + hydro_op1_flZ_d_p => hydro_op1_flZ_d(:, :, :, :, nblk) + U_d_p => U_d(:, :, :, :, nblk) + + ! 
Call subroutine + CALL Hydro_updateSolutionHll_gpu_oacc( & + tile_lo_d_p, & + tile_hi_d_p, & + hydro_op1_flX_d_p, & + hydro_op1_flY_d_p, & + hydro_op1_flZ_d_p, & + U_d_p & + ) + + end subroutine wrapper_Hydro_updateSolutionHll_gpu_oacc + end module gpu_tf_hydro_mod diff --git a/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydro_3D.F90 b/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydro_3D.F90 index 019853b9..ffc3d7b9 100644 --- a/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydro_3D.F90 +++ b/tools/milhoja_pypkg/src/milhoja/tests/data/FlashX/REF_gpu_tf_hydro_3D.F90 @@ -20,17 +20,17 @@ end subroutine gpu_tf_hydro_Cpp2C contains - subroutine gpu_tf_hydro_Fortran( & - C_packet_h, & - dataQ_h, & - queue2_h, & - queue3_h, & - nTiles_d, & + subroutine gpu_tf_hydro_Fortran( & + C_packet_h, & + dataQ_h, & + queue2_h, & + queue3_h, & + nTiles_d, & external_hydro_op1_dt_d, & - tile_deltas_d, & - tile_hi_d, & - tile_lo_d, & - CC_1_d, & + tile_deltas_d, & + tile_hi_d, & + tile_lo_d, & + CC_1_d, & scratch_hydro_op1_auxC_d, & scratch_hydro_op1_flX_d, & scratch_hydro_op1_flY_d, & @@ -53,11 +53,11 @@ subroutine gpu_tf_hydro_Fortran( & use dr_cg_hydroAdvance_mod, ONLY : Hydro_computeFluxesHll_Z_gpu_oacc use dr_cg_hydroAdvance_mod, ONLY : Hydro_updateSolutionHll_gpu_oacc - !$acc routine (Hydro_computeSoundSpeedHll_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_Z_gpu_oacc) vector - !$acc routine (Hydro_updateSolutionHll_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeSoundSpeedHll_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_X_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_Y_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_Z_gpu_oacc) vector + !$acc routine (wrapper_Hydro_updateSolutionHll_gpu_oacc) vector implicit none @@ -108,14 +108,15 @@ subroutine gpu_tf_hydro_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeSoundSpeedHll_gpu_oacc( & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - CC_1_d(:, :, :, :, n), & - lbdd_CC_1_d(:, n), & - scratch_hydro_op1_auxC_d(:, :, :, n), & - lbdd_scratch_hydro_op1_auxC_d(:, n) & - ) + CALL wrapper_Hydro_computeSoundSpeedHll_gpu_oacc( & + n, & + tile_lo_d, & + tile_hi_d, & + CC_1_d, & + lbdd_CC_1_d, & + scratch_hydro_op1_auxC_d, & + lbdd_scratch_hydro_op1_auxC_d & + ) end do !$acc end parallel loop @@ -124,17 +125,18 @@ subroutine gpu_tf_hydro_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_X_gpu_oacc( & - external_hydro_op1_dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - lbdd_CC_1_d(:, n), & - scratch_hydro_op1_auxC_d(:, :, :, n), & - lbdd_scratch_hydro_op1_auxC_d(:, n), & - scratch_hydro_op1_flX_d(:, :, :, :, n), & - lbdd_scratch_hydro_op1_flX_d(:, n) & + CALL wrapper_Hydro_computeFluxesHll_X_gpu_oacc( & + n, & + external_hydro_op1_dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + lbdd_CC_1_d, & + scratch_hydro_op1_auxC_d, & + lbdd_scratch_hydro_op1_auxC_d, & + scratch_hydro_op1_flX_d, & + lbdd_scratch_hydro_op1_flX_d & ) end do !$acc end parallel loop @@ -142,64 +144,67 @@ subroutine gpu_tf_hydro_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(queue2_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_Y_gpu_oacc( 
& - external_hydro_op1_dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - lbdd_CC_1_d(:, n), & - scratch_hydro_op1_auxC_d(:, :, :, n), & - lbdd_scratch_hydro_op1_auxC_d(:, n), & - scratch_hydro_op1_flY_d(:, :, :, :, n), & - lbdd_scratch_hydro_op1_flY_d(:, n) & - ) + CALL wrapper_Hydro_computeFluxesHll_Y_gpu_oacc( & + n, & + external_hydro_op1_dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + lbdd_CC_1_d, & + scratch_hydro_op1_auxC_d, & + lbdd_scratch_hydro_op1_auxC_d, & + scratch_hydro_op1_flY_d, & + lbdd_scratch_hydro_op1_flY_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(queue3_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_Z_gpu_oacc( & + CALL wrapper_Hydro_computeFluxesHll_Z_gpu_oacc( & + n, & external_hydro_op1_dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - lbdd_CC_1_d(:, n), & - scratch_hydro_op1_auxC_d(:, :, :, n), & - lbdd_scratch_hydro_op1_auxC_d(:, n), & - scratch_hydro_op1_flZ_d(:, :, :, :, n), & - lbdd_scratch_hydro_op1_flZ_d(:, n) & - ) + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + lbdd_CC_1_d, & + scratch_hydro_op1_auxC_d, & + lbdd_scratch_hydro_op1_auxC_d, & + scratch_hydro_op1_flZ_d, & + lbdd_scratch_hydro_op1_flZ_d & + ) end do !$acc end parallel loop - !$acc wait( & - !$acc& queue2_h, & - !$acc& queue3_h & - !$acc& ) + !$acc wait( & + !$acc& queue2_h, & + !$acc& queue3_h & + !$acc& ) !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_updateSolutionHll_gpu_oacc( & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - scratch_hydro_op1_flX_d(:, :, :, :, n), & - scratch_hydro_op1_flY_d(:, :, :, :, n), & - scratch_hydro_op1_flZ_d(:, :, :, :, n), & - lbdd_scratch_hydro_op1_flX_d(:, n), & - CC_1_d(:, :, :, :, n), & - lbdd_CC_1_d(:, n) & - ) + CALL wrapper_Hydro_updateSolutionHll_gpu_oacc( & + n, & + tile_lo_d, & + tile_hi_d, & + scratch_hydro_op1_flX_d, & + scratch_hydro_op1_flY_d, & + scratch_hydro_op1_flZ_d, & + lbdd_scratch_hydro_op1_flX_d, & + CC_1_d, & + lbdd_CC_1_d & + ) end do !$acc end parallel loop !$acc wait( & - !$acc& dataQ_h & - !$acc& ) - + !$acc& dataQ_h & + !$acc& ) + MH_idx = INT(2, kind=MILHOJA_INT) MH_ierr = release_gpu_tf_hydro_extra_queue_c(C_packet_h, MH_idx) if (MH_ierr /= MILHOJA_SUCCESS) then @@ -217,5 +222,339 @@ subroutine gpu_tf_hydro_Fortran( & !$acc end data end subroutine gpu_tf_hydro_Fortran + subroutine wrapper_Hydro_computeSoundSpeedHll_gpu_oacc ( & + nblk, & + tile_lo_d, & + tile_hi_d, & + CC_1_d, & + lbdd_CC_1_d, & + scratch_hydro_op1_auxC_d, & + lbdd_scratch_hydro_op1_auxC_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeSoundSpeedHll_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeSoundSpeedHll_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_CC_1_d(:, :) + real, target, intent(INOUT) :: scratch_hydro_op1_auxC_d(:, :, :, :) + integer, target, intent(IN) :: lbdd_scratch_hydro_op1_auxC_d(:, :) + + ! 
Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + integer, pointer :: lbdd_CC_1_d_p(:) + real, pointer :: scratch_hydro_op1_auxC_d_p(:, :, :) + integer, pointer :: lbdd_scratch_hydro_op1_auxC_d_p(:) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + lbdd_CC_1_d_p => lbdd_CC_1_d(:, nblk) + scratch_hydro_op1_auxC_d_p => scratch_hydro_op1_auxC_d(:, :, :, nblk) + lbdd_scratch_hydro_op1_auxC_d_p => lbdd_scratch_hydro_op1_auxC_d(:, nblk) + + ! Call subroutine + CALL Hydro_computeSoundSpeedHll_gpu_oacc( & + tile_lo_d_p, & + tile_hi_d_p, & + CC_1_d_p, & + lbdd_CC_1_d_p, & + scratch_hydro_op1_auxC_d_p, & + lbdd_scratch_hydro_op1_auxC_d_p & + ) + + end subroutine wrapper_Hydro_computeSoundSpeedHll_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc ( & + nblk, & + external_hydro_op1_dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + lbdd_CC_1_d, & + scratch_hydro_op1_auxC_d, & + lbdd_scratch_hydro_op1_auxC_d, & + scratch_hydro_op1_flX_d, & + lbdd_scratch_hydro_op1_flX_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_X_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: external_hydro_op1_dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_CC_1_d(:, :) + real, target, intent(INOUT) :: scratch_hydro_op1_auxC_d(:, :, :, :) + integer, target, intent(IN) :: lbdd_scratch_hydro_op1_auxC_d(:, :) + real, target, intent(INOUT) :: scratch_hydro_op1_flX_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_scratch_hydro_op1_flX_d(:, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + integer, pointer :: lbdd_CC_1_d_p(:) + real, pointer :: scratch_hydro_op1_auxC_d_p(:, :, :) + integer, pointer :: lbdd_scratch_hydro_op1_auxC_d_p(:) + real, pointer :: scratch_hydro_op1_flX_d_p(:, :, :, :) + integer, pointer :: lbdd_scratch_hydro_op1_flX_d_p(:) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + lbdd_CC_1_d_p => lbdd_CC_1_d(:, nblk) + scratch_hydro_op1_auxC_d_p => scratch_hydro_op1_auxC_d(:, :, :, nblk) + lbdd_scratch_hydro_op1_auxC_d_p => lbdd_scratch_hydro_op1_auxC_d(:, nblk) + scratch_hydro_op1_flX_d_p => scratch_hydro_op1_flX_d(:, :, :, :, nblk) + lbdd_scratch_hydro_op1_flX_d_p => lbdd_scratch_hydro_op1_flX_d(:, nblk) + + ! 
Call subroutine + CALL Hydro_computeFluxesHll_X_gpu_oacc( & + external_hydro_op1_dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + lbdd_CC_1_d_p, & + scratch_hydro_op1_auxC_d_p, & + lbdd_scratch_hydro_op1_auxC_d_p, & + scratch_hydro_op1_flX_d_p, & + lbdd_scratch_hydro_op1_flX_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc ( & + nblk, & + external_hydro_op1_dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + lbdd_CC_1_d, & + scratch_hydro_op1_auxC_d, & + lbdd_scratch_hydro_op1_auxC_d, & + scratch_hydro_op1_flY_d, & + lbdd_scratch_hydro_op1_flY_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_Y_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: external_hydro_op1_dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_CC_1_d(:, :) + real, target, intent(INOUT) :: scratch_hydro_op1_auxC_d(:, :, :, :) + integer, target, intent(IN) :: lbdd_scratch_hydro_op1_auxC_d(:, :) + real, target, intent(INOUT) :: scratch_hydro_op1_flY_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_scratch_hydro_op1_flY_d(:, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + integer, pointer :: lbdd_CC_1_d_p(:) + real, pointer :: scratch_hydro_op1_auxC_d_p(:, :, :) + integer, pointer :: lbdd_scratch_hydro_op1_auxC_d_p(:) + real, pointer :: scratch_hydro_op1_flY_d_p(:, :, :, :) + integer, pointer :: lbdd_scratch_hydro_op1_flY_d_p(:) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + lbdd_CC_1_d_p => lbdd_CC_1_d(:, nblk) + scratch_hydro_op1_auxC_d_p => scratch_hydro_op1_auxC_d(:, :, :, nblk) + lbdd_scratch_hydro_op1_auxC_d_p => lbdd_scratch_hydro_op1_auxC_d(:, nblk) + scratch_hydro_op1_flY_d_p => scratch_hydro_op1_flY_d(:, :, :, :, nblk) + lbdd_scratch_hydro_op1_flY_d_p => lbdd_scratch_hydro_op1_flY_d(:, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_Y_gpu_oacc( & + external_hydro_op1_dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + lbdd_CC_1_d_p, & + scratch_hydro_op1_auxC_d_p, & + lbdd_scratch_hydro_op1_auxC_d_p, & + scratch_hydro_op1_flY_d_p, & + lbdd_scratch_hydro_op1_flY_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_Z_gpu_oacc ( & + nblk, & + external_hydro_op1_dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + lbdd_CC_1_d, & + scratch_hydro_op1_auxC_d, & + lbdd_scratch_hydro_op1_auxC_d, & + scratch_hydro_op1_flZ_d, & + lbdd_scratch_hydro_op1_flZ_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_Z_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_Z_gpu_oacc) vector + + implicit none + + ! 
Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: external_hydro_op1_dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_CC_1_d(:, :) + real, target, intent(INOUT) :: scratch_hydro_op1_auxC_d(:, :, :, :) + integer, target, intent(IN) :: lbdd_scratch_hydro_op1_auxC_d(:, :) + real, target, intent(INOUT) :: scratch_hydro_op1_flZ_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_scratch_hydro_op1_flZ_d(:, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + integer, pointer :: lbdd_CC_1_d_p(:) + real, pointer :: scratch_hydro_op1_auxC_d_p(:, :, :) + integer, pointer :: lbdd_scratch_hydro_op1_auxC_d_p(:) + real, pointer :: scratch_hydro_op1_flZ_d_p(:, :, :, :) + integer, pointer :: lbdd_scratch_hydro_op1_flZ_d_p(:) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + lbdd_CC_1_d_p => lbdd_CC_1_d(:, nblk) + scratch_hydro_op1_auxC_d_p => scratch_hydro_op1_auxC_d(:, :, :, nblk) + lbdd_scratch_hydro_op1_auxC_d_p => lbdd_scratch_hydro_op1_auxC_d(:, nblk) + scratch_hydro_op1_flZ_d_p => scratch_hydro_op1_flZ_d(:, :, :, :, nblk) + lbdd_scratch_hydro_op1_flZ_d_p => lbdd_scratch_hydro_op1_flZ_d(:, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_Z_gpu_oacc( & + external_hydro_op1_dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + lbdd_CC_1_d_p, & + scratch_hydro_op1_auxC_d_p, & + lbdd_scratch_hydro_op1_auxC_d_p, & + scratch_hydro_op1_flZ_d_p, & + lbdd_scratch_hydro_op1_flZ_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_Z_gpu_oacc + + subroutine wrapper_Hydro_updateSolutionHll_gpu_oacc ( & + nblk, & + tile_lo_d, & + tile_hi_d, & + scratch_hydro_op1_flX_d, & + scratch_hydro_op1_flY_d, & + scratch_hydro_op1_flZ_d, & + lbdd_scratch_hydro_op1_flX_d, & + CC_1_d, & + lbdd_CC_1_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_updateSolutionHll_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_updateSolutionHll_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(INOUT) :: scratch_hydro_op1_flX_d(:, :, :, :, :) + real, target, intent(INOUT) :: scratch_hydro_op1_flY_d(:, :, :, :, :) + real, target, intent(INOUT) :: scratch_hydro_op1_flZ_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_scratch_hydro_op1_flX_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + integer, target, intent(IN) :: lbdd_CC_1_d(:, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: scratch_hydro_op1_flX_d_p(:, :, :, :) + real, pointer :: scratch_hydro_op1_flY_d_p(:, :, :, :) + real, pointer :: scratch_hydro_op1_flZ_d_p(:, :, :, :) + integer, pointer :: lbdd_scratch_hydro_op1_flX_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + integer, pointer :: lbdd_CC_1_d_p(:) + + ! 
Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + scratch_hydro_op1_flX_d_p => scratch_hydro_op1_flX_d(:, :, :, :, nblk) + scratch_hydro_op1_flY_d_p => scratch_hydro_op1_flY_d(:, :, :, :, nblk) + scratch_hydro_op1_flZ_d_p => scratch_hydro_op1_flZ_d(:, :, :, :, nblk) + lbdd_scratch_hydro_op1_flX_d_p => lbdd_scratch_hydro_op1_flX_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + lbdd_CC_1_d_p => lbdd_CC_1_d(:, nblk) + + ! Call subroutine + CALL Hydro_updateSolutionHll_gpu_oacc( & + tile_lo_d_p, & + tile_hi_d_p, & + scratch_hydro_op1_flX_d_p, & + scratch_hydro_op1_flY_d_p, & + scratch_hydro_op1_flZ_d_p, & + lbdd_scratch_hydro_op1_flX_d_p, & + CC_1_d_p, & + lbdd_CC_1_d_p & + ) + + end subroutine wrapper_Hydro_updateSolutionHll_gpu_oacc + end module gpu_tf_hydro_mod diff --git a/tools/milhoja_pypkg/src/milhoja/tests/data/Sedov/REF_gpu_tf_test2_3D.F90 b/tools/milhoja_pypkg/src/milhoja/tests/data/Sedov/REF_gpu_tf_test2_3D.F90 index c0b555c0..a3124b05 100644 --- a/tools/milhoja_pypkg/src/milhoja/tests/data/Sedov/REF_gpu_tf_test2_3D.F90 +++ b/tools/milhoja_pypkg/src/milhoja/tests/data/Sedov/REF_gpu_tf_test2_3D.F90 @@ -47,10 +47,10 @@ subroutine gpu_tf_test2_Fortran( & use dr_cg_hydroAdvance_mod, ONLY : Hydro_computeFluxesHll_Y_gpu_oacc use dr_cg_hydroAdvance_mod, ONLY : Hydro_computeFluxesHll_Z_gpu_oacc - !$acc routine (Hydro_computeSoundSpeedHll_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_Z_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeSoundSpeedHll_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_X_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_Y_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_Z_gpu_oacc) vector implicit none @@ -91,12 +91,13 @@ subroutine gpu_tf_test2_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeSoundSpeedHll_gpu_oacc( & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n) & - ) + CALL wrapper_Hydro_computeSoundSpeedHll_gpu_oacc( & + n, & + tile_lo_d, & + tile_hi_d, & + CC_1_d, & + hydro_op1_auxc_d & + ) end do !$acc end parallel loop @@ -105,45 +106,48 @@ subroutine gpu_tf_test2_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_X_gpu_oacc( & + CALL wrapper_Hydro_computeFluxesHll_X_gpu_oacc( & + n, & dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - hydro_op1_flX_d(:, :, :, :, n) & - ) + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flX_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(queue2_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_Y_gpu_oacc( & + CALL wrapper_Hydro_computeFluxesHll_Y_gpu_oacc( & + n, & dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - hydro_op1_flY_d(:, :, :, :, n) & - ) + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flY_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(queue3_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_Z_gpu_oacc( & + CALL 
wrapper_Hydro_computeFluxesHll_Z_gpu_oacc( & + n, & dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - hydro_op1_flZ_d(:, :, :, :, n) & - ) + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flZ_d & + ) end do !$acc end parallel loop @@ -170,5 +174,220 @@ subroutine gpu_tf_test2_Fortran( & !$acc end data end subroutine gpu_tf_test2_Fortran + subroutine wrapper_Hydro_computeSoundSpeedHll_gpu_oacc ( & + nblk, & + tile_lo_d, & + tile_hi_d, & + CC_1_d, & + hydro_op1_auxc_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeSoundSpeedHll_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeSoundSpeedHll_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeSoundSpeedHll_gpu_oacc( & + tile_lo_d_p, & + tile_hi_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p & + ) + + end subroutine wrapper_Hydro_computeSoundSpeedHll_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flX_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_X_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flX_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: hydro_op1_flX_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + hydro_op1_flX_d_p => hydro_op1_flX_d(:, :, :, :, nblk) + + ! 
Call subroutine + CALL Hydro_computeFluxesHll_X_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p, & + hydro_op1_flX_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flY_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_Y_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flY_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: hydro_op1_flY_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + hydro_op1_flY_d_p => hydro_op1_flY_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_Y_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p, & + hydro_op1_flY_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_Z_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flZ_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_Z_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_Z_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flZ_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: hydro_op1_flZ_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + hydro_op1_flZ_d_p => hydro_op1_flZ_d(:, :, :, :, nblk) + + ! 
Call subroutine + CALL Hydro_computeFluxesHll_Z_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p, & + hydro_op1_flZ_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_Z_gpu_oacc + end module gpu_tf_test2_mod diff --git a/tools/milhoja_pypkg/src/milhoja/tests/data/Sedov/REF_gpu_tf_test_3D.F90 b/tools/milhoja_pypkg/src/milhoja/tests/data/Sedov/REF_gpu_tf_test_3D.F90 index 0c0e5ef6..0ffd41b4 100644 --- a/tools/milhoja_pypkg/src/milhoja/tests/data/Sedov/REF_gpu_tf_test_3D.F90 +++ b/tools/milhoja_pypkg/src/milhoja/tests/data/Sedov/REF_gpu_tf_test_3D.F90 @@ -47,10 +47,10 @@ subroutine gpu_tf_test_Fortran( & use dr_cg_hydroAdvance_mod, ONLY : Hydro_computeFluxesHll_Z_gpu_oacc use dr_cg_hydroAdvance_mod, ONLY : Hydro_updateSolutionHll_gpu_oacc - !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector - !$acc routine (Hydro_computeFluxesHll_Z_gpu_oacc) vector - !$acc routine (Hydro_updateSolutionHll_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_X_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_Y_gpu_oacc) vector + !$acc routine (wrapper_Hydro_computeFluxesHll_Z_gpu_oacc) vector + !$acc routine (wrapper_Hydro_updateSolutionHll_gpu_oacc) vector implicit none @@ -93,45 +93,48 @@ subroutine gpu_tf_test_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_X_gpu_oacc( & + CALL wrapper_Hydro_computeFluxesHll_X_gpu_oacc( & + n, & dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - hydro_op1_flX_d(:, :, :, :, n) & - ) + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flX_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(queue2_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_Y_gpu_oacc( & + CALL wrapper_Hydro_computeFluxesHll_Y_gpu_oacc( & + n, & dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - hydro_op1_flY_d(:, :, :, :, n) & - ) + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flY_d & + ) end do !$acc end parallel loop !$acc parallel loop gang default(none) & !$acc& async(queue3_h) do n = 1, nTiles_d - CALL Hydro_computeFluxesHll_Z_gpu_oacc( & + CALL wrapper_Hydro_computeFluxesHll_Z_gpu_oacc( & + n, & dt_d, & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - tile_deltas_d(:, n), & - CC_1_d(:, :, :, :, n), & - hydro_op1_auxc_d(:, :, :, n), & - hydro_op1_flZ_d(:, :, :, :, n) & - ) + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flZ_d & + ) end do !$acc end parallel loop @@ -143,14 +146,15 @@ subroutine gpu_tf_test_Fortran( & !$acc parallel loop gang default(none) & !$acc& async(dataQ_h) do n = 1, nTiles_d - CALL Hydro_updateSolutionHll_gpu_oacc( & - tile_lo_d(:, n), & - tile_hi_d(:, n), & - hydro_op1_flX_d(:, :, :, :, n), & - hydro_op1_flY_d(:, :, :, :, n), & - hydro_op1_flZ_d(:, :, :, :, n), & - CC_1_d(:, :, :, :, n) & - ) + CALL wrapper_Hydro_updateSolutionHll_gpu_oacc( & + n, & + tile_lo_d, & + tile_hi_d, & + hydro_op1_flX_d, & + hydro_op1_flY_d, & + hydro_op1_flZ_d, & + CC_1_d & + ) end do !$acc end parallel loop @@ -175,5 +179,230 @@ subroutine gpu_tf_test_Fortran( & !$acc end data end subroutine gpu_tf_test_Fortran + 
subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flX_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_X_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_X_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flX_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: hydro_op1_flX_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + hydro_op1_flX_d_p => hydro_op1_flX_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_X_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p, & + hydro_op1_flX_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_X_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flY_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_Y_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_Y_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flY_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: hydro_op1_flY_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + hydro_op1_flY_d_p => hydro_op1_flY_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_Y_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p, & + hydro_op1_flY_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_Y_gpu_oacc + + subroutine wrapper_Hydro_computeFluxesHll_Z_gpu_oacc ( & + nblk, & + dt_d, & + tile_lo_d, & + tile_hi_d, & + tile_deltas_d, & + CC_1_d, & + hydro_op1_auxc_d, & + hydro_op1_flZ_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_computeFluxesHll_Z_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_computeFluxesHll_Z_gpu_oacc) vector + + implicit none + + ! 
Arguments + integer, intent(IN) :: nblk + real, target, intent(IN) :: dt_d + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(IN) :: tile_deltas_d(:, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_auxc_d(:, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flZ_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: tile_deltas_d_p(:) + real, pointer :: CC_1_d_p(:, :, :, :) + real, pointer :: hydro_op1_auxc_d_p(:, :, :) + real, pointer :: hydro_op1_flZ_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + tile_deltas_d_p => tile_deltas_d(:, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + hydro_op1_auxc_d_p => hydro_op1_auxc_d(:, :, :, nblk) + hydro_op1_flZ_d_p => hydro_op1_flZ_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_computeFluxesHll_Z_gpu_oacc( & + dt_d, & + tile_lo_d_p, & + tile_hi_d_p, & + tile_deltas_d_p, & + CC_1_d_p, & + hydro_op1_auxc_d_p, & + hydro_op1_flZ_d_p & + ) + + end subroutine wrapper_Hydro_computeFluxesHll_Z_gpu_oacc + + subroutine wrapper_Hydro_updateSolutionHll_gpu_oacc ( & + nblk, & + tile_lo_d, & + tile_hi_d, & + hydro_op1_flX_d, & + hydro_op1_flY_d, & + hydro_op1_flZ_d, & + CC_1_d & + ) + + use dr_cg_hydroAdvance_mod, ONLY: Hydro_updateSolutionHll_gpu_oacc + + !$acc routine vector + !$acc routine (Hydro_updateSolutionHll_gpu_oacc) vector + + implicit none + + ! Arguments + integer, intent(IN) :: nblk + integer, target, intent(IN) :: tile_lo_d(:, :) + integer, target, intent(IN) :: tile_hi_d(:, :) + real, target, intent(INOUT) :: hydro_op1_flX_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flY_d(:, :, :, :, :) + real, target, intent(INOUT) :: hydro_op1_flZ_d(:, :, :, :, :) + real, target, intent(INOUT) :: CC_1_d(:, :, :, :, :) + + ! Local variables + integer, pointer :: tile_lo_d_p(:) + integer, pointer :: tile_hi_d_p(:) + real, pointer :: hydro_op1_flX_d_p(:, :, :, :) + real, pointer :: hydro_op1_flY_d_p(:, :, :, :) + real, pointer :: hydro_op1_flZ_d_p(:, :, :, :) + real, pointer :: CC_1_d_p(:, :, :, :) + + ! Attach pointers + tile_lo_d_p => tile_lo_d(:, nblk) + tile_hi_d_p => tile_hi_d(:, nblk) + hydro_op1_flX_d_p => hydro_op1_flX_d(:, :, :, :, nblk) + hydro_op1_flY_d_p => hydro_op1_flY_d(:, :, :, :, nblk) + hydro_op1_flZ_d_p => hydro_op1_flZ_d(:, :, :, :, nblk) + CC_1_d_p => CC_1_d(:, :, :, :, nblk) + + ! Call subroutine + CALL Hydro_updateSolutionHll_gpu_oacc( & + tile_lo_d_p, & + tile_hi_d_p, & + hydro_op1_flX_d_p, & + hydro_op1_flY_d_p, & + hydro_op1_flZ_d_p, & + CC_1_d_p & + ) + + end subroutine wrapper_Hydro_updateSolutionHll_gpu_oacc + end module gpu_tf_test_mod