Reland "[flang][cuda] Use cuda runtime API #103488"

CUDA Fortran is meant to be an equivalent to the runtime API. Therefore, it makes more sense to use the cuda rt API in the allocators for CUF.
llvm · Aug 14, 2024 · 743e99d · 743e99d
1 parent a88f3a3
commit 743e99d
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 52 deletions.
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -13,11 +13,10 @@
 #include "flang/Runtime/entry-names.h"
 
 #define CUDA_REPORT_IF_ERROR(expr) \
-  [](CUresult result) { \
-    if (!result) \
+  [](cudaError_t err) { \
+    if (err == cudaSuccess) \
       return; \
-    const char *name = nullptr; \
-    cuGetErrorName(result, &name); \
+    const char *name = cudaGetErrorName(err); \
     if (!name) \
       name = "<unknown>"; \
     Terminator terminator{__FILE__, __LINE__}; \

diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt
@@ -7,14 +7,20 @@
 #===------------------------------------------------------------------------===#
 
 include_directories(${CUDAToolkit_INCLUDE_DIRS})
-find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED)
 
 add_flang_library(CufRuntime
   allocator.cpp
   descriptor.cpp
 )
+
+if (BUILD_SHARED_LIBS)
+  set(CUDA_RT_TARGET CUDA::cudart)
+else()
+  set(CUDA_RT_TARGET CUDA::cudart_static)
+endif()
+
 target_link_libraries(CufRuntime
   PRIVATE
   FortranRuntime
-  ${CUDA_RUNTIME_LIBRARY}
+  ${CUDA_RT_TARGET}
 )
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
@@ -15,7 +15,7 @@
 #include "flang/ISO_Fortran_binding_wrapper.h"
 #include "flang/Runtime/allocator-registry.h"
 
-#include "cuda.h"
+#include "cuda_runtime.h"
 
 namespace Fortran::runtime::cuda {
 extern "C" {
@@ -34,32 +34,28 @@ void RTDEF(CUFRegisterAllocator)() {
 
 void *CUFAllocPinned(std::size_t sizeInBytes) {
   void *p;
-  CUDA_REPORT_IF_ERROR(cuMemAllocHost(&p, sizeInBytes));
+  CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
 }
 
-void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cuMemFreeHost(p)); }
+void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
 
 void *CUFAllocDevice(std::size_t sizeInBytes) {
-  CUdeviceptr p = 0;
-  CUDA_REPORT_IF_ERROR(cuMemAlloc(&p, sizeInBytes));
-  return reinterpret_cast<void *>(p);
+  void *p;
+  CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
+  return p;
 }
 
-void CUFFreeDevice(void *p) {
-  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
-}
+void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
 void *CUFAllocManaged(std::size_t sizeInBytes) {
-  CUdeviceptr p = 0;
+  void *p;
   CUDA_REPORT_IF_ERROR(
-      cuMemAllocManaged(&p, sizeInBytes, CU_MEM_ATTACH_GLOBAL));
+      cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
   return reinterpret_cast<void *>(p);
 }
 
-void CUFFreeManaged(void *p) {
-  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(p)));
-}
+void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
 
 void *CUFAllocUnified(std::size_t sizeInBytes) {
   // Call alloc managed for the time being.

diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -14,7 +14,7 @@
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/allocator-registry.h"
 
-#include "cuda.h"
+#include "cuda_runtime.h"
 
 using namespace Fortran::runtime;
 using namespace Fortran::runtime::cuda;
@@ -25,38 +25,9 @@ static OwningPtr<Descriptor> createAllocatable(
       CFI_attribute_allocatable);
 }
 
-thread_local static int32_t defaultDevice = 0;
-
-CUdevice getDefaultCuDevice() {
-  CUdevice device;
-  CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
-  return device;
-}
-
-class ScopedContext {
-public:
-  ScopedContext() {
-    // Static reference to CUDA primary context for device ordinal
-    // defaultDevice.
-    static CUcontext context = [] {
-      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
-      CUcontext ctx;
-      // Note: this does not affect the current context.
-      CUDA_REPORT_IF_ERROR(
-          cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice()));
-      return ctx;
-    }();
-
-    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
-  }
-
-  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
-};
-
 TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
   using Fortran::common::TypeCategory;
   RTNAME(CUFRegisterAllocator)();
-  ScopedContext ctx;
   // REAL(4), DEVICE, ALLOCATABLE :: a(:)
   auto a{createAllocatable(TypeCategory::Real, 4)};
   a->SetAllocIdx(kDeviceAllocatorPos);
@@ -74,7 +45,6 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) {
 TEST(AllocatableCUFTest, SimplePinnedAllocate) {
   using Fortran::common::TypeCategory;
   RTNAME(CUFRegisterAllocator)();
-  ScopedContext ctx;
   // INTEGER(4), PINNED, ALLOCATABLE :: a(:)
   auto a{createAllocatable(TypeCategory::Integer, 4)};
   EXPECT_FALSE(a->HasAddendum());
@@ -93,7 +63,6 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) {
 TEST(AllocatableCUFTest, DescriptorAllocationTest) {
   using Fortran::common::TypeCategory;
   RTNAME(CUFRegisterAllocator)();
-  ScopedContext ctx;
   // REAL(4), DEVICE, ALLOCATABLE :: a(:)
   auto a{createAllocatable(TypeCategory::Real, 4)};
   Descriptor *desc = nullptr;