From c820bb617ed592a208c66cb7de69a294a7dfea6c Mon Sep 17 00:00:00 2001 From: Triston Cao Date: Tue, 3 Aug 2021 14:58:43 -0700 Subject: [PATCH 1/4] Load libcuda with dlopen instead of dynamic linking --- CMakeLists.txt | 4 ++-- src/common/cuda/rtc.cc | 46 +++++++++++++++++++++++++++++++++--------- src/initialize.cc | 2 +- src/initialize.h | 4 ++-- 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b901f41c29f2..c5104f139b88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -586,11 +586,11 @@ if(USE_CUDA) string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}") - find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand nvrtc cuda_driver + find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand nvrtc OPTIONAL_COMPONENTS nvToolsExt) list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand - CUDA::nvrtc CUDA::cuda_driver) + CUDA::nvrtc) list(APPEND SOURCE ${CUDA}) add_definitions(-DMXNET_USE_CUDA=1) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 5b27e0bbd225..d10acd2b3fbf 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -32,6 +32,7 @@ #include #include "rtc.h" +#include "../../initialize.h" #include "rtc/half-inl.h" #include "rtc/util-inl.h" #include "rtc/forward_functions-inl.h" @@ -41,6 +42,18 @@ #include "rtc/reducer-inl.h" #include "utils.h" +typedef CUresult (*cuDeviceGetPtr) (CUdevice* device, int ordinal); +typedef CUresult (*cuDevicePrimaryCtxRetainPtr) (CUcontext* pctx, CUdevice dev); +typedef CUresult (*cuModuleLoadDataExPtr) (CUmodule* module, const void* image, + unsigned int numOptions, CUjit_option* options, void** optionValues); +typedef CUresult (*cuModuleGetFunctionPtr) (CUfunction* hfunc, CUmodule hmod, + const char* name); +typedef CUresult (*cuLaunchKernelPtr) (CUfunction f, unsigned int gridDimX, + unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, 
unsigned int blockDimZ, + unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, + void** extra); +typedef CUresult (*cuGetErrorStringPtr) (CUresult error, const char** pStr); namespace mxnet { namespace common { @@ -149,6 +162,8 @@ CUfunction get_function(const std::string &parameters, std::string ptx; std::vector functions; }; + void* cuda_lib_handle = LibraryInitializer::Get()->lib_load("libcuda.so"); + // Maps from the kernel name and parameters to the ptx and jit-compiled CUfunctions. using KernelCache = std::unordered_map; // Per-gpu-architecture compiled kernel cache with jit-compiled function for each device context @@ -233,8 +248,12 @@ CUfunction get_function(const std::string &parameters, // Make sure driver context is set to the proper device CUdevice cu_device; CUcontext context; - CUDA_DRIVER_CALL(cuDeviceGet(&cu_device, dev_id)); - CUDA_DRIVER_CALL(cuDevicePrimaryCtxRetain(&context, cu_device)); + cuDeviceGetPtr device_get_ptr = get_func(cuda_lib_handle, "cuDeviceGet"); + CUDA_DRIVER_CALL((*device_get_ptr)(&cu_device, dev_id)); + cuDevicePrimaryCtxRetainPtr device_primary_ctx_retain = + get_func(cuda_lib_handle, "cuDevicePrimaryCtxRetain"); + CUDA_DRIVER_CALL((*device_primary_ctx_retain)(&context, cu_device)); + // Jit-compile ptx for the driver's current context CUmodule module; @@ -250,10 +269,15 @@ CUfunction get_function(const std::string &parameters, void* jit_opt_values[] = {reinterpret_cast(debug_info), reinterpret_cast(line_info)}; - CUDA_DRIVER_CALL(cuModuleLoadDataEx(&module, kinfo.ptx.c_str(), 2, jit_opts, jit_opt_values)); - CUDA_DRIVER_CALL(cuModuleGetFunction(&kinfo.functions[dev_id], - module, - kinfo.mangled_name.c_str())); + cuModuleLoadDataExPtr module_load_data_ex_ptr = + get_func(cuda_lib_handle, "cuModuleLoadDataEx"); + CUDA_DRIVER_CALL((*module_load_data_ex_ptr)(&module, kinfo.ptx.c_str(), 2, + jit_opts, jit_opt_values)); + cuModuleGetFunctionPtr module_get_function_ptr = + get_func(cuda_lib_handle, "cuModuleGetFunction"); + 
CUDA_DRIVER_CALL((*module_get_function_ptr)(&kinfo.functions[dev_id], + module, + kinfo.mangled_name.c_str())); } return kinfo.functions[dev_id]; } @@ -266,8 +290,10 @@ void launch(CUfunction function, std::vector *args) { CHECK(args->size() != 0) << "Empty argument list passed to a kernel."; - // CUDA_DRIVER_CALL( - CUresult err = cuLaunchKernel(function, // function to launch + void* cuda_lib_handle = LibraryInitializer::Get()->lib_load("libcuda.so"); + cuLaunchKernelPtr launch_kernel_ptr = + get_func(cuda_lib_handle, "cuLaunchKernel"); + CUresult err = (*launch_kernel_ptr)(function, // function to launch grid_dim.x, grid_dim.y, grid_dim.z, // grid dim block_dim.x, block_dim.y, block_dim.z, // block dim shared_mem_bytes, // shared memory @@ -276,7 +302,9 @@ void launch(CUfunction function, nullptr); // ); if (err != CUDA_SUCCESS) { const char* error_string; - cuGetErrorString(err, &error_string); + cuGetErrorStringPtr get_error_string_ptr = + get_func(cuda_lib_handle, "cuGetErrorString"); + (*get_error_string_ptr)(err, &error_string); LOG(FATAL) << "cuLaunchKernel failed: " << err << " " << error_string << ": " << reinterpret_cast(function) << " " diff --git a/src/initialize.cc b/src/initialize.cc index 9ef51219609f..6be13e61ae9e 100644 --- a/src/initialize.cc +++ b/src/initialize.cc @@ -173,7 +173,7 @@ void LibraryInitializer::lib_close(void* handle) { * \param func function pointer that gets output address * \param name function name to be fetched */ -void LibraryInitializer::get_sym(void* handle, void** func, char* name) { +void LibraryInitializer::get_sym(void* handle, void** func, const char* name) { #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) *func = GetProcAddress((HMODULE)handle, name); if (!(*func)) { diff --git a/src/initialize.h b/src/initialize.h index 8a6dc3aa5f7f..d792613aefb2 100644 --- a/src/initialize.h +++ b/src/initialize.h @@ -69,7 +69,7 @@ class LibraryInitializer { bool lib_is_loaded(const std::string& path) const; void* 
lib_load(const char* path); void lib_close(void* handle); - static void get_sym(void* handle, void** func, char* name); + static void get_sym(void* handle, void** func, const char* name); /** * Original pid of the process which first loaded and initialized the library @@ -114,7 +114,7 @@ class LibraryInitializer { * \return func a function pointer */ template -T get_func(void *lib, char *func_name) { +T get_func(void *lib, const char *func_name) { T func; LibraryInitializer::Get()->get_sym(lib, reinterpret_cast(&func), func_name); if (!func) From 6d8d0445f76412501132a235808a53d15540d8c6 Mon Sep 17 00:00:00 2001 From: Triston Cao Date: Tue, 3 Aug 2021 15:19:12 -0700 Subject: [PATCH 2/4] Fix a lint error --- src/common/cuda/rtc.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index d10acd2b3fbf..0d0c7a49f653 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -293,7 +293,7 @@ void launch(CUfunction function, void* cuda_lib_handle = LibraryInitializer::Get()->lib_load("libcuda.so"); cuLaunchKernelPtr launch_kernel_ptr = get_func(cuda_lib_handle, "cuLaunchKernel"); - CUresult err = (*launch_kernel_ptr)(function, // function to launch + CUresult err = (*launch_kernel_ptr)(function, // function to launch grid_dim.x, grid_dim.y, grid_dim.z, // grid dim block_dim.x, block_dim.y, block_dim.z, // block dim shared_mem_bytes, // shared memory From f5e129059424015eb9d442a91824708e53f9836d Mon Sep 17 00:00:00 2001 From: Triston Cao Date: Tue, 3 Aug 2021 16:28:55 -0700 Subject: [PATCH 3/4] Make naming style consistent --- src/common/cuda/rtc.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 0d0c7a49f653..02990c22c66f 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -250,9 +250,9 @@ CUfunction get_function(const std::string &parameters, CUcontext context; cuDeviceGetPtr device_get_ptr = 
get_func(cuda_lib_handle, "cuDeviceGet"); CUDA_DRIVER_CALL((*device_get_ptr)(&cu_device, dev_id)); - cuDevicePrimaryCtxRetainPtr device_primary_ctx_retain = + cuDevicePrimaryCtxRetainPtr device_primary_ctx_retain_ptr = get_func(cuda_lib_handle, "cuDevicePrimaryCtxRetain"); - CUDA_DRIVER_CALL((*device_primary_ctx_retain)(&context, cu_device)); + CUDA_DRIVER_CALL((*device_primary_ctx_retain_ptr)(&context, cu_device)); // Jit-compile ptx for the driver's current context CUmodule module; From 9510caf956b5b56b3f29e34adb3ec5d143504d4f Mon Sep 17 00:00:00 2001 From: Triston Cao Date: Tue, 3 Aug 2021 23:13:57 -0700 Subject: [PATCH 4/4] Use the correct CUDA library names on Windows and Unix-like systems --- src/common/cuda/rtc.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/common/cuda/rtc.cc b/src/common/cuda/rtc.cc index 02990c22c66f..dda3b7421bed 100644 --- a/src/common/cuda/rtc.cc +++ b/src/common/cuda/rtc.cc @@ -60,6 +60,12 @@ namespace common { namespace cuda { namespace rtc { +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + const char cuda_lib_name[] = "nvcuda.dll"; +#else + const char cuda_lib_name[] = "libcuda.so"; +#endif + std::mutex lock; namespace util { @@ -162,7 +168,7 @@ CUfunction get_function(const std::string &parameters, std::string ptx; std::vector functions; }; - void* cuda_lib_handle = LibraryInitializer::Get()->lib_load("libcuda.so"); + void* cuda_lib_handle = LibraryInitializer::Get()->lib_load(cuda_lib_name); // Maps from the kernel name and parameters to the ptx and jit-compiled CUfunctions. 
using KernelCache = std::unordered_map; @@ -290,7 +296,7 @@ void launch(CUfunction function, std::vector *args) { CHECK(args->size() != 0) << "Empty argument list passed to a kernel."; - void* cuda_lib_handle = LibraryInitializer::Get()->lib_load("libcuda.so"); + void* cuda_lib_handle = LibraryInitializer::Get()->lib_load(cuda_lib_name); cuLaunchKernelPtr launch_kernel_ptr = get_func(cuda_lib_handle, "cuLaunchKernel"); CUresult err = (*launch_kernel_ptr)(function, // function to launch