From 088e6cf7bb2739a3e22c8756b0d4bdf8610a8af7 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Sat, 23 Sep 2023 10:22:55 -0700 Subject: [PATCH] Lazily load and invoke driver API (#934) Suggest starting review from `csrc/driver_api.h` --- CMakeLists.txt | 1 + csrc/cuda_utils.h | 1 + csrc/driver_api.cpp | 123 ++++++++++++++++++++++++++++++++++++++++++++ csrc/driver_api.h | 29 +++++++++++ csrc/tma.cpp | 3 +- 5 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 csrc/driver_api.cpp create mode 100644 csrc/driver_api.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6becef52068..d7013b042f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,6 +80,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/contiguity.cpp ${NVFUSER_SRCS_DIR}/debug.cpp ${NVFUSER_SRCS_DIR}/dispatch.cpp + ${NVFUSER_SRCS_DIR}/driver_api.cpp ${NVFUSER_SRCS_DIR}/dynamic_transform.cpp ${NVFUSER_SRCS_DIR}/expr_evaluator.cpp ${NVFUSER_SRCS_DIR}/expr_simplifier.cpp diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h index 8e1e7095f57..27f08fb3538 100644 --- a/csrc/cuda_utils.h +++ b/csrc/cuda_utils.h @@ -9,6 +9,7 @@ #include #include +#include #include #define NVFUSER_NVRTC_SAFE_CALL(x) \ diff --git a/csrc/driver_api.cpp b/csrc/driver_api.cpp new file mode 100644 index 00000000000..a70f2a7ca28 --- /dev/null +++ b/csrc/driver_api.cpp @@ -0,0 +1,123 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include + +#include +#include + +#include + +#include + +namespace { + +class CUDADriverAPIDynamicLoader { /* Owns the dlopen() handle for `filename`; opened lazily in sym(), closed in the dtor. */ + void* handle_ = nullptr; + + public: + constexpr static const char* filename = "libcuda.so"; + + ~CUDADriverAPIDynamicLoader() { + if (handle_) { + dlclose(handle_); + } + } + + void* sym(const char* symbolName) { /* Opens the library on first use, then looks symbolName up with dlsym(). */ + if (!handle_) { + handle_ = dlopen(filename, RTLD_LAZY); + } + NVF_CHECK( + handle_, "Dynamic library not loaded. 
Please check CUDA installation"); + void* symbol = dlsym(handle_, symbolName); + NVF_CHECK( + symbol, + "Failed to load symbol: ", + symbolName, + " ", + dlerror(), /* NOTE(review): no separator before the next fragment; confirm how NVF_CHECK joins its arguments */ + "Please check CUDA installation"); + return symbol; + } +} loader; + +} // namespace + +// How does the magic work? +// +// Let's take driver API cuGetErrorName as an example. Because all nvFuser +// code is in the nvfuser namespace, when we define nvfuser::cuGetErrorName, +// this name will shadow the driver API cuGetErrorName. So when we write code +// cuGetErrorName(...), we will be using nvfuser::cuGetErrorName, instead of +// the driver API cuGetErrorName, due to C++'s name lookup rule. So the goal is +// to make nvfuser::cuGetErrorName behave just like the driver API +// cuGetErrorName, except that it is lazily loaded. +// +// We define nvfuser::cuGetErrorName as a pointer which is initialized to a +// function lazilyLoadAndInvoke. When nvfuser::cuGetErrorName is invoked for the +// first time, it will invoke its lazilyLoadAndInvoke function. This function +// lazily loads the driver API cuGetErrorName, replaces nvfuser::cuGetErrorName +// with the loaded driver API function pointer, and calls the newly loaded +// cuGetErrorName driver API. The next time nvfuser::cuGetErrorName is +// invoked, it will call the driver API directly. +// +// For each driver API, we need to define its own lazilyLoadAndInvoke function. +// The function signature of each lazilyLoadAndInvoke must be exactly the same +// as its corresponding driver API, because otherwise, we cannot assign it to +// our function pointers like nvfuser::cuGetErrorName. +// +// We could of course define these lazilyLoadAndInvoke functions manually for +// each driver API, but it would be very tedious and error-prone. 
We want to +// automate this process so that adding a new driver API is as trivial as: +// DEFINE_DRIVER_API_WRAPPER(cuDriverAPIName) +// +// C++'s syntax only allows us to create a function like +// ReturnType lazilyLoadAndInvoke(Arg1 arg1, Arg2 arg2, ...) { +// ... +// } +// Because the number of parameters of the driver API can vary, the only way to +// do it generally that I can think of is to put lazilyLoadAndInvoke into a +// struct template so that we can use parameter pack `typename ... Args`, which +// will be deduced from decltype(cuDriverAPIName), as struct template +// parameters. To make the decltype(cuDriverAPIName) -> ReturnType(Args...) +// deduction work, we can define a ctor for the struct template and add a CTAD +// deduction guide to tell the compiler how to deduce the template parameters. +// +// Doc for CTAD: +// https://en.cppreference.com/w/cpp/language/class_template_argument_deduction +#define DEFINE_DRIVER_API_WRAPPER(funcName) \ + namespace { \ + template \ + struct funcName##Loader { \ + static ReturnType lazilyLoadAndInvoke(Args... 
args) { \ + funcName = (decltype(funcName))loader.sym(#funcName); /* NOTE(review): unsynchronized write to the shared pointer; confirm first calls cannot race */ \ + return funcName(args...); \ + } \ + /* This ctor is just a CTAD helper; it is only used in an */ \ + /* unevaluated context (the decltype below) */ \ + funcName##Loader(ReturnType(Args...)){}; \ + }; \ + \ + /* Deduction guide: deduces the return and argument types */ \ + template \ + funcName##Loader(ReturnType(Args...)) \ + ->funcName##Loader; \ + } \ + \ + decltype(::funcName)* funcName = \ + decltype(funcName##Loader(::funcName))::lazilyLoadAndInvoke + +namespace nvfuser { + +DEFINE_DRIVER_API_WRAPPER(cuGetErrorName); +DEFINE_DRIVER_API_WRAPPER(cuGetErrorString); +DEFINE_DRIVER_API_WRAPPER(cuTensorMapEncodeTiled); + +} // namespace nvfuser + +#undef DEFINE_DRIVER_API_WRAPPER diff --git a/csrc/driver_api.h b/csrc/driver_api.h new file mode 100644 index 00000000000..ffe46682caf --- /dev/null +++ b/csrc/driver_api.h @@ -0,0 +1,29 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include + +// How to lazily load a driver API and invoke it? Just forget about lazy loading +// and write code as if you are using the driver API directly. Magic will +// happen. To understand how the magic works, please refer to the cpp file's doc +// "How does the magic work?" + +namespace nvfuser { + +#define DECLARE_DRIVER_API_WRAPPER(funcName) \ + extern decltype(::funcName)* funcName; + +// List of driver APIs for which you want the magic to happen. +DECLARE_DRIVER_API_WRAPPER(cuGetErrorName); +DECLARE_DRIVER_API_WRAPPER(cuGetErrorString); +DECLARE_DRIVER_API_WRAPPER(cuTensorMapEncodeTiled); + +#undef DECLARE_DRIVER_API_WRAPPER + +} // namespace nvfuser diff --git a/csrc/tma.cpp b/csrc/tma.cpp index 90f37d09ecb..b95bad474ef 100644 --- a/csrc/tma.cpp +++ b/csrc/tma.cpp @@ -17,8 +17,7 @@ #include -#include -#include +#include namespace nvfuser {