From 088e6cf7bb2739a3e22c8756b0d4bdf8610a8af7 Mon Sep 17 00:00:00 2001
From: "Gao, Xiang" <qasdfgtyuiop@gmail.com>
Date: Sat, 23 Sep 2023 10:22:55 -0700
Subject: [PATCH] Lazily load and invoke driver API (#934)

Suggest starting review from `csrc/driver_api.h`
---
 CMakeLists.txt      |   1 +
 csrc/cuda_utils.h   |   1 +
 csrc/driver_api.cpp | 123 ++++++++++++++++++++++++++++++++++++++++++++
 csrc/driver_api.h   |  29 +++++++++++
 csrc/tma.cpp        |   3 +-
 5 files changed, 155 insertions(+), 2 deletions(-)
 create mode 100644 csrc/driver_api.cpp
 create mode 100644 csrc/driver_api.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6becef52068..d7013b042f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,6 +80,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/contiguity.cpp
   ${NVFUSER_SRCS_DIR}/debug.cpp
   ${NVFUSER_SRCS_DIR}/dispatch.cpp
+  ${NVFUSER_SRCS_DIR}/driver_api.cpp
   ${NVFUSER_SRCS_DIR}/dynamic_transform.cpp
   ${NVFUSER_SRCS_DIR}/expr_evaluator.cpp
   ${NVFUSER_SRCS_DIR}/expr_simplifier.cpp
diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h
index 8e1e7095f57..27f08fb3538 100644
--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
@@ -9,6 +9,7 @@
 
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <driver_api.h>
 #include <exceptions.h>
 
 #define NVFUSER_NVRTC_SAFE_CALL(x)               \
diff --git a/csrc/driver_api.cpp b/csrc/driver_api.cpp
new file mode 100644
index 00000000000..a70f2a7ca28
--- /dev/null
+++ b/csrc/driver_api.cpp
@@ -0,0 +1,123 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#include <driver_api.h>
+
+#include <cuda.h>
+#include <dlfcn.h>
+
+#include <iostream>
+
+#include <exceptions.h>
+
+namespace {
+// Lazily dlopen()s the CUDA driver library and resolves symbols from it.
+class CUDADriverAPIDynamicLoader {
+  void* handle_ = nullptr; // stays null until the first sym() call
+
+ public:
+  constexpr static const char* filename = "libcuda.so";
+
+  ~CUDADriverAPIDynamicLoader() {
+    if (handle_) {
+      dlclose(handle_);
+    }
+  }
+  // Opens the library on first use, then looks up `symbolName` via dlsym().
+  void* sym(const char* symbolName) {
+    if (!handle_) {
+      handle_ = dlopen(filename, RTLD_LAZY);
+    }
+    NVF_CHECK(
+        handle_, "Dynamic library not loaded. Please check CUDA installation");
+    void* symbol = dlsym(handle_, symbolName);
+    NVF_CHECK(
+        symbol,
+        "Failed to load symbol: ",
+        symbolName,
+        " ",
+        dlerror(),
+        ". Please check CUDA installation");
+    return symbol;
+  }
+} loader;
+
+} // namespace
+
+// How does the magic work?
+//
+// Let's take driver API cuGetErrorName as an example. Because all of nvFuser's
+// code is in the nvfuser namespace, when we define nvfuser::cuGetErrorName,
+// this name will shadow the driver API cuGetErrorName. So when we write code
+// cuGetErrorName(...), we will be using nvfuser::cuGetErrorName, instead of
+// the driver API cuGetErrorName, due to C++'s name lookup rule. So the goal is
+// to make nvfuser::cuGetErrorName behave just like the driver API
+// cuGetErrorName, except that it is lazily loaded.
+//
+// We define nvfuser::cuGetErrorName as a pointer which is initialized to a
+// function lazilyLoadAndInvoke. When nvfuser::cuGetErrorName is invoked for the
+// first time, it will invoke its lazilyLoadAndInvoke function. This function
+// lazily loads the driver API cuGetErrorName, replaces nvfuser::cuGetErrorName
+// with the loaded driver API function pointer, and calls the newly loaded
+// cuGetErrorName driver API. The next time nvfuser::cuGetErrorName is
+// invoked, it will call the driver API directly.
+//
+// For each driver API, we need to define its own lazilyLoadAndInvoke function.
+// The function signature of each lazilyLoadAndInvoke must be exactly the same
+// as its corresponding driver API, because otherwise, we can not assign it to
+// our function pointers like nvfuser::cuGetErrorName.
+//
+// We could of course define these lazilyLoadAndInvoke functions manually for
+// each driver API, but it would be very tedious and error-prone. We want to
+// automate this process so that adding a new driver API is as trivial as:
+//   DEFINE_DRIVER_API_WRAPPER(cuDriverAPIName)
+//
+// C++'s syntax only allows us to create a function like
+//   ReturnType lazilyLoadAndInvoke(Arg1 arg1, Arg2 arg2, ...) {
+//     ...
+//   }
+// Because the number of parameters of the driver API can vary, the only way to
+// do it generally that I can think of is to put lazilyLoadAndInvoke into a
+// struct template so that we can use parameter pack `typename ... Args`, which
+// will be deduced from decltype(cuDriverAPIName), as struct template
+// parameters. To make the decltype(cuDriverAPIName) -> ReturnType(Args...)
+// deduction work, we can define a ctor for the struct template and add a CTAD
+// rule to tell the compiler how to deduce the template parameters.
+//
+// Doc for CTAD:
+// https://en.cppreference.com/w/cpp/language/class_template_argument_deduction
+#define DEFINE_DRIVER_API_WRAPPER(funcName)                     \
+  namespace {                                                   \
+  template <typename ReturnType, typename... Args>              \
+  struct funcName##Loader {                                     \
+    static ReturnType lazilyLoadAndInvoke(Args... args) {       \
+      funcName = (decltype(funcName))loader.sym(#funcName);     \
+      return funcName(args...);                                 \
+    }                                                           \
+    /* This ctor is just a CTAD helper, it is only used in a */ \
+    /* unevaluated context (the decltype below). */             \
+    funcName##Loader(ReturnType(Args...)){};                    \
+  };                                                            \
+                                                                \
+  /* Use CTAD rule to deduce return and argument types */       \
+  template <typename ReturnType, typename... Args>              \
+  funcName##Loader(ReturnType(Args...))                         \
+      ->funcName##Loader<ReturnType, Args...>;                  \
+  }                                                             \
+                                                                \
+  decltype(::funcName)* funcName =                              \
+      decltype(funcName##Loader(::funcName))::lazilyLoadAndInvoke
+
+namespace nvfuser {
+// One lazily-loaded pointer per driver API; keep in sync with driver_api.h.
+DEFINE_DRIVER_API_WRAPPER(cuGetErrorName);
+DEFINE_DRIVER_API_WRAPPER(cuGetErrorString);
+DEFINE_DRIVER_API_WRAPPER(cuTensorMapEncodeTiled);
+
+} // namespace nvfuser
+
+#undef DEFINE_DRIVER_API_WRAPPER
diff --git a/csrc/driver_api.h b/csrc/driver_api.h
new file mode 100644
index 00000000000..ffe46682caf
--- /dev/null
+++ b/csrc/driver_api.h
@@ -0,0 +1,29 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+
+#include <cuda.h>
+
+// How to lazily load a driver API and invoke it? Just forget about lazy loading
+// and write code as if you are using the driver API directly. Magic will
+// happen. To understand how the magic works, please refer to the cpp file's doc
+// "How does the magic work?"
+
+namespace nvfuser {
+// Declares funcName as a function pointer; each use supplies its own `;`.
+#define DECLARE_DRIVER_API_WRAPPER(funcName) \
+  extern decltype(::funcName)* funcName
+
+// List of driver APIs for which the magic should happen.
+DECLARE_DRIVER_API_WRAPPER(cuGetErrorName);
+DECLARE_DRIVER_API_WRAPPER(cuGetErrorString);
+DECLARE_DRIVER_API_WRAPPER(cuTensorMapEncodeTiled);
+
+#undef DECLARE_DRIVER_API_WRAPPER
+
+} // namespace nvfuser
diff --git a/csrc/tma.cpp b/csrc/tma.cpp
index 90f37d09ecb..b95bad474ef 100644
--- a/csrc/tma.cpp
+++ b/csrc/tma.cpp
@@ -17,8 +17,7 @@
 
 #include <cstdint>
 
-#include <cuda.h>
-#include <cuda_runtime.h>
+#include <driver_api.h>
 
 namespace nvfuser {