From 7b58b0ff307d871ac20a18bd7cebef77d7c516a1 Mon Sep 17 00:00:00 2001
From: Qian Bao
Date: Mon, 24 Jul 2023 22:10:06 +0800
Subject: [PATCH] [bug] Fix cusolver load failure caused by version difference (#8283)

Issue: #8227

### Brief Summary

Users encountered failures when loading the `cusparse` and `cusolver` libs, which are required by `ti.linalg.SparseSolver`, on Windows. The failure is usually caused by a version difference between the user's CUDA driver and CUDA Toolkit. For instance, the user in issue #8227 had CUDA Driver 12.1 and CUDA Toolkit 11.8. To make things worse, Nvidia tends to ship `cusolver` and `cusparse` libs whose version number differs from the major version of the CUDA Toolkit. As an example, CUDA Toolkit 12.2 ships a `cusolver64_11.dll` instead of `cusolver64_12.dll`:

![Screenshot 2023-07-13 145549](https://github.com/taichi-dev/taichi/assets/2747993/e6cc0da1-c3e3-4e61-885d-2c50e1c87025)

### Walkthrough

Instead of only trying the `cusolver` / `cusparse` lib whose version matches the CUDA driver, this PR tries the driver's major version first and then falls back to the lib that is one major version behind the CUDA driver:

```c++
// Get the CUDA Driver's version
int cuda_version = CUDADriver::get_instance().get_version_major();
// Try to load the cusolver lib whose version is derived from the CUDA driver
cusolver_loaded_ = try_load_lib_any_version("cusolver", "64_",
                                            {cuda_version, cuda_version - 1});
```

This might not be 100% reliable, since there is no guarantee that Nvidia will only ship libs whose versions are at most one version behind, but it should solve the user's issue for the moment.
### Related PR PR#7724: https://github.com/taichi-dev/taichi/pull/7724 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/rhi/cuda/cuda_driver.cpp | 148 +++++++++++++++++--------------- 1 file changed, 79 insertions(+), 69 deletions(-) diff --git a/taichi/rhi/cuda/cuda_driver.cpp b/taichi/rhi/cuda/cuda_driver.cpp index 9593f95f71cd0..254b8a4813c22 100644 --- a/taichi/rhi/cuda/cuda_driver.cpp +++ b/taichi/rhi/cuda/cuda_driver.cpp @@ -54,6 +54,64 @@ bool CUDADriverBase::check_lib_loaded(std::string lib_linux, return DynamicLoader::check_lib_loaded(lib_name); } +std::string get_lib_name_linux(const std::string &lib_name, int version) { + return "lib" + lib_name + ".so." + std::to_string(version); +} + +std::string get_lib_name_windows(const std::string &lib_name, + const std::string &win_arch_name, + int version) { + return lib_name + win_arch_name + std::to_string(version) + ".dll"; +} + +bool CUDADriverBase::try_load_lib_any_version( + const std::string &lib_name, + const std::string &win_arch_name, + const std::vector &versions_to_try) { + // Check if any versions of this lib are already loaded. + for (auto version : versions_to_try) { + std::string lib_name_linux = get_lib_name_linux(lib_name, version); + std::string lib_name_windows = + get_lib_name_windows(lib_name, win_arch_name, version); + if (check_lib_loaded(lib_name_linux, lib_name_windows)) { + load_lib(lib_name_linux, lib_name_windows); + return true; + } + } + + // Try load any version of this lib if none of them are loaded. 
+ bool loaded = false; + if (!loaded) { +#ifdef WIN32 + for (auto version : versions_to_try) { + std::string lib_name_windows = + get_lib_name_windows(lib_name, win_arch_name, version); + loader_ = std::make_unique(lib_name_windows); + loaded = loader_->loaded(); + if (loaded) { + break; + } + } +#else + for (auto version : versions_to_try) { + std::string lib_name_linux = get_lib_name_linux(lib_name, version); + loader_ = std::make_unique(lib_name_linux); + loaded = loader_->loaded(); + if (loaded) { + break; + } + } + if (!loaded) { + // Use the default version on linux. + std::string lib_name_linux = "lib" + lib_name + ".so"; + loader_ = std::make_unique(lib_name_linux); + loaded = loader_->loaded(); + } +#endif + } + return loaded; +} + bool CUDADriver::detected() { return !disabled_by_env_ && cuda_version_valid_ && loader_->loaded(); } @@ -129,12 +187,15 @@ CUSPARSEDriver &CUSPARSEDriver::get_instance() { } bool CUSPARSEDriver::load_cusparse() { - cusparse_loaded_ = load_lib( - "libcusparse.so", - "cusparse64_" + - std::to_string(CUDADriver::get_instance().get_version_major()) + - ".dll"); - + /* + Load the cuSparse lib whose version follows the CUDA driver's version. + See load_cusolver() for more information. + */ + // Get the CUDA Driver's version + int cuda_version = CUDADriver::get_instance().get_version_major(); + // Try to load the cusparse lib whose version is derived from the CUDA driver + cusparse_loaded_ = try_load_lib_any_version("cusparse", "64_", + {cuda_version, cuda_version - 1}); if (!cusparse_loaded_) { return false; } @@ -156,11 +217,18 @@ CUSOLVERDriver &CUSOLVERDriver::get_instance() { } bool CUSOLVERDriver::load_cusolver() { - cusolver_loaded_ = load_lib( - "libcusolver.so", - "cusolver64_" + - std::to_string(CUDADriver::get_instance().get_version_major()) + - ".dll"); + /* + Load the cuSolver lib whose version follows the CUDA driver's version. + Note that cusolver's filename is NOT necessarily the same with CUDA Toolkit + (on Windows). 
For instance, CUDA Toolkit 12.2 ships a cusolver64_11.dll + (checked on 2023.7.13) Therefore, the following function attempts to load a + cusolver lib which is one version backward from the CUDA Driver's version. + */ + // Get the CUDA Driver's version + int cuda_version = CUDADriver::get_instance().get_version_major(); + // Try to load the cusolver lib whose version is derived from the CUDA driver + cusolver_loaded_ = try_load_lib_any_version("cusolver", "64_", + {cuda_version, cuda_version - 1}); if (!cusolver_loaded_) { return false; } @@ -181,64 +249,6 @@ CUBLASDriver &CUBLASDriver::get_instance() { return *instance; } -std::string get_lib_name_linux(const std::string &lib_name, int version) { - return "lib" + lib_name + ".so." + std::to_string(version); -} - -std::string get_lib_name_windows(const std::string &lib_name, - const std::string &win_arch_name, - int version) { - return lib_name + win_arch_name + std::to_string(version) + ".dll"; -} - -bool CUDADriverBase::try_load_lib_any_version( - const std::string &lib_name, - const std::string &win_arch_name, - const std::vector &versions_to_try) { - // Check if any versions of this lib are already loaded. - for (auto version : versions_to_try) { - std::string lib_name_linux = get_lib_name_linux(lib_name, version); - std::string lib_name_windows = - get_lib_name_windows(lib_name, win_arch_name, version); - if (check_lib_loaded(lib_name_linux, lib_name_windows)) { - load_lib(lib_name_linux, lib_name_windows); - return true; - } - } - - // Try load any version of this lib if none of them are loaded. 
- bool loaded = false; - if (!loaded) { -#ifdef WIN32 - for (auto version : versions_to_try) { - std::string lib_name_windows = - get_lib_name_windows(lib_name, win_arch_name, version); - loader_ = std::make_unique(lib_name_windows); - loaded = loader_->loaded(); - if (loaded) { - break; - } - } -#else - for (auto version : versions_to_try) { - std::string lib_name_linux = get_lib_name_linux(lib_name, version); - loader_ = std::make_unique(lib_name_linux); - loaded = loader_->loaded(); - if (loaded) { - break; - } - } - if (!loaded) { - // Use the default version on linux. - std::string lib_name_linux = "lib" + lib_name + ".so"; - loader_ = std::make_unique(lib_name_linux); - loaded = loader_->loaded(); - } -#endif - } - return loaded; -} - bool CUBLASDriver::load_cublas() { /* To be compatible with torch environment, please libcublas.so.11 other than * libcublas.so. When using libcublas.so, the system cublas will be loaded and