diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index 491fe0773..ff86a3e32 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -31,7 +31,7 @@ jobs: targetType: 'inline' script: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. make -j workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index a7597dfa2..99017b91d 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -26,7 +26,7 @@ jobs: targetType: 'inline' script: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. make -j make pylib-copy workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 27cbaf5af..7295171e9 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -45,7 +45,7 @@ jobs: - name: Build run: | - cmake -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON . + cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON . make -j - name: Perform CodeQL Analysis diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a91157d5..66ed4b94b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,6 @@ option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON) option(USE_CUDA "Use NVIDIA/CUDA." OFF) option(USE_ROCM "Use AMD/ROCm." OFF) option(BYPASS_GPU_CHECK "Bypass GPU check." 
OFF) -option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF) if(BYPASS_GPU_CHECK) if(USE_CUDA) @@ -81,16 +80,6 @@ if(USE_CUDA) set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver) set(GPU_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) - - # Find if nvidia_peermem is installed and loaded - if(NOT BYPASS_PEERMEM_CHECK) - execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem" - RESULT_VARIABLE lsmod_result - OUTPUT_VARIABLE lsmod_output) - if(NOT lsmod_result EQUAL 0) - message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.") - endif() - endif() else() set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") diff --git a/src/ib.cc b/src/ib.cc index 1d25b68b6..9955c5269 100644 --- a/src/ib.cc +++ b/src/ib.cc @@ -8,6 +8,7 @@ #include #include +#include <fstream> #include #include #include @@ -16,6 +17,20 @@ #include "api.h" #include "debug.h" +#if !defined(__HIP_PLATFORM_AMD__) + +// Check if nvidia_peermem kernel module is loaded +static bool checkNvPeerMemLoaded() { + std::ifstream file("/proc/modules"); + std::string line; + while (std::getline(file, line)) { + if (line.find("nvidia_peermem") != std::string::npos) return true; + } + return false; +} + +#endif // !defined(__HIP_PLATFORM_AMD__) + namespace mscclpp { IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : buff(buff) { @@ -280,6 +295,11 @@ const ibv_wc* IbQp::getWc(int idx) const { return &this->wcs[idx]; } int IbQp::getNumCqItems() const { return this->numSignaledPostedItems; } IbCtx::IbCtx(const std::string& devName) : devName(devName) { +#if !defined(__HIP_PLATFORM_AMD__) + if (!checkNvPeerMemLoaded()) { + throw mscclpp::Error("nvidia_peermem kernel module is not loaded", ErrorCode::InternalError); + } +#endif // !defined(__HIP_PLATFORM_AMD__) int num; struct ibv_device** devices = ibv_get_device_list(&num); for (int i = 0; i < num; ++i) {