Skip to content

Commit

Permalink
Check nvidia_peermem at runtime (#234)
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang authored Dec 25, 2023
1 parent 6202b10 commit 5fa5bd2
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand Down
2 changes: 1 addition & 1 deletion .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j
make pylib-copy
workingDirectory: '$(System.DefaultWorkingDirectory)'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
- name: Build
run: |
cmake -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .
cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .
make -j
- name: Perform CodeQL Analysis
Expand Down
11 changes: 0 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(USE_CUDA "Use NVIDIA/CUDA." OFF)
option(USE_ROCM "Use AMD/ROCm." OFF)
option(BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF)

if(BYPASS_GPU_CHECK)
if(USE_CUDA)
Expand Down Expand Up @@ -81,16 +80,6 @@ if(USE_CUDA)

set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
set(GPU_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})

# Find if nvidia_peermem is installed and loaded
if(NOT BYPASS_PEERMEM_CHECK)
execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
RESULT_VARIABLE lsmod_result
OUTPUT_VARIABLE lsmod_output)
if(NOT lsmod_result EQUAL 0)
message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.")
endif()
endif()
else()
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
Expand Down
20 changes: 20 additions & 0 deletions src/ib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <unistd.h>

#include <cstring>
#include <fstream>
#include <mscclpp/core.hpp>
#include <mscclpp/fifo.hpp>
#include <sstream>
Expand All @@ -16,6 +17,20 @@
#include "api.h"
#include "debug.h"

#if !defined(__HIP_PLATFORM_AMD__)

// Check if nvidia_peermem kernel module is loaded
// Check whether the nvidia_peermem kernel module is loaded by scanning a
// /proc/modules-style listing (one module per line, name in the first field).
// @param modulesPath path of the module list to scan; defaults to the kernel's
//        /proc/modules. Overridable so the probe can be exercised in tests.
// @return true if any line mentions "nvidia_peermem"; false otherwise,
//         including when the file cannot be opened (getline on a failed
//         stream yields no lines, so the loop simply never runs).
static bool checkNvPeerMemLoaded(const char* modulesPath = "/proc/modules") {
  std::ifstream file(modulesPath);
  std::string line;
  while (std::getline(file, line)) {
    if (line.find("nvidia_peermem") != std::string::npos) return true;
  }
  return false;
}

#endif // !defined(__HIP_PLATFORM_AMD__)

namespace mscclpp {

IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : buff(buff) {
Expand Down Expand Up @@ -280,6 +295,11 @@ const ibv_wc* IbQp::getWc(int idx) const { return &this->wcs[idx]; }
int IbQp::getNumCqItems() const { return this->numSignaledPostedItems; }

IbCtx::IbCtx(const std::string& devName) : devName(devName) {
#if !defined(__HIP_PLATFORM_AMD__)
if (!checkNvPeerMemLoaded()) {
throw mscclpp::Error("nvidia_peermem kernel module is not loaded", ErrorCode::InternalError);
}
#endif // !defined(__HIP_PLATFORM_AMD__)
int num;
struct ibv_device** devices = ibv_get_device_list(&num);
for (int i = 0; i < num; ++i) {
Expand Down

0 comments on commit 5fa5bd2

Please sign in to comment.