Check nvidia_peermem during runtime #234

Merged · 5 commits · Dec 25, 2023
Changes from 2 commits
2 changes: 1 addition & 1 deletion .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand Down
2 changes: 1 addition & 1 deletion .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j
make pylib-copy
workingDirectory: '$(System.DefaultWorkingDirectory)'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:

- name: Build
run: |
cmake -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .
cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .
make -j

- name: Perform CodeQL Analysis
Expand Down
11 changes: 0 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(USE_CUDA "Use NVIDIA/CUDA." OFF)
option(USE_ROCM "Use AMD/ROCm." OFF)
option(BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF)

if(BYPASS_GPU_CHECK)
if(USE_CUDA)
Expand Down Expand Up @@ -81,16 +80,6 @@ if(USE_CUDA)

set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
set(GPU_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})

# Find if nvidia_peermem is installed and loaded
if(NOT BYPASS_PEERMEM_CHECK)
execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
RESULT_VARIABLE lsmod_result
OUTPUT_VARIABLE lsmod_output)
if(NOT lsmod_result EQUAL 0)
message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.")
endif()
endif()
else()
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
Expand Down
20 changes: 20 additions & 0 deletions src/ib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <unistd.h>

#include <cstring>
#include <fstream>
#include <mscclpp/core.hpp>
#include <mscclpp/fifo.hpp>
#include <sstream>
Expand All @@ -16,6 +17,20 @@
#include "api.h"
#include "debug.h"

#if !defined(__HIP_PLATFORM_AMD__) || (__HIP_PLATFORM_AMD__ == 0)

// Check if nvidia_peermem kernel module is loaded
static bool checkNvPeerMemLoaded() {
std::ifstream file("/proc/modules");
std::string line;
while (std::getline(file, line)) {
if (line.find("nvidia_peermem") != std::string::npos) return true;
}
return false;
}

#endif // !defined(__HIP_PLATFORM_AMD__) || (__HIP_PLATFORM_AMD__ == 0)

namespace mscclpp {

IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : buff(buff) {
Expand Down Expand Up @@ -280,6 +295,11 @@ const ibv_wc* IbQp::getWc(int idx) const { return &this->wcs[idx]; }
int IbQp::getNumCqItems() const { return this->numSignaledPostedItems; }

IbCtx::IbCtx(const std::string& devName) : devName(devName) {
#if !defined(__HIP_PLATFORM_AMD__) || (__HIP_PLATFORM_AMD__ == 0)
if (!checkNvPeerMemLoaded()) {
throw mscclpp::Error("nvidia_peermem kernel module is not loaded", ErrorCode::InternalError);
}
#endif // !defined(__HIP_PLATFORM_AMD__) || (__HIP_PLATFORM_AMD__ == 0)
int num;
struct ibv_device** devices = ibv_get_device_list(&num);
for (int i = 0; i < num; ++i) {
Expand Down