Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check whether the nvidia-smi/rocm-smi command is available before trying to run it in get_gpu_info #4131

Merged
merged 1 commit on Dec 4, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
25 changes: 17 additions & 8 deletions easybuild/tools/systemtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@

from easybuild.base import fancylogger
from easybuild.tools.build_log import EasyBuildError, print_warning
from easybuild.tools.config import IGNORE
from easybuild.tools.filetools import is_readable, read_file, which
from easybuild.tools.py2vs3 import OrderedDict, string_type
from easybuild.tools.run import run_cmd
Expand Down Expand Up @@ -608,14 +609,19 @@ def get_gpu_info():
"""
Get the GPU info
"""
gpu_info = {}
os_type = get_os_type()
if get_os_type() != LINUX:
_log.info("Only know how to get GPU info on Linux, assuming no GPUs are present")
return {}

if os_type == LINUX:
gpu_info = {}
if not which('nvidia-smi', on_error=IGNORE):
_log.info("nvidia-smi not found. Cannot detect NVIDIA GPUs")
else:
try:
cmd = "nvidia-smi --query-gpu=gpu_name,driver_version --format=csv,noheader"
_log.debug("Trying to determine NVIDIA GPU info on Linux via cmd '%s'", cmd)
out, ec = run_cmd(cmd, force_in_dry_run=True, trace=False, stream_output=False)
out, ec = run_cmd(cmd, simple=False, log_ok=False, log_all=False,
force_in_dry_run=True, trace=False, stream_output=False)
if ec == 0:
for line in out.strip().split('\n'):
nvidia_gpu_info = gpu_info.setdefault('NVIDIA', {})
Expand All @@ -627,16 +633,21 @@ def get_gpu_info():
_log.debug("Exception was raised when running nvidia-smi: %s", err)
_log.info("No NVIDIA GPUs detected")

if not which('rocm-smi', on_error=IGNORE):
_log.info("rocm-smi not found. Cannot detect AMD GPUs")
else:
try:
cmd = "rocm-smi --showdriverversion --csv"
_log.debug("Trying to determine AMD GPU driver on Linux via cmd '%s'", cmd)
out, ec = run_cmd(cmd, force_in_dry_run=True, trace=False, stream_output=False)
out, ec = run_cmd(cmd, simple=False, log_ok=False, log_all=False,
force_in_dry_run=True, trace=False, stream_output=False)
if ec == 0:
amd_driver = out.strip().split('\n')[1].split(',')[1]

cmd = "rocm-smi --showproductname --csv"
_log.debug("Trying to determine AMD GPU info on Linux via cmd '%s'", cmd)
out, ec = run_cmd(cmd, force_in_dry_run=True, trace=False, stream_output=False)
out, ec = run_cmd(cmd, simple=False, log_ok=False, log_all=False,
force_in_dry_run=True, trace=False, stream_output=False)
if ec == 0:
for line in out.strip().split('\n')[1:]:
amd_card_series = line.split(',')[1]
Expand All @@ -650,8 +661,6 @@ def get_gpu_info():
except Exception as err:
_log.debug("Exception was raised when running rocm-smi: %s", err)
_log.info("No AMD GPUs detected")
else:
_log.info("Only know how to get GPU info on Linux, assuming no GPUs are present")

return gpu_info

Expand Down