Skip to content

Commit

Permalink
Avoid exception in get_gpu_info
Browse files Browse the repository at this point in the history
Currently the function calls `run_cmd` which throws on error AND checks
the exit code which is redundant and causes a log message
`"ERROR EasyBuild crashed with an error ..."` to be logged on error
which is confusing as e.g it is VERY unlikely both `nvidia-smi` and
`rocm-smi` are on the system.
So check for existance first and suppress output and error checking of `run_cmd`.
  • Loading branch information
Flamefire committed Nov 28, 2022
1 parent e25756a commit 2c2f07d
Showing 1 changed file with 17 additions and 8 deletions.
25 changes: 17 additions & 8 deletions easybuild/tools/systemtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@

from easybuild.base import fancylogger
from easybuild.tools.build_log import EasyBuildError, print_warning
from easybuild.tools.config import IGNORE
from easybuild.tools.filetools import is_readable, read_file, which
from easybuild.tools.py2vs3 import OrderedDict, string_type
from easybuild.tools.run import run_cmd
Expand Down Expand Up @@ -608,14 +609,19 @@ def get_gpu_info():
"""
Get the GPU info
"""
gpu_info = {}
os_type = get_os_type()
if get_os_type() != LINUX:
_log.info("Only know how to get GPU info on Linux, assuming no GPUs are present")
return {}

if os_type == LINUX:
gpu_info = {}
if not which('nvidia-smi', on_error=IGNORE):
_log.info("nvidia-smi not found. Cannot detect NVIDIA GPUs")
else:
try:
cmd = "nvidia-smi --query-gpu=gpu_name,driver_version --format=csv,noheader"
_log.debug("Trying to determine NVIDIA GPU info on Linux via cmd '%s'", cmd)
out, ec = run_cmd(cmd, force_in_dry_run=True, trace=False, stream_output=False)
out, ec = run_cmd(cmd, simple=False, log_ok=False, log_all=False,
force_in_dry_run=True, trace=False, stream_output=False)
if ec == 0:
for line in out.strip().split('\n'):
nvidia_gpu_info = gpu_info.setdefault('NVIDIA', {})
Expand All @@ -627,16 +633,21 @@ def get_gpu_info():
_log.debug("Exception was raised when running nvidia-smi: %s", err)
_log.info("No NVIDIA GPUs detected")

if not which('rocm-smi', on_error=IGNORE):
_log.info("rocm-smi not found. Cannot detect AMD GPUs")
else:
try:
cmd = "rocm-smi --showdriverversion --csv"
_log.debug("Trying to determine AMD GPU driver on Linux via cmd '%s'", cmd)
out, ec = run_cmd(cmd, force_in_dry_run=True, trace=False, stream_output=False)
out, ec = run_cmd(cmd, simple=False, log_ok=False, log_all=False,
force_in_dry_run=True, trace=False, stream_output=False)
if ec == 0:
amd_driver = out.strip().split('\n')[1].split(',')[1]

cmd = "rocm-smi --showproductname --csv"
_log.debug("Trying to determine AMD GPU info on Linux via cmd '%s'", cmd)
out, ec = run_cmd(cmd, force_in_dry_run=True, trace=False, stream_output=False)
out, ec = run_cmd(cmd, simple=False, log_ok=False, log_all=False,
force_in_dry_run=True, trace=False, stream_output=False)
if ec == 0:
for line in out.strip().split('\n')[1:]:
amd_card_series = line.split(',')[1]
Expand All @@ -650,8 +661,6 @@ def get_gpu_info():
except Exception as err:
_log.debug("Exception was raised when running rocm-smi: %s", err)
_log.info("No AMD GPUs detected")
else:
_log.info("Only know how to get GPU info on Linux, assuming no GPUs are present")

return gpu_info

Expand Down

0 comments on commit 2c2f07d

Please sign in to comment.