Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace GPUtil with pynvml for benchmark reports #1451

Merged
merged 3 commits into from
Jan 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ci/conda/recipes/morpheus/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -110,7 +110,7 @@ outputs:
- {{ pin_compatible('cudatoolkit', min_pin='x.x', max_pin='x') }}
test:
requires:
- gputil
- pynvml
- pytest
- pytest-cov
- pytest-benchmark
Expand Down
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ dependencies:
- flake8
- gcc_linux-64=11.2
- git-lfs
- gputil
- grpcio
- gxx_linux-64=11.2
- huggingface_hub=0.10.1
Expand Down
1 change: 0 additions & 1 deletion conda/environments/dev_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ dependencies:
- flake8
- gcc_linux-64=11.2
- git-lfs
- gputil
- grpcio
- gxx_linux-64=11.2
- include-what-you-use=0.20
Expand Down
3 changes: 1 addition & 2 deletions dependencies.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -235,7 +235,6 @@ dependencies:
- dill
- elasticsearch==8.9.0
- feedparser=6.0.10
- gputil
- grpcio
- mlflow>=2.2.1,<3
- nb_conda_kernels
Expand Down
3 changes: 1 addition & 2 deletions docker/conda/environments/cuda11.8_dev.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -57,7 +57,6 @@ dependencies:
- git>=2.35.3 # Needed for wildcards on safe.directory
- glog=0.6
- gmock>=1.13.0
- gputil
- grpcio
- gtest>=1.13.0
- gxx_linux-64=11.2
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -17,7 +17,8 @@
import json
from os import path

import GPUtil
from pynvml.smi import NVSMI_QUERY_GPU
from pynvml.smi import nvidia_smi

from benchmarks.test_bench_e2e_dfp_pipeline import PIPELINES_CONF

Expand All @@ -32,18 +33,40 @@ def pytest_benchmark_update_json(config, benchmarks, output_json): # pylint:dis

curr_dir = path.dirname(path.abspath(__file__))

gpus = GPUtil.getGPUs()

for i, gpu in enumerate(gpus):
# output_json["machine_info"]["gpu_" + str(i)] = gpu.name
output_json["machine_info"]["gpu_" + str(i)] = {}
output_json["machine_info"]["gpu_" + str(i)]["id"] = gpu.id
output_json["machine_info"]["gpu_" + str(i)]["name"] = gpu.name
output_json["machine_info"]["gpu_" + str(i)]["load"] = f"{gpu.load*100}%"
output_json["machine_info"]["gpu_" + str(i)]["free_memory"] = f"{gpu.memoryFree}MB"
output_json["machine_info"]["gpu_" + str(i)]["used_memory"] = f"{gpu.memoryUsed}MB"
output_json["machine_info"]["gpu_" + str(i)]["temperature"] = f"{gpu.temperature} C"
output_json["machine_info"]["gpu_" + str(i)]["uuid"] = gpu.uuid
query_opts = NVSMI_QUERY_GPU.copy()
nvsmi = nvidia_smi.getInstance()
device_query = nvsmi.DeviceQuery([
query_opts["driver_version"],
query_opts["count"],
query_opts["index"],
query_opts["gpu_name"],
query_opts["gpu_uuid"],
query_opts["memory.total"],
query_opts["memory.used"],
query_opts["memory.free"],
query_opts["utilization.gpu"],
query_opts["utilization.memory"],
query_opts["temperature.gpu"]
])

output_json["machine_info"]["gpu_driver_version"] = device_query["driver_version"]

for gpu in device_query["gpu"]:
gpu_num = gpu["minor_number"]
output_json["machine_info"]["gpu_" + gpu_num] = {}
output_json["machine_info"]["gpu_" + gpu_num]["id"] = gpu_num
output_json["machine_info"]["gpu_" + gpu_num]["name"] = gpu["product_name"]
output_json["machine_info"][
"gpu_" + gpu_num]["utilization"] = f"{gpu['utilization']['gpu_util']}{gpu['utilization']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["total_memory"] = f"{gpu['fb_memory_usage']['total']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["used_memory"] = f"{gpu['fb_memory_usage']['used']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["free_memory"] = f"{gpu['fb_memory_usage']['free']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["temperature"] = f"{gpu['temperature']['gpu_temp']} {gpu['temperature']['unit']}"
output_json["machine_info"]["gpu_" + gpu_num]["uuid"] = gpu["uuid"]

for bench in output_json['benchmarks']:

Expand Down
52 changes: 38 additions & 14 deletions tests/benchmarks/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -19,25 +19,49 @@
import typing
from unittest import mock

import GPUtil
import pytest
from pynvml.smi import NVSMI_QUERY_GPU
from pynvml.smi import nvidia_smi
from test_bench_e2e_pipelines import E2E_TEST_CONFIGS


# pylint: disable=unused-argument
def pytest_benchmark_update_json(config, benchmarks, output_json):
gpus = GPUtil.getGPUs()

for i, gpu in enumerate(gpus):
# output_json["machine_info"]["gpu_" + str(i)] = gpu.name
output_json["machine_info"]["gpu_" + str(i)] = {}
output_json["machine_info"]["gpu_" + str(i)]["id"] = gpu.id
output_json["machine_info"]["gpu_" + str(i)]["name"] = gpu.name
output_json["machine_info"]["gpu_" + str(i)]["load"] = f"{gpu.load*100}%"
output_json["machine_info"]["gpu_" + str(i)]["free_memory"] = f"{gpu.memoryFree}MB"
output_json["machine_info"]["gpu_" + str(i)]["used_memory"] = f"{gpu.memoryUsed}MB"
output_json["machine_info"]["gpu_" + str(i)]["temperature"] = f"{gpu.temperature} C"
output_json["machine_info"]["gpu_" + str(i)]["uuid"] = gpu.uuid

query_opts = NVSMI_QUERY_GPU.copy()
nvsmi = nvidia_smi.getInstance()
device_query = nvsmi.DeviceQuery([
query_opts["driver_version"],
query_opts["count"],
query_opts["index"],
query_opts["gpu_name"],
query_opts["gpu_uuid"],
query_opts["memory.total"],
query_opts["memory.used"],
query_opts["memory.free"],
query_opts["utilization.gpu"],
query_opts["utilization.memory"],
query_opts["temperature.gpu"]
])

output_json["machine_info"]["gpu_driver_version"] = device_query["driver_version"]

for gpu in device_query["gpu"]:
gpu_num = gpu["minor_number"]
output_json["machine_info"]["gpu_" + gpu_num] = {}
output_json["machine_info"]["gpu_" + gpu_num]["id"] = gpu_num
output_json["machine_info"]["gpu_" + gpu_num]["name"] = gpu["product_name"]
output_json["machine_info"][
"gpu_" + gpu_num]["utilization"] = f"{gpu['utilization']['gpu_util']}{gpu['utilization']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["total_memory"] = f"{gpu['fb_memory_usage']['total']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["used_memory"] = f"{gpu['fb_memory_usage']['used']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["free_memory"] = f"{gpu['fb_memory_usage']['free']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["temperature"] = f"{gpu['temperature']['gpu_temp']} {gpu['temperature']['unit']}"
output_json["machine_info"]["gpu_" + gpu_num]["uuid"] = gpu["uuid"]

for bench in output_json['benchmarks']:
if bench["name"] not in E2E_TEST_CONFIGS:
Expand Down