Skip to content

Commit

Permalink
Replace GPUtil with pynvml for benchmark reports (#1451)
Browse files Browse the repository at this point in the history
+ Replace `GPUtil` with `pynvml` to collect GPU stats for benchmark reports.
+ Remove `GPUtil` from the dependency YAML files. `pynvml` is already installed via `nvtabular`.

Closes #1446 

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.

Authors:
  - Eli Fajardo (https://github.com/efajardo-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: #1451
  • Loading branch information
efajardo-nv authored Jan 8, 2024
1 parent b68d769 commit bbc1cda
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 36 deletions.
4 changes: 2 additions & 2 deletions ci/conda/recipes/morpheus/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -110,7 +110,7 @@ outputs:
- {{ pin_compatible('cudatoolkit', min_pin='x.x', max_pin='x') }}
test:
requires:
- gputil
- pynvml
- pytest
- pytest-cov
- pytest-benchmark
Expand Down
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ dependencies:
- flake8
- gcc_linux-64=11.2
- git-lfs
- gputil
- grpcio
- gxx_linux-64=11.2
- huggingface_hub=0.10.1
Expand Down
1 change: 0 additions & 1 deletion conda/environments/dev_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ dependencies:
- flake8
- gcc_linux-64=11.2
- git-lfs
- gputil
- grpcio
- gxx_linux-64=11.2
- include-what-you-use=0.20
Expand Down
3 changes: 1 addition & 2 deletions dependencies.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -235,7 +235,6 @@ dependencies:
- dill
- elasticsearch==8.9.0
- feedparser=6.0.10
- gputil
- grpcio
- mlflow>=2.2.1,<3
- nb_conda_kernels
Expand Down
3 changes: 1 addition & 2 deletions docker/conda/environments/cuda11.8_dev.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -57,7 +57,6 @@ dependencies:
- git>=2.35.3 # Needed for wildcards on safe.directory
- glog=0.6
- gmock>=1.13.0
- gputil
- grpcio
- gtest>=1.13.0
- gxx_linux-64=11.2
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -17,7 +17,8 @@
import json
from os import path

import GPUtil
from pynvml.smi import NVSMI_QUERY_GPU
from pynvml.smi import nvidia_smi

from benchmarks.test_bench_e2e_dfp_pipeline import PIPELINES_CONF

Expand All @@ -32,18 +33,40 @@ def pytest_benchmark_update_json(config, benchmarks, output_json): # pylint:dis

curr_dir = path.dirname(path.abspath(__file__))

gpus = GPUtil.getGPUs()

for i, gpu in enumerate(gpus):
# output_json["machine_info"]["gpu_" + str(i)] = gpu.name
output_json["machine_info"]["gpu_" + str(i)] = {}
output_json["machine_info"]["gpu_" + str(i)]["id"] = gpu.id
output_json["machine_info"]["gpu_" + str(i)]["name"] = gpu.name
output_json["machine_info"]["gpu_" + str(i)]["load"] = f"{gpu.load*100}%"
output_json["machine_info"]["gpu_" + str(i)]["free_memory"] = f"{gpu.memoryFree}MB"
output_json["machine_info"]["gpu_" + str(i)]["used_memory"] = f"{gpu.memoryUsed}MB"
output_json["machine_info"]["gpu_" + str(i)]["temperature"] = f"{gpu.temperature} C"
output_json["machine_info"]["gpu_" + str(i)]["uuid"] = gpu.uuid
query_opts = NVSMI_QUERY_GPU.copy()
nvsmi = nvidia_smi.getInstance()
device_query = nvsmi.DeviceQuery([
query_opts["driver_version"],
query_opts["count"],
query_opts["index"],
query_opts["gpu_name"],
query_opts["gpu_uuid"],
query_opts["memory.total"],
query_opts["memory.used"],
query_opts["memory.free"],
query_opts["utilization.gpu"],
query_opts["utilization.memory"],
query_opts["temperature.gpu"]
])

output_json["machine_info"]["gpu_driver_version"] = device_query["driver_version"]

for gpu in device_query["gpu"]:
gpu_num = gpu["minor_number"]
output_json["machine_info"]["gpu_" + gpu_num] = {}
output_json["machine_info"]["gpu_" + gpu_num]["id"] = gpu_num
output_json["machine_info"]["gpu_" + gpu_num]["name"] = gpu["product_name"]
output_json["machine_info"][
"gpu_" + gpu_num]["utilization"] = f"{gpu['utilization']['gpu_util']}{gpu['utilization']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["total_memory"] = f"{gpu['fb_memory_usage']['total']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["used_memory"] = f"{gpu['fb_memory_usage']['used']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["free_memory"] = f"{gpu['fb_memory_usage']['free']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["temperature"] = f"{gpu['temperature']['gpu_temp']} {gpu['temperature']['unit']}"
output_json["machine_info"]["gpu_" + gpu_num]["uuid"] = gpu["uuid"]

for bench in output_json['benchmarks']:

Expand Down
52 changes: 38 additions & 14 deletions tests/benchmarks/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -19,25 +19,49 @@
import typing
from unittest import mock

import GPUtil
import pytest
from pynvml.smi import NVSMI_QUERY_GPU
from pynvml.smi import nvidia_smi
from test_bench_e2e_pipelines import E2E_TEST_CONFIGS


# pylint: disable=unused-argument
def pytest_benchmark_update_json(config, benchmarks, output_json):
gpus = GPUtil.getGPUs()

for i, gpu in enumerate(gpus):
# output_json["machine_info"]["gpu_" + str(i)] = gpu.name
output_json["machine_info"]["gpu_" + str(i)] = {}
output_json["machine_info"]["gpu_" + str(i)]["id"] = gpu.id
output_json["machine_info"]["gpu_" + str(i)]["name"] = gpu.name
output_json["machine_info"]["gpu_" + str(i)]["load"] = f"{gpu.load*100}%"
output_json["machine_info"]["gpu_" + str(i)]["free_memory"] = f"{gpu.memoryFree}MB"
output_json["machine_info"]["gpu_" + str(i)]["used_memory"] = f"{gpu.memoryUsed}MB"
output_json["machine_info"]["gpu_" + str(i)]["temperature"] = f"{gpu.temperature} C"
output_json["machine_info"]["gpu_" + str(i)]["uuid"] = gpu.uuid

query_opts = NVSMI_QUERY_GPU.copy()
nvsmi = nvidia_smi.getInstance()
device_query = nvsmi.DeviceQuery([
query_opts["driver_version"],
query_opts["count"],
query_opts["index"],
query_opts["gpu_name"],
query_opts["gpu_uuid"],
query_opts["memory.total"],
query_opts["memory.used"],
query_opts["memory.free"],
query_opts["utilization.gpu"],
query_opts["utilization.memory"],
query_opts["temperature.gpu"]
])

output_json["machine_info"]["gpu_driver_version"] = device_query["driver_version"]

for gpu in device_query["gpu"]:
gpu_num = gpu["minor_number"]
output_json["machine_info"]["gpu_" + gpu_num] = {}
output_json["machine_info"]["gpu_" + gpu_num]["id"] = gpu_num
output_json["machine_info"]["gpu_" + gpu_num]["name"] = gpu["product_name"]
output_json["machine_info"][
"gpu_" + gpu_num]["utilization"] = f"{gpu['utilization']['gpu_util']}{gpu['utilization']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["total_memory"] = f"{gpu['fb_memory_usage']['total']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["used_memory"] = f"{gpu['fb_memory_usage']['used']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["free_memory"] = f"{gpu['fb_memory_usage']['free']} {gpu['fb_memory_usage']['unit']}"
output_json["machine_info"][
"gpu_" + gpu_num]["temperature"] = f"{gpu['temperature']['gpu_temp']} {gpu['temperature']['unit']}"
output_json["machine_info"]["gpu_" + gpu_num]["uuid"] = gpu["uuid"]

for bench in output_json['benchmarks']:
if bench["name"] not in E2E_TEST_CONFIGS:
Expand Down

0 comments on commit bbc1cda

Please sign in to comment.