Skip to content

Commit

Permalink
Merge pull request #49 from naved001/calculate-gpu-costs
Browse files Browse the repository at this point in the history
Implement how we gather GPU types.
  • Loading branch information
naved001 authored Mar 26, 2024
2 parents 79126e0 + 902162e commit 87259b3
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 93 deletions.
5 changes: 4 additions & 1 deletion openshift_metrics/openshift_prometheus_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@

CPU_REQUEST = 'kube_pod_resource_request{unit="cores"} unless on(pod, namespace) kube_pod_status_unschedulable'
MEMORY_REQUEST = 'kube_pod_resource_request{unit="bytes"} unless on(pod, namespace) kube_pod_status_unschedulable'
GPU_REQUEST = 'kube_pod_resource_request{resource=~".*gpu.*"} unless on(pod, namespace) kube_pod_status_unschedulable'

# For GPU requests we don't need to exclude unscheduled pods: the join on node
# eliminates them, since unscheduled pods have no node value.
GPU_REQUEST = 'kube_pod_resource_request{resource=~".*gpu.*"} * on(node) group_left(label_nvidia_com_gpu_product) kube_node_labels'


def main():
Expand Down
14 changes: 11 additions & 3 deletions openshift_metrics/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ def test_merge_metrics_not_empty(self):
"metric": {
"pod": "pod1",
"namespace": "namespace1",
"resource": "cpu",
},
"values": [
[0, 100],
Expand All @@ -183,7 +184,8 @@ def test_merge_metrics_not_empty(self):
{
"metric": {
"pod": "pod2",
"namespace": "namespace1"
"namespace": "namespace1",
"resource": "cpu",
},
"values": [
[60, 300],
Expand Down Expand Up @@ -641,7 +643,7 @@ def test_write_metrics_log(self, mock_gna):
},
"pod5": {
"namespace": "namespace2",
"gpu_type": utils.GPU_A2,
"gpu_type": utils.GPU_A100_SXM4,
"metrics": {
0: {
"cpu_request": 24,
Expand All @@ -657,7 +659,7 @@ def test_write_metrics_log(self, mock_gna):
"2023-01,namespace1,namespace1,PI1,,,,76,1128,OpenShift CPU,0.013,14.66\n"
"2023-01,namespace2,namespace2,PI2,,,,,96,OpenShift CPU,0.013,1.25\n"
"2023-01,namespace2,namespace2,PI2,,,,,48,OpenShift GPUA100,1.803,86.54\n"
"2023-01,namespace2,namespace2,PI2,,,,,144,OpenShift GPUA2,0.466,67.1\n")
"2023-01,namespace2,namespace2,PI2,,,,,48,OpenShift GPUA100SXM4,2.078,99.74\n")

with tempfile.NamedTemporaryFile(mode="w+") as tmp:
utils.write_metrics_by_namespace(test_metrics_dict, tmp.name, "2023-01")
Expand All @@ -678,6 +680,12 @@ def test_known_gpu(self):
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_A100_SXM4(self):
    """One A100-SXM4 GPU with its full CPU/RAM allotment maps to exactly one SXM4 SU."""
    su_type, su_count, determining_resource = utils.get_service_unit(
        cpu_count=32, memory_count=245, gpu_count=1, gpu_type=utils.GPU_A100_SXM4
    )
    self.assertEqual(
        (su_type, su_count, determining_resource),
        (utils.SU_A100_SXM4_GPU, 1, "GPU"),
    )

def test_known_gpu_high_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100)
self.assertEqual(su_type, utils.SU_A100_GPU)
Expand Down
138 changes: 49 additions & 89 deletions openshift_metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,24 @@
from requests.adapters import HTTPAdapter

# GPU types
GPU_A100 = "nvidia.com/gpu_A100"
GPU_A2 = "nvidia.com/gpu_A2"
GPU_V100 = "nvidia.com/gpu_V100"
GPU_A100 = "NVIDIA-A100-40GB"
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_GENERIC = "nvidia.com/gpu"
NO_GPU = "No GPU"

# SU Types
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A2_GPU = "OpenShift GPUA2"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN = "Openshift Unknown"

RATE = {
SU_CPU: 0.013,
SU_A100_GPU: 1.803,
SU_A2_GPU: 0.466,
SU_A100_SXM4_GPU: 2.078,
SU_V100_GPU: 1.214,
SU_UNKNOWN_GPU: 0,
}
Expand Down Expand Up @@ -180,7 +180,7 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):

known_gpu_su = {
GPU_A100: SU_A100_GPU,
GPU_A2: SU_A2_GPU,
GPU_A100_SXM4: SU_A100_SXM4_GPU,
GPU_V100: SU_V100_GPU,
GPU_GENERIC: SU_UNKNOWN_GPU,
}
Expand All @@ -189,8 +189,8 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
su_config = {
SU_CPU: {"gpu": -1, "cpu": 1, "ram": 4},
SU_A100_GPU: {"gpu": 1, "cpu": 24, "ram": 74},
SU_A100_SXM4_GPU: {"gpu": 1, "cpu": 32, "ram": 245},
SU_V100_GPU: {"gpu": 1, "cpu": 24, "ram": 192},
SU_A2_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
SU_UNKNOWN_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1},
}
Expand Down Expand Up @@ -227,9 +227,10 @@ def merge_metrics(metric_name, metric_list, output_dict):
if pod not in output_dict:
output_dict[pod] = {"namespace": metric["metric"]["namespace"], "metrics": {}}

gpu_type = metric["metric"].get("resource", NO_GPU)
if gpu_type not in ["cpu", "memory"]:
output_dict[pod]["gpu_type"] = gpu_type
resource = metric["metric"].get("resource")

if resource not in ["cpu", "memory"]:
output_dict[pod]["gpu_type"] = metric["metric"].get("label_nvidia_com_gpu_product", GPU_GENERIC)
else:
output_dict[pod]["gpu_type"] = NO_GPU

Expand Down Expand Up @@ -293,6 +294,24 @@ def csv_writer(rows, file_name):
csvwriter.writerows(rows)


def add_row(rows, report_month, namespace, pi, institution_code, hours, su_type):
    """Append one invoice line for a service-unit type to ``rows``.

    Args:
        rows: list of CSV rows; mutated in place.
        report_month: billing month string, e.g. "2023-01".
        namespace: OpenShift namespace; written to both the cluster-name
            and project-name columns.
        pi: principal investigator / project owner.
        institution_code: institution code for the invoice.
        hours: raw SU-hours for this SU type; billed in whole hours.
        su_type: one of the SU_* constants; used to look up the hourly RATE.
    """
    # Round up once and reuse the value so the hours column and the cost
    # are computed from the same billed quantity.
    billed_hours = math.ceil(hours)
    rate = RATE.get(su_type)
    row = [
        report_month,
        namespace,
        namespace,
        pi,
        "",  # Invoice Email
        "",  # Invoice Address
        "",  # Institution
        institution_code,
        str(billed_hours),
        su_type,
        rate,
        str(round(rate * billed_hours, 2)),
    ]
    rows.append(row)

def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
"""
Process metrics dictionary to aggregate usage by namespace and then write that to a file
Expand Down Expand Up @@ -335,7 +354,7 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
"_memory_hours": 0,
"SU_CPU_HOURS": 0,
"SU_A100_GPU_HOURS": 0,
"SU_A2_GPU_HOURS": 0,
"SU_A100_SXM4_GPU_HOURS": 0,
"SU_V100_GPU_HOURS": 0,
"SU_UNKNOWN_GPU_HOURS": 0,
"total_cost": 0,
Expand All @@ -351,99 +370,40 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):

if gpu_type == GPU_A100:
metrics_by_namespace[namespace]["SU_A100_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_A2:
metrics_by_namespace[namespace]["SU_A2_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_A100_SXM4:
metrics_by_namespace[namespace]["SU_A100_SXM4_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_V100:
metrics_by_namespace[namespace]["SU_V100_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_GENERIC:
metrics_by_namespace[namespace]["SU_UNKNOWN_GPU_HOURS"] += su_count * duration_in_hours
else:
metrics_by_namespace[namespace]["SU_CPU_HOURS"] += su_count * duration_in_hours

for namespace, metrics in metrics_by_namespace.items():

common_args = {
"rows": rows,
"report_month": report_month,
"namespace": namespace,
"pi": metrics["pi"],
"institution_code": metrics["cf_institution_code"]
}

if metrics["SU_CPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(math.ceil(metrics["SU_CPU_HOURS"])),
SU_CPU,
str(RATE.get(SU_CPU)),
str(round(RATE.get(SU_CPU) * math.ceil(metrics["SU_CPU_HOURS"]), 2))
]
rows.append(row)
add_row(hours=metrics["SU_CPU_HOURS"], su_type=SU_CPU, **common_args)

if metrics["SU_A100_GPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(math.ceil(metrics["SU_A100_GPU_HOURS"])),
SU_A100_GPU,
str(RATE.get(SU_A100_GPU)),
str(round(RATE.get(SU_A100_GPU) * math.ceil(metrics["SU_A100_GPU_HOURS"]), 2))
]
rows.append(row)
add_row(hours=metrics["SU_A100_GPU_HOURS"], su_type=SU_A100_GPU, **common_args)

if metrics["SU_A2_GPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(math.ceil(metrics["SU_A2_GPU_HOURS"])),
SU_A2_GPU,
str(RATE.get(SU_A2_GPU)),
str(round(RATE.get(SU_A2_GPU) * math.ceil(metrics["SU_A2_GPU_HOURS"]), 2))
]
rows.append(row)
if metrics["SU_A100_SXM4_GPU_HOURS"] != 0:
add_row(hours=metrics["SU_A100_SXM4_GPU_HOURS"], su_type=SU_A100_SXM4_GPU, **common_args)

if metrics["SU_V100_GPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(match.ceil(metrics["SU_V100_GPU_HOURS"])),
SU_V100_GPU,
str(RATE.get(SU_V100_GPU)),
str(round(RATE.get(SU_V100_GPU) * match.ceil(metrics["SU_V100_GPU_HOURS"]), 2))
]
rows.append(row)
add_row(hours=metrics["SU_V100_GPU_HOURS"], su_type=SU_V100_GPU, **common_args)

if metrics["SU_UNKNOWN_GPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(math.ceil(metrics["SU_UNKNOWN_GPU_HOURS"])),
SU_UNKNOWN_GPU,
str(RATE.get(SU_UNKNOWN_GPU)),
str(RATE.get(SU_UNKNOWN_GPU) * math.ceil(metrics["SU_UNKNOWN_GPU_HOURS"])) #Cost
]
rows.append(row)
add_row(hours=metrics["SU_UNKNOWN_GPU_HOURS"], su_type=SU_UNKNOWN_GPU, **common_args)

csv_writer(rows, file_name)


Expand Down

0 comments on commit 87259b3

Please sign in to comment.