Skip to content

Commit

Permalink
Merge pull request #49 from naved001/calculate-gpu-costs
Browse files Browse the repository at this point in the history
Implement how we gather GPU types.
  • Loading branch information
naved001 authored Mar 26, 2024
2 parents 79126e0 + 902162e commit 87259b3
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 93 deletions.
5 changes: 4 additions & 1 deletion openshift_metrics/openshift_prometheus_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@

CPU_REQUEST = 'kube_pod_resource_request{unit="cores"} unless on(pod, namespace) kube_pod_status_unschedulable'
MEMORY_REQUEST = 'kube_pod_resource_request{unit="bytes"} unless on(pod, namespace) kube_pod_status_unschedulable'
GPU_REQUEST = 'kube_pod_resource_request{resource=~".*gpu.*"} unless on(pod, namespace) kube_pod_status_unschedulable'

# For GPU requests we don't need to exclude unscheduled pods: the join on node
# eliminates them, since unscheduled pods have no node value.
GPU_REQUEST = 'kube_pod_resource_request{resource=~".*gpu.*"} * on(node) group_left(label_nvidia_com_gpu_product) kube_node_labels'


def main():
Expand Down
14 changes: 11 additions & 3 deletions openshift_metrics/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ def test_merge_metrics_not_empty(self):
"metric": {
"pod": "pod1",
"namespace": "namespace1",
"resource": "cpu",
},
"values": [
[0, 100],
Expand All @@ -183,7 +184,8 @@ def test_merge_metrics_not_empty(self):
{
"metric": {
"pod": "pod2",
"namespace": "namespace1"
"namespace": "namespace1",
"resource": "cpu",
},
"values": [
[60, 300],
Expand Down Expand Up @@ -641,7 +643,7 @@ def test_write_metrics_log(self, mock_gna):
},
"pod5": {
"namespace": "namespace2",
"gpu_type": utils.GPU_A2,
"gpu_type": utils.GPU_A100_SXM4,
"metrics": {
0: {
"cpu_request": 24,
Expand All @@ -657,7 +659,7 @@ def test_write_metrics_log(self, mock_gna):
"2023-01,namespace1,namespace1,PI1,,,,76,1128,OpenShift CPU,0.013,14.66\n"
"2023-01,namespace2,namespace2,PI2,,,,,96,OpenShift CPU,0.013,1.25\n"
"2023-01,namespace2,namespace2,PI2,,,,,48,OpenShift GPUA100,1.803,86.54\n"
"2023-01,namespace2,namespace2,PI2,,,,,144,OpenShift GPUA2,0.466,67.1\n")
"2023-01,namespace2,namespace2,PI2,,,,,48,OpenShift GPUA100SXM4,2.078,99.74\n")

with tempfile.NamedTemporaryFile(mode="w+") as tmp:
utils.write_metrics_by_namespace(test_metrics_dict, tmp.name, "2023-01")
Expand All @@ -678,6 +680,12 @@ def test_known_gpu(self):
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_A100_SXM4(self):
    """One A100-SXM4 GPU with its full CPU/RAM allotment maps to exactly one SXM4 SU."""
    su_type, su_count, determining_resource = utils.get_service_unit(
        cpu_count=32, memory_count=245, gpu_count=1, gpu_type=utils.GPU_A100_SXM4
    )
    self.assertEqual(
        (su_type, su_count, determining_resource),
        (utils.SU_A100_SXM4_GPU, 1, "GPU"),
    )

def test_known_gpu_high_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100)
self.assertEqual(su_type, utils.SU_A100_GPU)
Expand Down
138 changes: 49 additions & 89 deletions openshift_metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,24 @@
from requests.adapters import HTTPAdapter

# GPU types
GPU_A100 = "nvidia.com/gpu_A100"
GPU_A2 = "nvidia.com/gpu_A2"
GPU_V100 = "nvidia.com/gpu_V100"
GPU_A100 = "NVIDIA-A100-40GB"
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_GENERIC = "nvidia.com/gpu"
NO_GPU = "No GPU"

# SU Types
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A2_GPU = "OpenShift GPUA2"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN = "Openshift Unknown"

RATE = {
SU_CPU: 0.013,
SU_A100_GPU: 1.803,
SU_A2_GPU: 0.466,
SU_A100_SXM4_GPU: 2.078,
SU_V100_GPU: 1.214,
SU_UNKNOWN_GPU: 0,
}
Expand Down Expand Up @@ -180,7 +180,7 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):

known_gpu_su = {
GPU_A100: SU_A100_GPU,
GPU_A2: SU_A2_GPU,
GPU_A100_SXM4: SU_A100_SXM4_GPU,
GPU_V100: SU_V100_GPU,
GPU_GENERIC: SU_UNKNOWN_GPU,
}
Expand All @@ -189,8 +189,8 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
su_config = {
SU_CPU: {"gpu": -1, "cpu": 1, "ram": 4},
SU_A100_GPU: {"gpu": 1, "cpu": 24, "ram": 74},
SU_A100_SXM4_GPU: {"gpu": 1, "cpu": 32, "ram": 245},
SU_V100_GPU: {"gpu": 1, "cpu": 24, "ram": 192},
SU_A2_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
SU_UNKNOWN_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1},
}
Expand Down Expand Up @@ -227,9 +227,10 @@ def merge_metrics(metric_name, metric_list, output_dict):
if pod not in output_dict:
output_dict[pod] = {"namespace": metric["metric"]["namespace"], "metrics": {}}

gpu_type = metric["metric"].get("resource", NO_GPU)
if gpu_type not in ["cpu", "memory"]:
output_dict[pod]["gpu_type"] = gpu_type
resource = metric["metric"].get("resource")

if resource not in ["cpu", "memory"]:
output_dict[pod]["gpu_type"] = metric["metric"].get("label_nvidia_com_gpu_product", GPU_GENERIC)
else:
output_dict[pod]["gpu_type"] = NO_GPU

Expand Down Expand Up @@ -293,6 +294,24 @@ def csv_writer(rows, file_name):
csvwriter.writerows(rows)


def add_row(rows, report_month, namespace, pi, institution_code, hours, su_type):
    """Append one invoice line for a service-unit type to ``rows``.

    Args:
        rows: list of CSV rows; mutated in place.
        report_month: billing month string, e.g. "2023-01".
        namespace: OpenShift namespace; written to both the cluster-name
            and project-name columns.
        pi: principal investigator / project owner.
        institution_code: institution code for the invoice.
        hours: raw SU-hours for this SU type; billed in whole hours.
        su_type: one of the SU_* constants; used to look up the hourly RATE.
    """
    # Round up once and reuse the value so the hours column and the cost
    # are computed from the same billed quantity.
    billed_hours = math.ceil(hours)
    rate = RATE.get(su_type)
    row = [
        report_month,
        namespace,
        namespace,
        pi,
        "",  # Invoice Email
        "",  # Invoice Address
        "",  # Institution
        institution_code,
        str(billed_hours),
        su_type,
        rate,
        str(round(rate * billed_hours, 2)),
    ]
    rows.append(row)

def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
"""
Process metrics dictionary to aggregate usage by namespace and then write that to a file
Expand Down Expand Up @@ -335,7 +354,7 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
"_memory_hours": 0,
"SU_CPU_HOURS": 0,
"SU_A100_GPU_HOURS": 0,
"SU_A2_GPU_HOURS": 0,
"SU_A100_SXM4_GPU_HOURS": 0,
"SU_V100_GPU_HOURS": 0,
"SU_UNKNOWN_GPU_HOURS": 0,
"total_cost": 0,
Expand All @@ -351,99 +370,40 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):

if gpu_type == GPU_A100:
metrics_by_namespace[namespace]["SU_A100_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_A2:
metrics_by_namespace[namespace]["SU_A2_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_A100_SXM4:
metrics_by_namespace[namespace]["SU_A100_SXM4_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_V100:
metrics_by_namespace[namespace]["SU_V100_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_GENERIC:
metrics_by_namespace[namespace]["SU_UNKNOWN_GPU_HOURS"] += su_count * duration_in_hours
else:
metrics_by_namespace[namespace]["SU_CPU_HOURS"] += su_count * duration_in_hours

for namespace, metrics in metrics_by_namespace.items():

common_args = {
"rows": rows,
"report_month": report_month,
"namespace": namespace,
"pi": metrics["pi"],
"institution_code": metrics["cf_institution_code"]
}

if metrics["SU_CPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(math.ceil(metrics["SU_CPU_HOURS"])),
SU_CPU,
str(RATE.get(SU_CPU)),
str(round(RATE.get(SU_CPU) * math.ceil(metrics["SU_CPU_HOURS"]), 2))
]
rows.append(row)
add_row(hours=metrics["SU_CPU_HOURS"], su_type=SU_CPU, **common_args)

if metrics["SU_A100_GPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(math.ceil(metrics["SU_A100_GPU_HOURS"])),
SU_A100_GPU,
str(RATE.get(SU_A100_GPU)),
str(round(RATE.get(SU_A100_GPU) * math.ceil(metrics["SU_A100_GPU_HOURS"]), 2))
]
rows.append(row)
add_row(hours=metrics["SU_A100_GPU_HOURS"], su_type=SU_A100_GPU, **common_args)

if metrics["SU_A2_GPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(math.ceil(metrics["SU_A2_GPU_HOURS"])),
SU_A2_GPU,
str(RATE.get(SU_A2_GPU)),
str(round(RATE.get(SU_A2_GPU) * math.ceil(metrics["SU_A2_GPU_HOURS"]), 2))
]
rows.append(row)
if metrics["SU_A100_SXM4_GPU_HOURS"] != 0:
add_row(hours=metrics["SU_A100_SXM4_GPU_HOURS"], su_type=SU_A100_SXM4_GPU, **common_args)

if metrics["SU_V100_GPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(match.ceil(metrics["SU_V100_GPU_HOURS"])),
SU_V100_GPU,
str(RATE.get(SU_V100_GPU)),
str(round(RATE.get(SU_V100_GPU) * match.ceil(metrics["SU_V100_GPU_HOURS"]), 2))
]
rows.append(row)
add_row(hours=metrics["SU_V100_GPU_HOURS"], su_type=SU_V100_GPU, **common_args)

if metrics["SU_UNKNOWN_GPU_HOURS"] != 0:
row = [
report_month,
namespace,
namespace,
metrics["pi"],
"", #Invoice Email
"", #Invoice Address
"", #Institution
metrics["cf_institution_code"],
str(math.ceil(metrics["SU_UNKNOWN_GPU_HOURS"])),
SU_UNKNOWN_GPU,
str(RATE.get(SU_UNKNOWN_GPU)),
str(RATE.get(SU_UNKNOWN_GPU) * math.ceil(metrics["SU_UNKNOWN_GPU_HOURS"])) #Cost
]
rows.append(row)
add_row(hours=metrics["SU_UNKNOWN_GPU_HOURS"], su_type=SU_UNKNOWN_GPU, **common_args)

csv_writer(rows, file_name)


Expand Down

0 comments on commit 87259b3

Please sign in to comment.