
Commit ba268c8
Merge pull request #58 from naved001/prepare-for-mig
Prepare for mig
naved001 authored Apr 12, 2024
2 parents a828c80 + 29d9eee commit ba268c8
Showing 3 changed files with 112 additions and 46 deletions.
2 changes: 1 addition & 1 deletion openshift_metrics/openshift_prometheus_metrics.py
@@ -27,7 +27,7 @@

# For GPU requests, we don't need to exclude unscheduled pods because the join on node will eliminate those as unscheduled
# pods don't have a node value
GPU_REQUEST = 'kube_pod_resource_request{resource=~".*gpu.*"} * on(node) group_left(label_nvidia_com_gpu_product) kube_node_labels'
GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*"} * on(node) group_left(label_nvidia_com_gpu_product, label_nvidia_com_gpu_machine) kube_node_labels'


def main():
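The updated GPU_REQUEST query narrows kube_pod_resource_request to NVIDIA resources (whole GPUs and MIG slices alike, via resource=~"nvidia.com.*") and joins on node against kube_node_labels, so every series also carries the node's GPU product and machine labels. A minimal sketch of one series in the shape that merge_metrics (later in this diff) consumes; the concrete values are illustrative, not taken from the diff:

gpu_request_series = {
    "metric": {
        "pod": "pod1",
        "namespace": "namespace1",
        "node": "wrk-1",  # only present for scheduled pods, which is why the join drops unscheduled ones
        "resource": "nvidia.com/gpu",  # or a MIG slice such as "nvidia.com/mig-1g.5gb"
        "label_nvidia_com_gpu_product": "NVIDIA-A100-SXM4-40GB",
        "label_nvidia_com_gpu_machine": "example-machine",  # hypothetical label value
    },
    "values": [[0, "1"], [900, "1"]],  # (epoch seconds, requested GPU count) sample pairs
}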
89 changes: 59 additions & 30 deletions openshift_metrics/tests/test_utils.py
@@ -416,17 +416,20 @@ def test_merge_metrics_not_empty_with_gpu(self):
0: {
"cpu": 10,
"gpu_request": 1,
"gpu_type": "Tesla-V100-PCIE-32GB"
"gpu_type": "Tesla-V100-PCIE-32GB",
"gpu_resource": "nvidia.com/gpu",
},
60: {
"cpu": 15,
"gpu_request": 1,
"gpu_type": "Tesla-V100-PCIE-32GB"
"gpu_type": "Tesla-V100-PCIE-32GB",
"gpu_resource": "nvidia.com/gpu",
},
120: {
"cpu": 20,
"gpu_request": 2,
"gpu_type": "Tesla-V100-PCIE-32GB"
"gpu_type": "Tesla-V100-PCIE-32GB",
"gpu_resource": "nvidia.com/gpu",
},
}
},
@@ -791,23 +794,25 @@ def test_write_metrics_log(self, mock_gna):
test_metrics_dict = {
"pod1": {
"namespace": "namespace1",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 10,
"memory_request": 1048576,
"duration": 120
"duration": 120,
"node": "wrk-1",
"node_model": "Dell",
},
120: {
"cpu_request": 20,
"memory_request": 1048576,
"duration": 60
"duration": 60,
"node": "wrk-2",
"node_model": "Lenovo"
}
}
},
"pod2": {
"namespace": "namespace1",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 20,
@@ -828,7 +833,6 @@
},
"pod3": {
"namespace": "namespace2",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 45,
@@ -839,7 +843,6 @@
},
"pod4": { # this results in 0.5 SU
"namespace": "namespace2",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 0.5,
@@ -850,14 +853,14 @@
},
}

expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,Memory Request (GiB),Determining Resource,SU Type,SU Count\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,No GPU,0.001,CPU,OpenShift CPU,10.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,No GPU,0.001,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,No GPU,0.0098,CPU,OpenShift CPU,25.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,No GPU,0.0977,CPU,OpenShift CPU,45.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod4,0.5,0,No GPU,2.0,CPU,OpenShift CPU,0.5\n")
expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,GPU Resource,Node,Node Model,Memory Request (GiB),Determining Resource,SU Type,SU Count\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,,,wrk-1,Dell,0.001,CPU,OpenShift CPU,10.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,,,wrk-2,Lenovo,0.001,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,25.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,,,Unknown Node,Unknown Model,0.0977,CPU,OpenShift CPU,45.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod4,0.5,0,,,Unknown Node,Unknown Model,2.0,CPU,OpenShift CPU,0.5\n")

with tempfile.NamedTemporaryFile(mode="w+") as tmp:
utils.write_metrics_by_pod(test_metrics_dict, tmp.name)
@@ -928,6 +931,7 @@ def test_write_metrics_log(self, mock_gna):
"memory_request": 8 * 2**30,
"gpu_request": 1,
"gpu_type": utils.GPU_A100,
"gpu_resource": utils.WHOLE_GPU,
"duration": 172700 # little under 48 hours, expect to be rounded up in the output
},
}
@@ -941,6 +945,7 @@ def test_write_metrics_log(self, mock_gna):
"memory_request": 8 * 2**30,
"gpu_request": 1,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_resource": utils.WHOLE_GPU,
"duration": 172800
},
}
@@ -961,79 +966,103 @@ def test_write_metrics_log(self, mock_gna):
class TestGetServiceUnit(TestCase):

def test_cpu_only(self):
su_type, su_count, determining_resource = utils.get_service_unit(4, 16, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(4, 16, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 4)
self.assertEqual(determining_resource, "CPU")

def test_known_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(24, 74, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(24, 74, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_A100_SXM4(self):
su_type, su_count, determining_resource = utils.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4)
su_type, su_count, determining_resource = utils.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_SXM4_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_high_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 3)
self.assertEqual(determining_resource, "CPU")

def test_known_gpu_high_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(24, 100, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(24, 100, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 2)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_low_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(2, 4, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(2, 4, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_unknown_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 1, "Unknown_GPU_Type")
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 1, "Unknown_GPU_Type", utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_zero_count(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_known_mig_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100_SXM4, utils.MIG_1G_5GB)
self.assertEqual(su_type, utils.SU_UNKNOWN_MIG_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_unknown_resource(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100, "nvidia.com/mig_20G_500GB")
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_unknown_gpu_known_resource(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, "Unknown GPU", utils.MIG_2G_10GB)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_zero_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 0, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(1, 0, 0, None, None)
self.assertEqual(su_type, utils.SU_UNKNOWN)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "CPU")

def test_zero_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(0, 1, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(0, 1, 0, None, None)
self.assertEqual(su_type, utils.SU_UNKNOWN)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "CPU")

def test_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 16)
self.assertEqual(determining_resource, "RAM")

def test_fractional_su_cpu_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.5)
self.assertEqual(determining_resource, "CPU")

def test_fractional_su_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.25)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_fractional_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")
67 changes: 52 additions & 15 deletions openshift_metrics/utils.py
@@ -29,14 +29,22 @@
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
NO_GPU = "No GPU"

# GPU Resource - MIG Geometries
# A100 Strategies
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
WHOLE_GPU = "nvidia.com/gpu"


# SU Types
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
SU_UNKNOWN = "Openshift Unknown"

RATE = {
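
The new constants name the Kubernetes extended resources a pod can request: nvidia.com/gpu for a whole GPU, or an A100 MIG slice such as nvidia.com/mig-1g.5gb. These strings are what surface in the resource label of the GPU_REQUEST query above and, downstream, in the gpu_resource field. A hedged illustration of where they come from on the pod side (a generic container resources stanza, not code from this repository):

mig_container_resources = {
    "limits": {
        "nvidia.com/mig-1g.5gb": 1,  # one 1g.5gb MIG slice of an A100
        # "nvidia.com/gpu": 1,       # alternative: request a whole GPU
    }
}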
@@ -160,18 +168,15 @@ def get_namespace_attributes():
return namespaces_dict


def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource):
"""
Returns the type of service unit, the count, and the determining resource
"""
su_type = SU_UNKNOWN
su_count = 0

if gpu_type == NO_GPU:
gpu_type = None

# pods that requested a specific GPU but weren't scheduled may report 0 GPU
if gpu_type is not None and gpu_count == 0:
if gpu_resource is not None and gpu_count == 0:
return SU_UNKNOWN_GPU, 0, "GPU"

# pods in weird states
@@ -182,7 +187,12 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
GPU_A100: SU_A100_GPU,
GPU_A100_SXM4: SU_A100_SXM4_GPU,
GPU_V100: SU_V100_GPU,
GPU_UNKNOWN_TYPE: SU_UNKNOWN_GPU,
}

A100_SXM4_MIG = {
MIG_1G_5GB: SU_UNKNOWN_MIG_GPU,
MIG_2G_10GB: SU_UNKNOWN_MIG_GPU,
MIG_3G_20GB: SU_UNKNOWN_MIG_GPU,
}

# GPU count for some configs is -1 for math reasons, in reality it is 0
@@ -192,13 +202,18 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
SU_A100_SXM4_GPU: {"gpu": 1, "cpu": 32, "ram": 245},
SU_V100_GPU: {"gpu": 1, "cpu": 24, "ram": 192},
SU_UNKNOWN_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
SU_UNKNOWN_MIG_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1},
}

if gpu_type is None and gpu_count == 0:
if gpu_resource is None and gpu_count == 0:
su_type = SU_CPU
else:
elif gpu_type is not None and gpu_resource == WHOLE_GPU:
su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU)
elif gpu_type == GPU_A100_SXM4: # for MIG GPU of type A100_SXM4
su_type = A100_SXM4_MIG.get(gpu_resource, SU_UNKNOWN_MIG_GPU)
else:
return SU_UNKNOWN_GPU, 0, "GPU"

cpu_multiplier = cpu_count / su_config[su_type]["cpu"]
gpu_multiplier = gpu_count / su_config[su_type]["gpu"]
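
A worked example of the multiplier arithmetic visible at the end of this hunk, using the figures from test_known_gpu_high_cpu above (the RAM multiplier and the final rounding are not part of this diff, so they are inferred from the tests):

# SU_A100_GPU appears to be sized at 1 GPU / 24 CPUs / 74 GiB RAM (see test_known_gpu).
cpu_multiplier = 50 / 24   # ~2.08
gpu_multiplier = 1 / 1     # 1.0
ram_multiplier = 96 / 74   # ~1.30, assumed analogue of the two multipliers shown
# The largest multiplier picks the determining resource (CPU here); rounded up
# for GPU SU types, this yields the 3 x "OpenShift GPUA100" SUs that
# test_known_gpu_high_cpu expects.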
@@ -230,14 +245,20 @@ def merge_metrics(metric_name, metric_list, output_dict):
for metric in metric_list:
pod = metric["metric"]["pod"]
namespace = metric["metric"]["namespace"]
node = metric["metric"].get("node")

gpu_type = None
gpu_resource = None
node_model = None

unique_name = namespace + "+" + pod
if unique_name not in output_dict:
output_dict[unique_name] = {"namespace": metric["metric"]["namespace"], "metrics": {}}
output_dict[unique_name] = {"namespace": namespace, "metrics": {}}

if metric_name == "gpu_request":
gpu_type = metric["metric"].get("label_nvidia_com_gpu_product", GPU_UNKNOWN_TYPE)
else:
gpu_type = None
gpu_resource = metric["metric"].get("resource")
node_model = metric["metric"].get("label_nvidia_com_gpu_machine")

for value in metric["values"]:
epoch_time = value[0]
Expand All @@ -246,6 +267,12 @@ def merge_metrics(metric_name, metric_list, output_dict):
output_dict[unique_name]["metrics"][epoch_time][metric_name] = value[1]
if gpu_type:
output_dict[unique_name]["metrics"][epoch_time]['gpu_type'] = gpu_type
if gpu_resource:
output_dict[unique_name]["metrics"][epoch_time]['gpu_resource'] = gpu_resource
if node_model:
output_dict[unique_name]["metrics"][epoch_time]['node_model'] = node_model
if node:
output_dict[unique_name]["metrics"][epoch_time]['node'] = node

return output_dict
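
With these changes, merge_metrics records the node for every metric and, for gpu_request series, the GPU resource name and the node model as well. A minimal sketch of the resulting output_dict entry, assuming the illustrative series shown after the first file above:

output_dict["namespace1+pod1"] = {
    "namespace": "namespace1",
    "metrics": {
        0: {
            "gpu_request": 1,
            "gpu_type": "NVIDIA-A100-SXM4-40GB",
            "gpu_resource": "nvidia.com/gpu",
            "node": "wrk-1",
            "node_model": "example-machine",  # hypothetical label value
        },
        # ...one bucket per sampled timestamp
    },
}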

@@ -402,9 +429,10 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
cpu_request = float(pod_metric_dict.get("cpu_request", 0))
gpu_request = float(pod_metric_dict.get("gpu_request", 0))
gpu_type = pod_metric_dict.get("gpu_type")
gpu_resource = pod_metric_dict.get("gpu_resource")
memory_request = float(pod_metric_dict.get("memory_request", 0)) / 2**30

_, su_count, _ = get_service_unit(cpu_request, memory_request, gpu_request, gpu_type)
_, su_count, _ = get_service_unit(cpu_request, memory_request, gpu_request, gpu_type, gpu_resource)

if gpu_type == GPU_A100:
metrics_by_namespace[namespace]["SU_A100_GPU_HOURS"] += su_count * duration_in_hours
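
For instance, under the accumulation shown here, a pod that resolves to 3 A100 SUs over a 2.0-hour interval adds 3 * 2.0 = 6.0 to its namespace's SU_A100_GPU_HOURS total.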
@@ -465,6 +493,9 @@ def write_metrics_by_pod(metrics_dict, file_name):
"CPU Request",
"GPU Request",
"GPU Type",
"GPU Resource",
"Node",
"Node Model",
"Memory Request (GiB)",
"Determining Resource",
"SU Type",
@@ -489,10 +520,13 @@ def write_metrics_by_pod(metrics_dict, file_name):
duration = round(float(pod_metric_dict["duration"]) / 3600, 4)
cpu_request = pod_metric_dict.get("cpu_request", 0)
gpu_request = pod_metric_dict.get("gpu_request", 0)
gpu_type = pod_metric_dict.get("gpu_type", NO_GPU)
gpu_type = pod_metric_dict.get("gpu_type")
gpu_resource = pod_metric_dict.get("gpu_resource")
node = pod_metric_dict.get("node", "Unknown Node")
node_model = pod_metric_dict.get("node_model", "Unknown Model")
memory_request = round(float(pod_metric_dict.get("memory_request", 0)) / 2**30, 4)
su_type, su_count, determining_resource = get_service_unit(
float(cpu_request), memory_request, float(gpu_request), gpu_type
float(cpu_request), memory_request, float(gpu_request), gpu_type, gpu_resource
)

info_list = [
Expand All @@ -506,6 +540,9 @@ def write_metrics_by_pod(metrics_dict, file_name):
cpu_request,
gpu_request,
gpu_type,
gpu_resource,
node,
node_model,
memory_request,
determining_resource,
su_type,
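With the three new columns in place, a GPU-backed pod would produce a row like the following illustrative line (values chosen to mirror test_known_gpu_A100_SXM4 above; the namespace, PI, project ID, pod, node, and node model are made up):

namespace3,PI3,789,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod5,32,1,NVIDIA-A100-SXM4-40GB,nvidia.com/gpu,wrk-3,Example-Machine,245.0,GPU,OpenShift GPUA100SXM4,1.0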
