
Commit ba268c8
Merge pull request #58 from naved001/prepare-for-mig
Prepare for mig
naved001 authored Apr 12, 2024
2 parents a828c80 + 29d9eee commit ba268c8
Showing 3 changed files with 112 additions and 46 deletions.
2 changes: 1 addition & 1 deletion openshift_metrics/openshift_prometheus_metrics.py
@@ -27,7 +27,7 @@

# For GPU requests, we don't need to exclude unscheduled pods because the join on node will eliminate those as unscheduled
# pods don't have a node value
GPU_REQUEST = 'kube_pod_resource_request{resource=~".*gpu.*"} * on(node) group_left(label_nvidia_com_gpu_product) kube_node_labels'
GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*"} * on(node) group_left(label_nvidia_com_gpu_product, label_nvidia_com_gpu_machine) kube_node_labels'


def main():
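The updated GPU_REQUEST query narrows kube_pod_resource_request to NVIDIA resources (whole GPUs and MIG slices alike, via resource=~"nvidia.com.*") and joins on node against kube_node_labels, so every series also carries the node's GPU product and machine labels. A minimal sketch of one series in the shape that merge_metrics (later in this diff) consumes; the concrete values are illustrative, not taken from the diff:

gpu_request_series = {
    "metric": {
        "pod": "pod1",
        "namespace": "namespace1",
        "node": "wrk-1",  # only present for scheduled pods, which is why the join drops unscheduled ones
        "resource": "nvidia.com/gpu",  # or a MIG slice such as "nvidia.com/mig-1g.5gb"
        "label_nvidia_com_gpu_product": "NVIDIA-A100-SXM4-40GB",
        "label_nvidia_com_gpu_machine": "example-machine",  # hypothetical label value
    },
    "values": [[0, "1"], [900, "1"]],  # (epoch seconds, requested GPU count) sample pairs
}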
89 changes: 59 additions & 30 deletions openshift_metrics/tests/test_utils.py
@@ -416,17 +416,20 @@ def test_merge_metrics_not_empty_with_gpu(self):
0: {
"cpu": 10,
"gpu_request": 1,
"gpu_type": "Tesla-V100-PCIE-32GB"
"gpu_type": "Tesla-V100-PCIE-32GB",
"gpu_resource": "nvidia.com/gpu",
},
60: {
"cpu": 15,
"gpu_request": 1,
"gpu_type": "Tesla-V100-PCIE-32GB"
"gpu_type": "Tesla-V100-PCIE-32GB",
"gpu_resource": "nvidia.com/gpu",
},
120: {
"cpu": 20,
"gpu_request": 2,
"gpu_type": "Tesla-V100-PCIE-32GB"
"gpu_type": "Tesla-V100-PCIE-32GB",
"gpu_resource": "nvidia.com/gpu",
},
}
},
@@ -791,23 +794,25 @@ def test_write_metrics_log(self, mock_gna):
test_metrics_dict = {
"pod1": {
"namespace": "namespace1",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 10,
"memory_request": 1048576,
"duration": 120
"duration": 120,
"node": "wrk-1",
"node_model": "Dell",
},
120: {
"cpu_request": 20,
"memory_request": 1048576,
"duration": 60
"duration": 60,
"node": "wrk-2",
"node_model": "Lenovo"
}
}
},
"pod2": {
"namespace": "namespace1",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 20,
@@ -828,7 +833,6 @@
},
"pod3": {
"namespace": "namespace2",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 45,
@@ -839,7 +843,6 @@
},
"pod4": { # this results in 0.5 SU
"namespace": "namespace2",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 0.5,
@@ -850,14 +853,14 @@
},
}

expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,Memory Request (GiB),Determining Resource,SU Type,SU Count\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,No GPU,0.001,CPU,OpenShift CPU,10.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,No GPU,0.001,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,No GPU,0.0098,CPU,OpenShift CPU,25.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,No GPU,0.0977,CPU,OpenShift CPU,45.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod4,0.5,0,No GPU,2.0,CPU,OpenShift CPU,0.5\n")
expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,GPU Resource,Node,Node Model,Memory Request (GiB),Determining Resource,SU Type,SU Count\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,,,wrk-1,Dell,0.001,CPU,OpenShift CPU,10.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,,,wrk-2,Lenovo,0.001,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,25.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,,,Unknown Node,Unknown Model,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,,,Unknown Node,Unknown Model,0.0977,CPU,OpenShift CPU,45.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod4,0.5,0,,,Unknown Node,Unknown Model,2.0,CPU,OpenShift CPU,0.5\n")

with tempfile.NamedTemporaryFile(mode="w+") as tmp:
utils.write_metrics_by_pod(test_metrics_dict, tmp.name)
@@ -928,6 +931,7 @@ def test_write_metrics_log(self, mock_gna):
"memory_request": 8 * 2**30,
"gpu_request": 1,
"gpu_type": utils.GPU_A100,
"gpu_resource": utils.WHOLE_GPU,
"duration": 172700 # little under 48 hours, expect to be rounded up in the output
},
}
@@ -941,6 +945,7 @@ def test_write_metrics_log(self, mock_gna):
"memory_request": 8 * 2**30,
"gpu_request": 1,
"gpu_type": utils.GPU_A100_SXM4,
"gpu_resource": utils.WHOLE_GPU,
"duration": 172800
},
}
@@ -961,79 +966,103 @@ def test_write_metrics_log(self, mock_gna):
class TestGetServiceUnit(TestCase):

def test_cpu_only(self):
su_type, su_count, determining_resource = utils.get_service_unit(4, 16, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(4, 16, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 4)
self.assertEqual(determining_resource, "CPU")

def test_known_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(24, 74, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(24, 74, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_A100_SXM4(self):
su_type, su_count, determining_resource = utils.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4)
su_type, su_count, determining_resource = utils.get_service_unit(32, 245, 1, utils.GPU_A100_SXM4, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_SXM4_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_high_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(50, 96, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 3)
self.assertEqual(determining_resource, "CPU")

def test_known_gpu_high_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(24, 100, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(24, 100, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 2)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_low_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(2, 4, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(2, 4, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_unknown_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 1, "Unknown_GPU_Type")
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 1, "Unknown_GPU_Type", utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_zero_count(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_known_mig_gpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100_SXM4, utils.MIG_1G_5GB)
self.assertEqual(su_type, utils.SU_UNKNOWN_MIG_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")

def test_known_gpu_unknown_resource(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, utils.GPU_A100, "nvidia.com/mig_20G_500GB")
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_unknown_gpu_known_resource(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 4, 1, "Unknown GPU", utils.MIG_2G_10GB)
self.assertEqual(su_type, utils.SU_UNKNOWN_GPU)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "GPU")

def test_zero_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(1, 0, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(1, 0, 0, None, None)
self.assertEqual(su_type, utils.SU_UNKNOWN)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "CPU")

def test_zero_cpu(self):
su_type, su_count, determining_resource = utils.get_service_unit(0, 1, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(0, 1, 0, None, None)
self.assertEqual(su_type, utils.SU_UNKNOWN)
self.assertEqual(su_count, 0)
self.assertEqual(determining_resource, "CPU")

def test_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(8, 64, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 16)
self.assertEqual(determining_resource, "RAM")

def test_fractional_su_cpu_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.5)
self.assertEqual(determining_resource, "CPU")

def test_fractional_su_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None)
su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.25)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_fractional_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100)
su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100, utils.WHOLE_GPU)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")
67 changes: 52 additions & 15 deletions openshift_metrics/utils.py
@@ -29,14 +29,22 @@
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
NO_GPU = "No GPU"

# GPU Resource - MIG Geometries
# A100 Strategies
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
WHOLE_GPU = "nvidia.com/gpu"


# SU Types
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
SU_UNKNOWN = "Openshift Unknown"

RATE = {
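
The new constants name the Kubernetes extended resources a pod can request: nvidia.com/gpu for a whole GPU, or an A100 MIG slice such as nvidia.com/mig-1g.5gb. These strings are what surface in the resource label of the GPU_REQUEST query above and, downstream, in the gpu_resource field. A hedged illustration of where they come from on the pod side (a generic container resources stanza, not code from this repository):

mig_container_resources = {
    "limits": {
        "nvidia.com/mig-1g.5gb": 1,  # one 1g.5gb MIG slice of an A100
        # "nvidia.com/gpu": 1,       # alternative: request a whole GPU
    }
}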
@@ -160,18 +168,15 @@ def get_namespace_attributes():
return namespaces_dict


def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type, gpu_resource):
"""
Returns the type of service unit, the count, and the determining resource
"""
su_type = SU_UNKNOWN
su_count = 0

if gpu_type == NO_GPU:
gpu_type = None

# pods that requested a specific GPU but weren't scheduled may report 0 GPU
if gpu_type is not None and gpu_count == 0:
if gpu_resource is not None and gpu_count == 0:
return SU_UNKNOWN_GPU, 0, "GPU"

# pods in weird states
@@ -182,7 +187,12 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
GPU_A100: SU_A100_GPU,
GPU_A100_SXM4: SU_A100_SXM4_GPU,
GPU_V100: SU_V100_GPU,
GPU_UNKNOWN_TYPE: SU_UNKNOWN_GPU,
}

A100_SXM4_MIG = {
MIG_1G_5GB: SU_UNKNOWN_MIG_GPU,
MIG_2G_10GB: SU_UNKNOWN_MIG_GPU,
MIG_3G_20GB: SU_UNKNOWN_MIG_GPU,
}

# GPU count for some configs is -1 for math reasons, in reality it is 0
@@ -192,13 +202,18 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
SU_A100_SXM4_GPU: {"gpu": 1, "cpu": 32, "ram": 245},
SU_V100_GPU: {"gpu": 1, "cpu": 24, "ram": 192},
SU_UNKNOWN_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
SU_UNKNOWN_MIG_GPU: {"gpu": 1, "cpu": 8, "ram": 64},
SU_UNKNOWN: {"gpu": -1, "cpu": 1, "ram": 1},
}

if gpu_type is None and gpu_count == 0:
if gpu_resource is None and gpu_count == 0:
su_type = SU_CPU
else:
elif gpu_type is not None and gpu_resource == WHOLE_GPU:
su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU)
elif gpu_type == GPU_A100_SXM4: # for MIG GPU of type A100_SXM4
su_type = A100_SXM4_MIG.get(gpu_resource, SU_UNKNOWN_MIG_GPU)
else:
return SU_UNKNOWN_GPU, 0, "GPU"

cpu_multiplier = cpu_count / su_config[su_type]["cpu"]
gpu_multiplier = gpu_count / su_config[su_type]["gpu"]
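
A worked example of the multiplier arithmetic visible at the end of this hunk, using the figures from test_known_gpu_high_cpu above (the RAM multiplier and the final rounding are not part of this diff, so they are inferred from the tests):

# SU_A100_GPU appears to be sized at 1 GPU / 24 CPUs / 74 GiB RAM (see test_known_gpu).
cpu_multiplier = 50 / 24   # ~2.08
gpu_multiplier = 1 / 1     # 1.0
ram_multiplier = 96 / 74   # ~1.30, assumed analogue of the two multipliers shown
# The largest multiplier picks the determining resource (CPU here); rounded up
# for GPU SU types, this yields the 3 x "OpenShift GPUA100" SUs that
# test_known_gpu_high_cpu expects.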
@@ -230,14 +245,20 @@ def merge_metrics(metric_name, metric_list, output_dict):
for metric in metric_list:
pod = metric["metric"]["pod"]
namespace = metric["metric"]["namespace"]
node = metric["metric"].get("node")

gpu_type = None
gpu_resource = None
node_model = None

unique_name = namespace + "+" + pod
if unique_name not in output_dict:
output_dict[unique_name] = {"namespace": metric["metric"]["namespace"], "metrics": {}}
output_dict[unique_name] = {"namespace": namespace, "metrics": {}}

if metric_name == "gpu_request":
gpu_type = metric["metric"].get("label_nvidia_com_gpu_product", GPU_UNKNOWN_TYPE)
else:
gpu_type = None
gpu_resource = metric["metric"].get("resource")
node_model = metric["metric"].get("label_nvidia_com_gpu_machine")

for value in metric["values"]:
epoch_time = value[0]
Expand All @@ -246,6 +267,12 @@ def merge_metrics(metric_name, metric_list, output_dict):
output_dict[unique_name]["metrics"][epoch_time][metric_name] = value[1]
if gpu_type:
output_dict[unique_name]["metrics"][epoch_time]['gpu_type'] = gpu_type
if gpu_resource:
output_dict[unique_name]["metrics"][epoch_time]['gpu_resource'] = gpu_resource
if node_model:
output_dict[unique_name]["metrics"][epoch_time]['node_model'] = node_model
if node:
output_dict[unique_name]["metrics"][epoch_time]['node'] = node

return output_dict
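
With these changes, merge_metrics records the node for every metric and, for gpu_request series, the GPU resource name and the node model as well. A minimal sketch of the resulting output_dict entry, assuming the illustrative series shown after the first file above:

output_dict["namespace1+pod1"] = {
    "namespace": "namespace1",
    "metrics": {
        0: {
            "gpu_request": 1,
            "gpu_type": "NVIDIA-A100-SXM4-40GB",
            "gpu_resource": "nvidia.com/gpu",
            "node": "wrk-1",
            "node_model": "example-machine",  # hypothetical label value
        },
        # ...one bucket per sampled timestamp
    },
}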

@@ -402,9 +429,10 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
cpu_request = float(pod_metric_dict.get("cpu_request", 0))
gpu_request = float(pod_metric_dict.get("gpu_request", 0))
gpu_type = pod_metric_dict.get("gpu_type")
gpu_resource = pod_metric_dict.get("gpu_resource")
memory_request = float(pod_metric_dict.get("memory_request", 0)) / 2**30

_, su_count, _ = get_service_unit(cpu_request, memory_request, gpu_request, gpu_type)
_, su_count, _ = get_service_unit(cpu_request, memory_request, gpu_request, gpu_type, gpu_resource)

if gpu_type == GPU_A100:
metrics_by_namespace[namespace]["SU_A100_GPU_HOURS"] += su_count * duration_in_hours
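
For instance, under the accumulation shown here, a pod that resolves to 3 A100 SUs over a 2.0-hour interval adds 3 * 2.0 = 6.0 to its namespace's SU_A100_GPU_HOURS total.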
@@ -465,6 +493,9 @@ def write_metrics_by_pod(metrics_dict, file_name):
"CPU Request",
"GPU Request",
"GPU Type",
"GPU Resource",
"Node",
"Node Model",
"Memory Request (GiB)",
"Determining Resource",
"SU Type",
@@ -489,10 +520,13 @@ def write_metrics_by_pod(metrics_dict, file_name):
duration = round(float(pod_metric_dict["duration"]) / 3600, 4)
cpu_request = pod_metric_dict.get("cpu_request", 0)
gpu_request = pod_metric_dict.get("gpu_request", 0)
gpu_type = pod_metric_dict.get("gpu_type", NO_GPU)
gpu_type = pod_metric_dict.get("gpu_type")
gpu_resource = pod_metric_dict.get("gpu_resource")
node = pod_metric_dict.get("node", "Unknown Node")
node_model = pod_metric_dict.get("node_model", "Unknown Model")
memory_request = round(float(pod_metric_dict.get("memory_request", 0)) / 2**30, 4)
su_type, su_count, determining_resource = get_service_unit(
float(cpu_request), memory_request, float(gpu_request), gpu_type
float(cpu_request), memory_request, float(gpu_request), gpu_type, gpu_resource
)

info_list = [
Expand All @@ -506,6 +540,9 @@ def write_metrics_by_pod(metrics_dict, file_name):
cpu_request,
gpu_request,
gpu_type,
gpu_resource,
node,
node_model,
memory_request,
determining_resource,
su_type,
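With the three new columns in place, a GPU-backed pod would produce a row like the following illustrative line (values chosen to mirror test_known_gpu_A100_SXM4 above; the namespace, PI, project ID, pod, node, and node model are made up):

namespace3,PI3,789,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod5,32,1,NVIDIA-A100-SXM4-40GB,nvidia.com/gpu,wrk-3,Example-Machine,245.0,GPU,OpenShift GPUA100SXM4,1.0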
