Skip to content

Commit

Permalink
Merge pull request #21 from naved001/new-maths
Browse files Browse the repository at this point in the history
Update the way we calculate SU*Hours for a project.
  • Loading branch information
naved001 authored Dec 15, 2023
2 parents ad5fa7e + a69450e commit aca59a0
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 56 deletions.
1 change: 0 additions & 1 deletion .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ jobs:
- name: Install dependencies
run: |
pip install -r requirements.txt
pip install -r test-requirements.txt
- name: Run unit tests
run: |
Expand Down
72 changes: 49 additions & 23 deletions openshift_metrics/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@
# under the License.
#

import mock
import requests
import tempfile
import time
from unittest import TestCase
from unittest import TestCase, mock

from openshift_metrics import utils
import openshift as oc
Expand All @@ -24,7 +21,8 @@
class TestQueryMetric(TestCase):

@mock.patch('requests.get')
def test_query_metric(self, mock_get):
@mock.patch('time.sleep')
def test_query_metric(self, mock_sleep, mock_get):
mock_response = mock.Mock(status_code=200)
mock_response.json.return_value = {"data": {
"result": "this is data"
Expand All @@ -36,7 +34,8 @@ def test_query_metric(self, mock_get):
self.assertEqual(mock_get.call_count, 1)

@mock.patch('requests.get')
def test_query_metric_exception(self, mock_get):
@mock.patch('time.sleep')
def test_query_metric_exception(self, mock_sleep, mock_get):
mock_get.return_value = mock.Mock(status_code=404)

self.assertRaises(Exception, utils.query_metric, 'fake-url', 'fake-token',
Expand Down Expand Up @@ -464,21 +463,32 @@ def test_write_metrics_log(self, mock_gna):
},
}
},
"pod4": { # this results in 0.5 SU
"namespace": "namespace2",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 0.5,
"memory_request": 2147483648,
"duration": 3600
},
}
},
}

expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,Memory Request (GiB),Determining Resource,SU Type,SU Count\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,No GPU,0.001,CPU,OpenShift CPU,10\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,No GPU,0.001,CPU,OpenShift CPU,20\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20\n"
"namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,No GPU,0.0098,CPU,OpenShift CPU,25\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,No GPU,0.0977,CPU,OpenShift CPU,45\n")

tmp_file_name = "%s/test-metrics-%s.log" % (tempfile.gettempdir(), time.time())
utils.write_metrics_by_pod(test_metrics_dict, tmp_file_name)
f = open(tmp_file_name, "r")
self.assertEqual(f.read(), expected_output)
f.close()
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,No GPU,0.001,CPU,OpenShift CPU,10.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,No GPU,0.001,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,No GPU,0.0098,CPU,OpenShift CPU,25.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,No GPU,0.0977,CPU,OpenShift CPU,45.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod4,0.5,0,No GPU,2.0,CPU,OpenShift CPU,0.5\n")

with tempfile.NamedTemporaryFile(mode="w+") as tmp:
utils.write_metrics_by_pod(test_metrics_dict, tmp.name)
self.assertEqual(tmp.read(), expected_output)


class TestWriteMetricsByNamespace(TestCase):

Expand Down Expand Up @@ -570,11 +580,9 @@ def test_write_metrics_log(self, mock_gna):
"2023-01,namespace2,namespace2,PI2,,,,,48,OpenShift GPUA100,1.803,86.544\n"
"2023-01,namespace2,namespace2,PI2,,,,,144,OpenShift GPUA2,0.466,67.104\n")

tmp_file_name = "%s/test-metrics-%s.log" % (tempfile.gettempdir(), time.time())
utils.write_metrics_by_namespace(test_metrics_dict, tmp_file_name, "2023-01")
f = open(tmp_file_name, "r")
self.assertEqual(f.read(), expected_output)
f.close()
with tempfile.NamedTemporaryFile(mode="w+") as tmp:
utils.write_metrics_by_namespace(test_metrics_dict, tmp.name, "2023-01")
self.assertEqual(tmp.read(), expected_output)


class TestGetServiceUnit(TestCase):
Expand Down Expand Up @@ -632,3 +640,21 @@ def test_memory_dominant(self):
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 16)
self.assertEqual(determining_resource, "RAM")

def test_fractional_su_cpu_dominant(self):
    """A fractional CPU request (0.5 cores, 0.5 GiB, no GPU) yields a
    fractional CPU SU count — CPU is the determining resource and the
    count is not rounded up to a whole SU."""
    kind, count, resource = utils.get_service_unit(0.5, 0.5, 0, None)
    self.assertEqual(kind, utils.SU_CPU)
    self.assertEqual(count, 0.5)
    self.assertEqual(resource, "CPU")

def test_fractional_su_memory_dominant(self):
    """With 0.1 cores and 1 GiB, memory drives the SU count: the RAM
    multiplier (0.25) exceeds the CPU multiplier, and the fractional
    value is kept as-is."""
    kind, count, resource = utils.get_service_unit(0.1, 1, 0, None)
    self.assertEqual(kind, utils.SU_CPU)
    self.assertEqual(count, 0.25)
    self.assertEqual(resource, "RAM")

def test_known_gpu_fractional_cpu_memory(self):
    """A known GPU type with fractional CPU/memory requests still bills
    whole GPU SUs: one A100 GPU dominates and the count stays 1."""
    kind, count, resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100)
    self.assertEqual(kind, utils.SU_A100_GPU)
    self.assertEqual(count, 1)
    self.assertEqual(resource, "GPU")
42 changes: 11 additions & 31 deletions openshift_metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
"""
su_type = SU_UNKNOWN
su_count = 0

if gpu_type == NO_GPU:
gpu_type = None

Expand Down Expand Up @@ -132,14 +133,15 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
else:
su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU)

# because openshift offers fractional CPUs, so we round it up.
cpu_count = math.ceil(cpu_count)

cpu_multiplier = cpu_count / su_config[su_type]["cpu"]
gpu_multiplier = gpu_count / su_config[su_type]["gpu"]
memory_multiplier = math.ceil(memory_count / su_config[su_type]["ram"])
memory_multiplier = memory_count / su_config[su_type]["ram"]

su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)

su_count = math.ceil(max(cpu_multiplier, gpu_multiplier, memory_multiplier))
# no fractional SUs for GPU SUs
if su_type != SU_CPU:
su_count = math.ceil(su_count)

if gpu_multiplier >= cpu_multiplier and gpu_multiplier >= memory_multiplier:
determining_resource = "GPU"
Expand Down Expand Up @@ -227,11 +229,6 @@ def csv_writer(rows, file_name):
def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
"""
Process metrics dictionary to aggregate usage by namespace and then write that to a file
It sums up the cpu and memory resources for all non-gpu pods per project and then calculates
service units on the total.
For GPU resources, it relies on the `get_service_unit` method to get the SU count.
"""
metrics_by_namespace = {}
rows = []
Expand Down Expand Up @@ -279,34 +276,18 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
gpu_request = float(pod_metric_dict.get("gpu_request", 0))
memory_request = float(pod_metric_dict.get("memory_request", 0)) / 2**30

_, su_count, _ = get_service_unit(cpu_request, memory_request, gpu_request, gpu_type)

if gpu_type == GPU_A100:
_, su_count, _ = get_service_unit(
float(cpu_request), memory_request, float(gpu_request), gpu_type
)
metrics_by_namespace[namespace]["SU_A100_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_A2:
_, su_count, _ = get_service_unit(
float(cpu_request), memory_request, float(gpu_request), gpu_type
)
metrics_by_namespace[namespace]["SU_A2_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_GENERIC:
_, su_count, _ = get_service_unit(
float(cpu_request), memory_request, float(gpu_request), gpu_type
)
metrics_by_namespace[namespace]["SU_UNKNOWN_GPU_HOURS"] += su_count * duration_in_hours
else:
metrics_by_namespace[namespace]["_cpu_hours"] += cpu_request * duration_in_hours
metrics_by_namespace[namespace]["_memory_hours"] += (
memory_request * duration_in_hours
)
metrics_by_namespace[namespace]["SU_CPU_HOURS"] += su_count * duration_in_hours

for namespace, metrics in metrics_by_namespace.items():
cpu_multiplier = metrics["_cpu_hours"] / 1
memory_multiplier = metrics["_memory_hours"] / 4

su_count_hours = math.ceil(max(cpu_multiplier, memory_multiplier))

metrics["SU_CPU_HOURS"] += su_count_hours

if metrics["SU_CPU_HOURS"] != 0:
row = [
Expand All @@ -318,7 +299,7 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
"", #Invoice Address
"", #Institution
"", #Institution - Specific Code
str(metrics["SU_CPU_HOURS"]),
str(math.ceil(metrics["SU_CPU_HOURS"])),
SU_CPU,
str(RATE.get(SU_CPU)),
str(RATE.get(SU_CPU) * metrics["SU_CPU_HOURS"])
Expand Down Expand Up @@ -463,5 +444,4 @@ def write_metrics_by_pod(metrics_dict, file_name):
]

rows.append(info_list)

csv_writer(rows, file_name)
1 change: 0 additions & 1 deletion test-requirements.txt

This file was deleted.

0 comments on commit aca59a0

Please sign in to comment.