Update the way we calculate SU*Hours for a project. #21

Merged: 2 commits, Dec 15, 2023
1 change: 0 additions & 1 deletion .github/workflows/unit-tests.yaml
@@ -19,7 +19,6 @@ jobs:
- name: Install dependencies
run: |
pip install -r requirements.txt
pip install -r test-requirements.txt

- name: Run unit tests
run: |
72 changes: 49 additions & 23 deletions openshift_metrics/tests/test_utils.py
@@ -11,11 +11,8 @@
# under the License.
#

import mock
import requests
import tempfile
import time
from unittest import TestCase
from unittest import TestCase, mock

from openshift_metrics import utils
import openshift as oc
@@ -24,7 +21,8 @@
class TestQueryMetric(TestCase):

@mock.patch('requests.get')
def test_query_metric(self, mock_get):
@mock.patch('time.sleep')
def test_query_metric(self, mock_sleep, mock_get):
mock_response = mock.Mock(status_code=200)
mock_response.json.return_value = {"data": {
"result": "this is data"
@@ -36,7 +34,8 @@ def test_query_metric(self, mock_get):
self.assertEqual(mock_get.call_count, 1)

@mock.patch('requests.get')
def test_query_metric_exception(self, mock_get):
@mock.patch('time.sleep')
def test_query_metric_exception(self, mock_sleep, mock_get):
mock_get.return_value = mock.Mock(status_code=404)

self.assertRaises(Exception, utils.query_metric, 'fake-url', 'fake-token',
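The added @mock.patch('time.sleep') keeps these tests from actually waiting while query_metric retries, and its mock arrives first in the argument list because stacked patch decorators are applied bottom-up. The retry loop inside utils.query_metric is not part of this diff, so the sketch below uses a purely illustrative stand-in to show why the extra patch matters:

```python
import time
import requests
from unittest import TestCase, mock

def fetch_with_retries(url, attempts=3):
    # Illustrative stand-in, not the real utils.query_metric
    for _ in range(attempts):
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        time.sleep(5)  # without patching, each failed attempt would stall the test
    raise Exception(f"failed to fetch {url}")

class TestFetchWithRetries(TestCase):
    @mock.patch('requests.get')
    @mock.patch('time.sleep')
    def test_failure_does_not_sleep(self, mock_sleep, mock_get):
        mock_get.return_value = mock.Mock(status_code=404)
        with self.assertRaises(Exception):
            fetch_with_retries('fake-url')
        self.assertEqual(mock_sleep.call_count, 3)  # sleeps were recorded, not performed
```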
@@ -464,21 +463,32 @@ def test_write_metrics_log(self, mock_gna):
},
}
},
"pod4": { # this results in 0.5 SU
"namespace": "namespace2",
"gpu_type": utils.NO_GPU,
"metrics": {
0: {
"cpu_request": 0.5,
"memory_request": 2147483648,
"duration": 3600
},
}
},
}

expected_output = ("Namespace,Coldfront_PI Name,Coldfront Project ID ,Pod Start Time,Pod End Time,Duration (Hours),Pod Name,CPU Request,GPU Request,GPU Type,Memory Request (GiB),Determining Resource,SU Type,SU Count\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,No GPU,0.001,CPU,OpenShift CPU,10\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,No GPU,0.001,CPU,OpenShift CPU,20\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20\n"
"namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,No GPU,0.0098,CPU,OpenShift CPU,25\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,No GPU,0.0977,CPU,OpenShift CPU,45\n")

tmp_file_name = "%s/test-metrics-%s.log" % (tempfile.gettempdir(), time.time())
utils.write_metrics_by_pod(test_metrics_dict, tmp_file_name)
f = open(tmp_file_name, "r")
self.assertEqual(f.read(), expected_output)
f.close()
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:02:00,0.0333,pod1,10,0,No GPU,0.001,CPU,OpenShift CPU,10.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod1,20,0,No GPU,0.001,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:00:00,1970-01-01T00:01:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace1,PI1,123,1970-01-01T00:01:00,1970-01-01T00:02:00,0.0167,pod2,25,0,No GPU,0.0098,CPU,OpenShift CPU,25.0\n"
"namespace1,PI1,123,1970-01-01T00:02:00,1970-01-01T00:03:00,0.0167,pod2,20,0,No GPU,0.0098,CPU,OpenShift CPU,20.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T00:03:00,0.05,pod3,45,0,No GPU,0.0977,CPU,OpenShift CPU,45.0\n"
"namespace2,PI2,456,1970-01-01T00:00:00,1970-01-01T01:00:00,1.0,pod4,0.5,0,No GPU,2.0,CPU,OpenShift CPU,0.5\n")

with tempfile.NamedTemporaryFile(mode="w+") as tmp:
utils.write_metrics_by_pod(test_metrics_dict, tmp.name)
self.assertEqual(tmp.read(), expected_output)
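Switching to NamedTemporaryFile removes the need to build a unique path from tempfile.gettempdir() and time.time(), and the file is cleaned up automatically when the with block exits: the writer opens tmp.name by path, and the test reads the result back through the original handle, which is still at offset 0. A minimal sketch of the same pattern with a hypothetical writer function:

```python
import tempfile

def write_report(path):  # hypothetical stand-in for the CSV writer
    with open(path, "w") as f:
        f.write("a,b\n1,2\n")

with tempfile.NamedTemporaryFile(mode="w+") as tmp:
    write_report(tmp.name)             # a second handle writes to the same path
    assert tmp.read() == "a,b\n1,2\n"  # original handle reads it back from offset 0
# the temporary file is deleted automatically here
```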


class TestWriteMetricsByNamespace(TestCase):

@@ -570,11 +580,9 @@ def test_write_metrics_log(self, mock_gna):
"2023-01,namespace2,namespace2,PI2,,,,,48,OpenShift GPUA100,1.803,86.544\n"
"2023-01,namespace2,namespace2,PI2,,,,,144,OpenShift GPUA2,0.466,67.104\n")

tmp_file_name = "%s/test-metrics-%s.log" % (tempfile.gettempdir(), time.time())
utils.write_metrics_by_namespace(test_metrics_dict, tmp_file_name, "2023-01")
f = open(tmp_file_name, "r")
self.assertEqual(f.read(), expected_output)
f.close()
with tempfile.NamedTemporaryFile(mode="w+") as tmp:
utils.write_metrics_by_namespace(test_metrics_dict, tmp.name, "2023-01")
self.assertEqual(tmp.read(), expected_output)


class TestGetServiceUnit(TestCase):
@@ -632,3 +640,21 @@ def test_memory_dominant(self):
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 16)
self.assertEqual(determining_resource, "RAM")

def test_fractional_su_cpu_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.5, 0.5, 0, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.5)
self.assertEqual(determining_resource, "CPU")

def test_fractional_su_memory_dominant(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.1, 1, 0, None)
self.assertEqual(su_type, utils.SU_CPU)
self.assertEqual(su_count, 0.25)
self.assertEqual(determining_resource, "RAM")

def test_known_gpu_fractional_cpu_memory(self):
su_type, su_count, determining_resource = utils.get_service_unit(0.8, 0.8, 1, utils.GPU_A100)
self.assertEqual(su_type, utils.SU_A100_GPU)
self.assertEqual(su_count, 1)
self.assertEqual(determining_resource, "GPU")
42 changes: 11 additions & 31 deletions openshift_metrics/utils.py
@@ -99,6 +99,7 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
"""
su_type = SU_UNKNOWN
su_count = 0

if gpu_type == NO_GPU:
gpu_type = None

@@ -132,14 +133,15 @@ def get_service_unit(cpu_count, memory_count, gpu_count, gpu_type):
else:
su_type = known_gpu_su.get(gpu_type, SU_UNKNOWN_GPU)

# because openshift offers fractional CPUs, so we round it up.
cpu_count = math.ceil(cpu_count)

cpu_multiplier = cpu_count / su_config[su_type]["cpu"]
gpu_multiplier = gpu_count / su_config[su_type]["gpu"]
memory_multiplier = math.ceil(memory_count / su_config[su_type]["ram"])
memory_multiplier = memory_count / su_config[su_type]["ram"]

su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)

su_count = math.ceil(max(cpu_multiplier, gpu_multiplier, memory_multiplier))
# no fractional SUs for GPU SUs
if su_type != SU_CPU:
su_count = math.ceil(su_count)

if gpu_multiplier >= cpu_multiplier and gpu_multiplier >= memory_multiplier:
determining_resource = "GPU"
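The effect of this hunk: for CPU-only workloads the SU count can now be fractional (the raw maximum of the CPU and memory multipliers), while GPU SU types are still rounded up to whole units. A condensed sketch of the CPU path, assuming 1 vCPU and 4 GiB of RAM per OpenShift CPU SU (the values the new tests imply; the real numbers come from su_config):

```python
CPU_PER_SU = 1       # assumed su_config[SU_CPU]["cpu"]
RAM_GIB_PER_SU = 4   # assumed su_config[SU_CPU]["ram"]

def cpu_su_count(cpu_request, memory_request_gib):
    """Fractional SU count for a non-GPU pod; GPU SU types still get math.ceil()."""
    cpu_multiplier = cpu_request / CPU_PER_SU
    memory_multiplier = memory_request_gib / RAM_GIB_PER_SU
    return max(cpu_multiplier, memory_multiplier)

cpu_su_count(0.5, 0.5)   # 0.5  -> CPU is the determining resource
cpu_su_count(0.1, 1.0)   # 0.25 -> RAM is the determining resource
```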
@@ -227,11 +229,6 @@ def csv_writer(rows, file_name):
def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
"""
Process metrics dictionary to aggregate usage by namespace and then write that to a file

It sums up the cpu and memory resources for all non-gpu pods per project and then calculates
service units on the total.

For GPU resources, it relies on the `get_service_unit` method to get the SU count.
"""
metrics_by_namespace = {}
rows = []
@@ -279,34 +276,18 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
gpu_request = float(pod_metric_dict.get("gpu_request", 0))
memory_request = float(pod_metric_dict.get("memory_request", 0)) / 2**30

_, su_count, _ = get_service_unit(cpu_request, memory_request, gpu_request, gpu_type)

if gpu_type == GPU_A100:
_, su_count, _ = get_service_unit(
float(cpu_request), memory_request, float(gpu_request), gpu_type
)
metrics_by_namespace[namespace]["SU_A100_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_A2:
_, su_count, _ = get_service_unit(
float(cpu_request), memory_request, float(gpu_request), gpu_type
)
metrics_by_namespace[namespace]["SU_A2_GPU_HOURS"] += su_count * duration_in_hours
elif gpu_type == GPU_GENERIC:
_, su_count, _ = get_service_unit(
float(cpu_request), memory_request, float(gpu_request), gpu_type
)
metrics_by_namespace[namespace]["SU_UNKNOWN_GPU_HOURS"] += su_count * duration_in_hours
else:
metrics_by_namespace[namespace]["_cpu_hours"] += cpu_request * duration_in_hours
metrics_by_namespace[namespace]["_memory_hours"] += (
memory_request * duration_in_hours
)
metrics_by_namespace[namespace]["SU_CPU_HOURS"] += su_count * duration_in_hours

for namespace, metrics in metrics_by_namespace.items():
cpu_multiplier = metrics["_cpu_hours"] / 1
memory_multiplier = metrics["_memory_hours"] / 4

su_count_hours = math.ceil(max(cpu_multiplier, memory_multiplier))

metrics["SU_CPU_HOURS"] += su_count_hours

if metrics["SU_CPU_HOURS"] != 0:
row = [
@@ -318,7 +299,7 @@ def write_metrics_by_namespace(condensed_metrics_dict, file_name, report_month):
"", #Invoice Address
"", #Institution
"", #Institution - Specific Code
str(metrics["SU_CPU_HOURS"]),
str(math.ceil(metrics["SU_CPU_HOURS"])),
SU_CPU,
str(RATE.get(SU_CPU)),
str(RATE.get(SU_CPU) * metrics["SU_CPU_HOURS"])
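With this change, write_metrics_by_namespace no longer keeps separate _cpu_hours/_memory_hours tallies: each pod's (possibly fractional) SU count from get_service_unit is multiplied by its duration and summed into SU_CPU_HOURS, and rounding up to a whole number of SU-hours happens once, when the report row is written. A rough sketch of that accumulation, using hypothetical pod numbers:

```python
import math
from openshift_metrics import utils

# Hypothetical pods in one namespace: (cpu_request, memory_request_gib, duration_hours)
pods = [(0.5, 2.0, 1.0),      # 0.5 SU for 1 hour    -> 0.5 SU-hours
        (10, 0.001, 0.0333)]  # 10 SU for ~2 minutes -> ~0.33 SU-hours

su_cpu_hours = 0.0
for cpu, mem_gib, hours in pods:
    _, su_count, _ = utils.get_service_unit(cpu, mem_gib, 0, None)  # non-GPU pods
    su_cpu_hours += su_count * hours

billed_hours = math.ceil(su_cpu_hours)  # ~0.83 SU-hours rounds up to 1 in the report row
```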
@@ -463,5 +444,4 @@ def write_metrics_by_pod(metrics_dict, file_name):
]

rows.append(info_list)

csv_writer(rows, file_name)
1 change: 0 additions & 1 deletion test-requirements.txt

This file was deleted.