Skip to content

Commit 77a3fd7

Browse files
authored
Merge pull request #106 from naved001/rhods-notebook
2 parents 9ca3f64 + bf927ed commit 77a3fd7

File tree

5 files changed

+217
-2
lines changed

5 files changed

+217
-2
lines changed

openshift_metrics/merge.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,14 @@ def main():
148148
rates=rates,
149149
ignore_hours=ignore_hours,
150150
)
151+
utils.write_metrics_by_classes(
152+
condensed_metrics_dict=condensed_metrics_dict,
153+
file_name=f"by-classes-{invoice_file}",
154+
report_month=report_month,
155+
rates=rates,
156+
namespaces_with_classes=["rhods-notebooks"],
157+
ignore_hours=ignore_hours,
158+
)
151159
utils.write_metrics_by_pod(condensed_metrics_dict, pod_report_file, ignore_hours)
152160

153161
if args.upload_to_s3:

openshift_metrics/metrics_processor.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ def __init__(
2525

2626
def merge_metrics(self, metric_name, metric_list):
2727
"""Merge metrics (cpu, memory, gpu) by pod"""
28-
2928
for metric in metric_list:
3029
pod = metric["metric"]["pod"]
3130
namespace = metric["metric"]["namespace"]
@@ -34,6 +33,11 @@ def merge_metrics(self, metric_name, metric_list):
3433
self.merged_data.setdefault(namespace, {})
3534
self.merged_data[namespace].setdefault(pod, {"metrics": {}})
3635

36+
if metric_name == "cpu_request":
37+
class_name = metric["metric"].get("label_nerc_mghpcc_org_class")
38+
if class_name is not None:
39+
self.merged_data[namespace][pod]["label_nerc_mghpcc_org_class"] = class_name
40+
3741
gpu_type, gpu_resource, node_model = self._extract_gpu_info(
3842
metric_name, metric
3943
)
@@ -193,3 +197,21 @@ def insert_node_labels(node_labels: list, resource_request_metrics: list) -> lis
193197
"machine"
194198
)
195199
return resource_request_metrics
200+
201+
@staticmethod
def insert_pod_labels(pod_labels: list, resource_request_metrics: list) -> list:
    """Inserts `label_nerc_mghpcc_org_class` label into resource_request_metrics

    Args:
        pod_labels: label series; each item is a dict whose ``"metric"``
            mapping holds ``"pod"``, optionally ``"namespace"`` and optionally
            ``"label_nerc_mghpcc_org_class"``.
        resource_request_metrics: resource request series; each item is a
            dict whose ``"metric"`` mapping holds ``"pod"`` and optionally
            ``"namespace"``.

    Returns:
        The same ``resource_request_metrics`` list. Matching entries are
        updated in place; entries with no matching label series are untouched.
    """
    label_key = "label_nerc_mghpcc_org_class"

    # Pod names are only unique inside a namespace, so the lookup is keyed by
    # (namespace, pod). Keying by pod name alone would let a pod in one
    # namespace pick up the class of an identically named pod in another.
    class_by_pod = {}
    for pod_label in pod_labels:
        labels = pod_label["metric"]
        class_by_pod[(labels.get("namespace"), labels["pod"])] = labels.get(label_key)

    for request_metric in resource_request_metrics:
        labels = request_metric["metric"]
        pod_name = labels["pod"]
        namespaced_key = (labels.get("namespace"), pod_name)
        # Label series that carry no namespace are stored under (None, pod)
        # and still match by pod name alone, as they did before.
        fallback_key = (None, pod_name)
        if namespaced_key in class_by_pod:
            labels[label_key] = class_by_pod[namespaced_key]
        elif fallback_key in class_by_pod:
            labels[label_key] = class_by_pod[fallback_key]
    return resource_request_metrics

openshift_metrics/openshift_prometheus_metrics.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
MEMORY_REQUEST = 'kube_pod_resource_request{unit="bytes", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
3232
GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
3333
KUBE_NODE_LABELS = 'kube_node_labels{label_nvidia_com_gpu_product!=""}'
34+
KUBE_POD_LABELS = 'kube_pod_labels{label_nerc_mghpcc_org_class!=""}'
3435

3536
def main():
3637
"""This method kick starts the process of collecting and saving the metrics"""
@@ -87,10 +88,17 @@ def main():
8788
cpu_request_metrics = prom_client.query_metric(
8889
CPU_REQUEST, report_start_date, report_end_date
8990
)
91+
92+
try:
93+
pod_labels = prom_client.query_metric(KUBE_POD_LABELS, report_start_date, report_end_date)
94+
metrics_dict["cpu_metrics"] = MetricsProcessor.insert_pod_labels(pod_labels, cpu_request_metrics)
95+
except utils.EmptyResultError:
96+
logger.info(f"No pod labels found for the period {report_start_date} to {report_end_date}")
97+
metrics_dict["cpu_metrics"] = cpu_request_metrics
98+
9099
memory_request_metrics = prom_client.query_metric(
91100
MEMORY_REQUEST, report_start_date, report_end_date
92101
)
93-
metrics_dict["cpu_metrics"] = cpu_request_metrics
94102
metrics_dict["memory_metrics"] = memory_request_metrics
95103

96104
# because if nobody requests a GPU then we will get an empty set

openshift_metrics/tests/test_utils.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,109 @@ def test_write_metrics_log(self, mock_gna):
258258
self.assertEqual(tmp.read(), expected_output)
259259

260260

261+
class TestWriteMetricsByClasses(TestCase):
    """Tests for `utils.write_metrics_by_classes`."""

    @mock.patch('openshift_metrics.utils.get_namespace_attributes')
    def test_write_metrics_log(self, mock_gna):
        """Only the listed namespace is reported, one project per class label."""
        gib = 2**30

        mock_gna.return_value = {
            'namespace1': {
                'cf_pi': 'PI1',
                'cf_project_id': '123',
                'institution_code': '76'
            },
            'namespace2': {
                'cf_pi': 'PI2',
                'cf_project_id': '456',
            }
        }

        # namespace1 is not passed in namespaces_with_classes, so it is
        # ignored entirely from the report.
        ignored_namespace = {
            "pod1": {
                "metrics": {
                    0: {"cpu_request": 2, "memory_request": 4 * gib, "duration": 43200},
                }
            },
        }

        classed_namespace = {
            # pod which doesn't belong to a class
            "pod2": {
                "metrics": {
                    0: {"cpu_request": 1, "memory_request": 8 * gib, "duration": 172800},
                }
            },
            "pod3": {
                "label_nerc_mghpcc_org_class": "math-201",
                "metrics": {
                    0: {"cpu_request": 1, "memory_request": 8 * gib, "duration": 86400},
                }
            },
            "pod4": {
                "label_nerc_mghpcc_org_class": "math-201",
                "metrics": {
                    0: {"cpu_request": 2, "memory_request": 8 * gib, "duration": 86400},
                }
            },
            "pod5": {
                "label_nerc_mghpcc_org_class": "math-201",
                "metrics": {
                    0: {
                        "cpu_request": 1,
                        "memory_request": 8 * gib,
                        "gpu_request": 1,
                        "gpu_type": invoice.GPU_A100,
                        "gpu_resource": invoice.WHOLE_GPU,
                        "duration": 86400
                    },
                }
            },
            "pod6": {
                "label_nerc_mghpcc_org_class": "cs-101",
                "gpu_type": invoice.GPU_A100_SXM4,
                "metrics": {
                    0: {
                        "cpu_request": 24,
                        "memory_request": 8 * gib,
                        "gpu_request": 1,
                        "gpu_type": invoice.GPU_A100_SXM4,
                        "gpu_resource": invoice.WHOLE_GPU,
                        "duration": 172800
                    },
                }
            },
        }

        test_metrics_dict = {
            "namespace1": ignored_namespace,
            "namespace2": classed_namespace,
        }

        expected_lines = [
            "Invoice Month,Project - Allocation,Project - Allocation ID,Manager (PI),Invoice Email,Invoice Address,Institution,Institution - Specific Code,SU Hours (GBhr or SUhr),SU Type,Rate,Cost",
            "2023-01,namespace2:noclass,namespace2:noclass,,,,,,96,OpenShift CPU,0.013,1.25",
            "2023-01,namespace2:math-201,namespace2:math-201,,,,,,96,OpenShift CPU,0.013,1.25",
            "2023-01,namespace2:math-201,namespace2:math-201,,,,,,24,OpenShift GPUA100,1.803,43.27",
            "2023-01,namespace2:cs-101,namespace2:cs-101,,,,,,48,OpenShift GPUA100SXM4,2.078,99.74",
        ]
        expected_output = "\n".join(expected_lines) + "\n"

        with tempfile.NamedTemporaryFile(mode="w+") as tmp:
            utils.write_metrics_by_classes(
                condensed_metrics_dict=test_metrics_dict,
                file_name=tmp.name,
                report_month="2023-01",
                rates=RATES,
                namespaces_with_classes=["namespace2"]
            )
            written = tmp.read()
            self.assertEqual(written, expected_output)
363+
261364
@mock.patch('openshift_metrics.utils.get_namespace_attributes')
262365
def test_write_metrics_by_namespace_decimal(self, mock_gna):
263366
"""This tests the inaccurate result we get when using floating

openshift_metrics/utils.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,3 +227,77 @@ def write_metrics_by_pod(condensed_metrics_dict, file_name, ignore_hours=None):
227227
rows.append(pod_obj.generate_pod_row(ignore_hours))
228228

229229
csv_writer(rows, file_name)
230+
231+
def write_metrics_by_classes(condensed_metrics_dict, file_name, report_month, rates, namespaces_with_classes, ignore_hours=None):
    """
    Write an invoice report in which usage is grouped by the pods' class label.

    Only namespaces listed in `namespaces_with_classes` are reported. Within
    those, a pod carrying a `label_nerc_mghpcc_org_class` value is billed to
    the project `namespace:class_name`; a pod without one is billed to
    `namespace:noclass`. The header row followed by each project's invoice
    rows is handed to `csv_writer` together with `file_name`.
    """
    header_row = [
        "Invoice Month",
        "Project - Allocation",
        "Project - Allocation ID",
        "Manager (PI)",
        "Invoice Email",
        "Invoice Address",
        "Institution",
        "Institution - Specific Code",
        "SU Hours (GBhr or SUhr)",
        "SU Type",
        "Rate",
        "Cost",
    ]

    # One invoice per "namespace:class" project, kept in first-seen order.
    invoices_by_project = {}

    for namespace, pods in condensed_metrics_dict.items():
        if namespace not in namespaces_with_classes:
            continue

        for pod, pod_dict in pods.items():
            class_label = pod_dict.get("label_nerc_mghpcc_org_class") or "noclass"
            project_name = f"{namespace}:{class_label}"

            class_invoice = invoices_by_project.get(project_name)
            if class_invoice is None:
                # Contact and institution fields are left blank here.
                class_invoice = invoice.ProjectInvoce(
                    invoice_month=report_month,
                    project=project_name,
                    project_id=project_name,
                    pi="",
                    invoice_email="",
                    invoice_address="",
                    intitution="",
                    institution_specific_code="",
                    rates=rates,
                    ignore_hours=ignore_hours,
                )
                invoices_by_project[project_name] = class_invoice

            for start_time, sample in pod_dict["metrics"].items():
                class_invoice.add_pod(
                    invoice.Pod(
                        pod_name=pod,
                        namespace=project_name,
                        start_time=start_time,
                        duration=sample["duration"],
                        cpu_request=Decimal(sample.get("cpu_request", 0)),
                        gpu_request=Decimal(sample.get("gpu_request", 0)),
                        # scaled down by 2**30 (presumably bytes -> GiB; confirm)
                        memory_request=Decimal(sample.get("memory_request", 0)) / 2**30,
                        gpu_type=sample.get("gpu_type"),
                        gpu_resource=sample.get("gpu_resource"),
                        node_hostname=sample.get("node"),
                        node_model=sample.get("node_model"),
                    )
                )

    rows = [header_row]
    for class_invoice in invoices_by_project.values():
        rows.extend(class_invoice.generate_invoice_rows(report_month))

    csv_writer(rows, file_name)

0 commit comments

Comments
 (0)