Merge pull request #106 from naved001/rhods-notebook

naved001 · web-flow · commit 77a3fd765678 · 2025-02-26T08:24:44.000-05:00
diff --git a/openshift_metrics/merge.py b/openshift_metrics/merge.py
@@ -148,6 +148,14 @@ def main():
         rates=rates,
         ignore_hours=ignore_hours,
     )
+    utils.write_metrics_by_classes(
+        condensed_metrics_dict=condensed_metrics_dict,
+        file_name=f"by-classes-{invoice_file}",
+        report_month=report_month,
+        rates=rates,
+        namespaces_with_classes=["rhods-notebooks"],
+        ignore_hours=ignore_hours,
+    )
     utils.write_metrics_by_pod(condensed_metrics_dict, pod_report_file, ignore_hours)
 
     if args.upload_to_s3:
diff --git a/openshift_metrics/metrics_processor.py b/openshift_metrics/metrics_processor.py
@@ -25,7 +25,6 @@ def __init__(
 
     def merge_metrics(self, metric_name, metric_list):
         """Merge metrics (cpu, memory, gpu) by pod"""
-
         for metric in metric_list:
             pod = metric["metric"]["pod"]
             namespace = metric["metric"]["namespace"]
@@ -34,6 +33,11 @@ def merge_metrics(self, metric_name, metric_list):
             self.merged_data.setdefault(namespace, {})
             self.merged_data[namespace].setdefault(pod, {"metrics": {}})
 
+            if metric_name == "cpu_request":
+                class_name = metric["metric"].get("label_nerc_mghpcc_org_class")
+                if class_name is not None:
+                    self.merged_data[namespace][pod]["label_nerc_mghpcc_org_class"] = class_name
+
             gpu_type, gpu_resource, node_model = self._extract_gpu_info(
                 metric_name, metric
             )
@@ -193,3 +197,21 @@ def insert_node_labels(node_labels: list, resource_request_metrics: list) -> lis
                 "machine"
             )
         return resource_request_metrics
+
+    @staticmethod
+    def insert_pod_labels(pod_labels: list, resource_request_metrics: list) -> list:
+        """Copy `label_nerc_mghpcc_org_class` from pod_labels onto matching request metrics.
+
+        Pods are matched on (namespace, pod) because pod names are only unique
+        within a namespace; metrics are updated in place and the list is returned.
+        """
+        class_by_pod = {}
+        for pod_label in pod_labels:
+            labels = pod_label["metric"]
+            key = (labels.get("namespace"), labels["pod"])
+            class_by_pod[key] = labels.get("label_nerc_mghpcc_org_class")
+        for request in resource_request_metrics:
+            key = (request["metric"].get("namespace"), request["metric"]["pod"])
+            if key in class_by_pod:
+                request["metric"]["label_nerc_mghpcc_org_class"] = class_by_pod[key]
+        return resource_request_metrics
diff --git a/openshift_metrics/openshift_prometheus_metrics.py b/openshift_metrics/openshift_prometheus_metrics.py
@@ -31,6 +31,7 @@
 MEMORY_REQUEST = 'kube_pod_resource_request{unit="bytes", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
 GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
 KUBE_NODE_LABELS = 'kube_node_labels{label_nvidia_com_gpu_product!=""}'
+KUBE_POD_LABELS = 'kube_pod_labels{label_nerc_mghpcc_org_class!=""}'  # only series with a non-empty class label
 
 def main():
     """This method kick starts the process of collecting and saving the metrics"""
@@ -87,10 +88,17 @@ def main():
     cpu_request_metrics = prom_client.query_metric(
         CPU_REQUEST, report_start_date, report_end_date
     )
+
+    try:
+        pod_labels = prom_client.query_metric(KUBE_POD_LABELS, report_start_date, report_end_date)
+        metrics_dict["cpu_metrics"] = MetricsProcessor.insert_pod_labels(pod_labels, cpu_request_metrics)
+    except utils.EmptyResultError:
+        logger.info(f"No pod labels found for the period {report_start_date} to {report_end_date}")
+        metrics_dict["cpu_metrics"] = cpu_request_metrics
+
     memory_request_metrics = prom_client.query_metric(
         MEMORY_REQUEST, report_start_date, report_end_date
     )
-    metrics_dict["cpu_metrics"] = cpu_request_metrics
     metrics_dict["memory_metrics"] = memory_request_metrics
 
     # because if nobody requests a GPU then we will get an empty set
diff --git a/openshift_metrics/tests/test_utils.py b/openshift_metrics/tests/test_utils.py
@@ -258,6 +258,109 @@ def test_write_metrics_log(self, mock_gna):
             self.assertEqual(tmp.read(), expected_output)
 
 
+class TestWriteMetricsByClasses(TestCase):  # covers utils.write_metrics_by_classes
+
+    @mock.patch('openshift_metrics.utils.get_namespace_attributes')
+    def test_write_metrics_log(self, mock_gna):  # NOTE(review): name repeats the test named in this hunk's header; consider test_write_metrics_by_classes
+        mock_gna.return_value = {  # NOTE(review): write_metrics_by_classes does not call get_namespace_attributes directly; this mock may be unneeded - confirm
+            'namespace1': {
+                'cf_pi': 'PI1',
+                'cf_project_id': '123',
+                'institution_code': '76'
+            },
+            'namespace2': {
+                'cf_pi': 'PI2',
+                'cf_project_id': '456',
+            }
+        }
+        test_metrics_dict = {
+            "namespace1": { # not passed in namespaces_with_classes below, so it is left out of the report entirely
+                "pod1": {
+                    "metrics": {
+                        0: {
+                            "cpu_request": 2,
+                            "memory_request": 4 * 2**30,
+                            "duration": 43200
+                        },
+                    }
+                },
+            },
+            "namespace2": {
+                "pod2": { # no class label, so it is reported under namespace2:noclass
+                    "metrics": {
+                        0: {
+                            "cpu_request": 1,
+                            "memory_request": 8 * 2**30,
+                            "duration": 172800
+                        },
+                    }
+                },
+                "pod3": {
+                    "label_nerc_mghpcc_org_class": "math-201",
+                    "metrics": {
+                        0: {
+                            "cpu_request": 1,
+                            "memory_request": 8 * 2**30,
+                            "duration": 86400
+                        },
+                    }
+                },
+                "pod4": {
+                    "label_nerc_mghpcc_org_class": "math-201",
+                    "metrics": {
+                        0: {
+                            "cpu_request": 2,
+                            "memory_request": 8 * 2**30,
+                            "duration": 86400
+                        },
+                    }
+                },
+                "pod5": {
+                    "label_nerc_mghpcc_org_class": "math-201",
+                    "metrics": {
+                        0: {
+                            "cpu_request": 1,
+                            "memory_request": 8 * 2**30,
+                            "gpu_request": 1,
+                            "gpu_type": invoice.GPU_A100,
+                            "gpu_resource": invoice.WHOLE_GPU,
+                            "duration": 86400
+                        },
+                    }
+                },
+                "pod6": {
+                    "label_nerc_mghpcc_org_class": "cs-101",
+                    "gpu_type": invoice.GPU_A100_SXM4,  # NOTE(review): pod-level key is not read by write_metrics_by_classes; only the per-epoch gpu_type below is used
+                    "metrics": {
+                        0: {
+                            "cpu_request": 24,
+                            "memory_request": 8 * 2**30,
+                            "gpu_request": 1,
+                            "gpu_type": invoice.GPU_A100_SXM4,
+                            "gpu_resource": invoice.WHOLE_GPU,
+                            "duration": 172800
+                        },
+                    }
+            },
+            }
+        }
+
+        expected_output = ("Invoice Month,Project - Allocation,Project - Allocation ID,Manager (PI),Invoice Email,Invoice Address,Institution,Institution - Specific Code,SU Hours (GBhr or SUhr),SU Type,Rate,Cost\n"
+                            "2023-01,namespace2:noclass,namespace2:noclass,,,,,,96,OpenShift CPU,0.013,1.25\n"
+                            "2023-01,namespace2:math-201,namespace2:math-201,,,,,,96,OpenShift CPU,0.013,1.25\n"
+                            "2023-01,namespace2:math-201,namespace2:math-201,,,,,,24,OpenShift GPUA100,1.803,43.27\n"
+                            "2023-01,namespace2:cs-101,namespace2:cs-101,,,,,,48,OpenShift GPUA100SXM4,2.078,99.74\n")
+
+        with tempfile.NamedTemporaryFile(mode="w+") as tmp:  # presumably csv_writer writes via tmp.name; the result is read back through this handle
+            utils.write_metrics_by_classes(
+                condensed_metrics_dict=test_metrics_dict,
+                file_name=tmp.name,
+                report_month="2023-01",
+                rates=RATES,
+                namespaces_with_classes=["namespace2"]
+                )
+            self.assertEqual(tmp.read(), expected_output)
+
     @mock.patch('openshift_metrics.utils.get_namespace_attributes')
     def test_write_metrics_by_namespace_decimal(self, mock_gna):
         """This tests the inaccurate result we get when using floating
diff --git a/openshift_metrics/utils.py b/openshift_metrics/utils.py
@@ -227,3 +227,77 @@ def write_metrics_by_pod(condensed_metrics_dict, file_name, ignore_hours=None):
                 rows.append(pod_obj.generate_pod_row(ignore_hours))
 
     csv_writer(rows, file_name)
+
+def write_metrics_by_classes(condensed_metrics_dict, file_name, report_month, rates, namespaces_with_classes, ignore_hours=None):
+    """
+    Write an invoice CSV that aggregates usage by class label instead of namespace.
+
+    Namespaces not listed in `namespaces_with_classes` are skipped. Every pod of a
+    listed namespace is added to the project "<namespace>:<class>", where <class>
+    is the pod's `label_nerc_mghpcc_org_class` entry, or "noclass" when that entry
+    is missing or empty. The project name doubles as the project ID, and the PI,
+    email, address and institution fields are passed as empty strings.
+    """
+    header_row = [
+        "Invoice Month",
+        "Project - Allocation",
+        "Project - Allocation ID",
+        "Manager (PI)",
+        "Invoice Email",
+        "Invoice Address",
+        "Institution",
+        "Institution - Specific Code",
+        "SU Hours (GBhr or SUhr)",
+        "SU Type",
+        "Rate",
+        "Cost",
+    ]
+    invoices_by_project = {}
+
+    for namespace, pods in condensed_metrics_dict.items():
+        if namespace not in namespaces_with_classes:
+            continue
+
+        for pod_name, pod_info in pods.items():
+            class_label = pod_info.get("label_nerc_mghpcc_org_class") or "noclass"
+            project_name = f"{namespace}:{class_label}"
+
+            # One invoice per project, created the first time the project is seen.
+            project_invoice = invoices_by_project.get(project_name)
+            if project_invoice is None:
+                project_invoice = invoice.ProjectInvoce(
+                    invoice_month=report_month,
+                    project=project_name,
+                    project_id=project_name,
+                    pi="",
+                    invoice_email="",
+                    invoice_address="",
+                    intitution="",
+                    institution_specific_code="",
+                    rates=rates,
+                    ignore_hours=ignore_hours,
+                )
+                invoices_by_project[project_name] = project_invoice
+
+            # Entries under "metrics" are keyed by their start time.
+            for start_time, usage in pod_info["metrics"].items():
+                project_invoice.add_pod(
+                    invoice.Pod(
+                        pod_name=pod_name,
+                        namespace=project_name,
+                        start_time=start_time,
+                        duration=usage["duration"],
+                        cpu_request=Decimal(usage.get("cpu_request", 0)),
+                        gpu_request=Decimal(usage.get("gpu_request", 0)),
+                        memory_request=Decimal(usage.get("memory_request", 0)) / 2**30,
+                        gpu_type=usage.get("gpu_type"),
+                        gpu_resource=usage.get("gpu_resource"),
+                        node_hostname=usage.get("node"),
+                        node_model=usage.get("node_model"),
+                    )
+                )
+
+    rows = [header_row]
+    for project_invoice in invoices_by_project.values():
+        rows.extend(project_invoice.generate_invoice_rows(report_month))
+    csv_writer(rows, file_name)