From 9c9456216aefea79f5f106cb39beb71bbfdc707b Mon Sep 17 00:00:00 2001 From: Naved Ansari Date: Wed, 30 Oct 2024 12:33:21 -0400 Subject: [PATCH] Some updates to the Kubernetes manifests so they work with the new code changes I have added a couple of shell scripts to produce the report and collect metrics. The cronjobs now call them. The Dockerfile has been updated to include the shell files. A ConfigMap has been added that mounts the GPU node map file at the appropriate directory. --- Dockerfile | 4 +++- bin/collect_metrics.sh | 5 +++++ bin/produce_report.sh | 6 +++++ ...y-openshift-metrics-collector-cronjob.yaml | 8 +------ k8s/base/gpu-node-map-configmap.yaml | 22 +++++++++++++++++++ k8s/base/kustomization.yaml | 1 + k8s/base/produce-report-cronjob.yaml | 8 ++++++- 7 files changed, 45 insertions(+), 9 deletions(-) create mode 100755 bin/collect_metrics.sh create mode 100755 bin/produce_report.sh create mode 100644 k8s/base/gpu-node-map-configmap.yaml diff --git a/Dockerfile b/Dockerfile index c2ff164..2be712c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,5 +5,7 @@ COPY requirements.txt ./ RUN pip install -r requirements.txt COPY openshift_metrics/ /app/openshift_metrics +COPY bin/collect_metrics.sh /app/collect_metrics.sh +COPY bin/produce_report.sh /app/produce_report.sh -CMD ["python", "openshift_metrics/openshift_prometheus_metrics.py", "--upload-to-s3"] +CMD ["./collect_metrics.sh"] diff --git a/bin/collect_metrics.sh b/bin/collect_metrics.sh new file mode 100755 index 0000000..cf6cb34 --- /dev/null +++ b/bin/collect_metrics.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env sh + +python -m openshift_metrics.openshift_prometheus_metrics \ + --output-file /tmp/metrics.json \ + --upload-to-s3 diff --git a/bin/produce_report.sh b/bin/produce_report.sh new file mode 100755 index 0000000..dfc5d91 --- /dev/null +++ b/bin/produce_report.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env sh + +python -m openshift_metrics.merge /data/*.json \ + --invoice-file /tmp/invoice.csv \ + --pod-report-file 
/tmp/pod-report.csv \ + --upload-to-s3 diff --git a/k8s/base/daily-openshift-metrics-collector-cronjob.yaml b/k8s/base/daily-openshift-metrics-collector-cronjob.yaml index 2951e5d..e7da03c 100644 --- a/k8s/base/daily-openshift-metrics-collector-cronjob.yaml +++ b/k8s/base/daily-openshift-metrics-collector-cronjob.yaml @@ -29,11 +29,5 @@ spec: secretKeyRef: name: openshift-metrics-b2-bucket key: secret-access-key - volumeMounts: - - name: data-volume - mountPath: /data - command: ["/bin/sh", "-c", "cd /data && python /app/openshift_metrics/openshift_prometheus_metrics.py --upload-to-s3"] - volumes: - - name: data-volume - emptyDir: {} + command: ["./collect_metrics.sh"] restartPolicy: OnFailure diff --git a/k8s/base/gpu-node-map-configmap.yaml b/k8s/base/gpu-node-map-configmap.yaml new file mode 100644 index 0000000..b841af3 --- /dev/null +++ b/k8s/base/gpu-node-map-configmap.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: gpu-node-map +data: + gpu_node_map.json: | + { + "wrk-88": "Tesla-V100-PCIE-32GB", + "wrk-89": "Tesla-V100-PCIE-32GB", + "wrk-94": "NVIDIA-A100-SXM4-40GB", + "wrk-95": "NVIDIA-A100-SXM4-40GB", + "wrk-97": "NVIDIA-A100-SXM4-40GB", + "wrk-98": "NVIDIA-A100-SXM4-40GB", + "wrk-99": "NVIDIA-A100-SXM4-40GB", + "wrk-102": "Tesla-V100-PCIE-32GB", + "wrk-103": "Tesla-V100-PCIE-32GB", + "wrk-104": "Tesla-V100-PCIE-32GB", + "wrk-105": "Tesla-V100-PCIE-32GB", + "wrk-106": "Tesla-V100-PCIE-32GB", + "wrk-107": "Tesla-V100-PCIE-32GB", + "wrk-108": "Tesla-V100-PCIE-32GB" + } diff --git a/k8s/base/kustomization.yaml b/k8s/base/kustomization.yaml index 5769c19..1fbae9b 100644 --- a/k8s/base/kustomization.yaml +++ b/k8s/base/kustomization.yaml @@ -2,3 +2,4 @@ resources: - daily-openshift-metrics-collector-cronjob.yaml - produce-report-cronjob.yaml - metrics-downloader-configmap.yaml + - gpu-node-map-configmap.yaml diff --git a/k8s/base/produce-report-cronjob.yaml b/k8s/base/produce-report-cronjob.yaml index b5c3319..d67e1f5 100644 --- 
a/k8s/base/produce-report-cronjob.yaml +++ b/k8s/base/produce-report-cronjob.yaml @@ -35,7 +35,10 @@ spec: volumeMounts: - name: data-volume mountPath: /data - command: ["/bin/sh", "-c", "cd /data && python /app/openshift_metrics/merge.py /data/*.json --upload-to-s3"] + - name: gpu-node-map + mountPath: /app/gpu_node_map.json + subPath: gpu_node_map.json + command: ["./produce_report.sh"] initContainers: - name: download-metrics image: amazon/aws-cli @@ -63,4 +66,7 @@ spec: configMap: name: metrics-downloader defaultMode: 0555 + - name: gpu-node-map + configMap: + name: gpu-node-map restartPolicy: OnFailure