Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for GitRepository #4

Merged
merged 2 commits into from
Oct 7, 2024
Merged

Add support for GitRepository #4

merged 2 commits into from
Oct 7, 2024

Conversation

abstrask
Copy link
Owner

@abstrask abstrask commented Sep 27, 2024

This means the metaflow test file should now render a diff.

Copy link

github-actions bot commented Oct 7, 2024

Flux Helm diffs

infrastructure/base/argo-workflows/helm.yaml

--- /dev/fd/63	2024-10-07 07:09:37.485950240 +0000
+++ /dev/fd/62	2024-10-07 07:09:37.486950255 +0000
@@ -1,29 +1,31 @@
 apiVersion: v1
 kind: ServiceAccount
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow-controller
   namespace: argo
 ---
 # Source: argo-workflows/templates/server/server-sa.yaml
 apiVersion: v1
 kind: ServiceAccount
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows
   namespace: argo
 ---
 # Source: argo-workflows/templates/controller/workflow-controller-config-map.yaml
 apiVersion: v1
@@ -47,30 +49,32 @@
     nodeEvents:
       enabled: true
 kind: ConfigMap
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-cm
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow-controller-configmap
   namespace: argo
 ---
 # Source: argo-workflows/templates/controller/workflow-aggregate-roles.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
     rbac.authorization.k8s.io/aggregate-to-view: "true"
   name: argo-workflows-view
 rules:
   - apiGroups:
       - argoproj.io
@@ -99,16 +103,17 @@
 # Source: argo-workflows/templates/controller/workflow-aggregate-roles.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
     rbac.authorization.k8s.io/aggregate-to-edit: "true"
   name: argo-workflows-edit
 rules:
   - apiGroups:
       - argoproj.io
@@ -142,16 +147,17 @@
 # Source: argo-workflows/templates/controller/workflow-aggregate-roles.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
     rbac.authorization.k8s.io/aggregate-to-admin: "true"
   name: argo-workflows-admin
 rules:
   - apiGroups:
       - argoproj.io
@@ -185,16 +191,17 @@
 # Source: argo-workflows/templates/controller/workflow-controller-cluster-roles.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow-controller
 rules:
   - apiGroups:
       - ""
     resources:
@@ -330,16 +337,17 @@
 # Source: argo-workflows/templates/controller/workflow-controller-cluster-roles.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow-controller-cluster-template
 rules:
   - apiGroups:
       - argoproj.io
     resources:
@@ -353,16 +361,17 @@
 # Source: argo-workflows/templates/server/server-cluster-roles.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-server
 rules:
   - apiGroups:
       - ""
     resources:
@@ -423,16 +432,17 @@
 # Source: argo-workflows/templates/server/server-cluster-roles.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-server-cluster-template
 rules:
   - apiGroups:
       - argoproj.io
     resources:
@@ -449,16 +459,17 @@
 # Source: argo-workflows/templates/controller/workflow-controller-crb.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow-controller
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: argo-workflows-workflow-controller
@@ -470,16 +481,17 @@
 # Source: argo-workflows/templates/controller/workflow-controller-crb.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow-controller-cluster-template
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: argo-workflows-workflow-controller-cluster-template
@@ -491,16 +503,17 @@
 # Source: argo-workflows/templates/server/server-crb.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-server
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: argo-workflows-server
@@ -512,16 +525,17 @@
 # Source: argo-workflows/templates/server/server-crb.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-server-cluster-template
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: argo-workflows-server-cluster-template
@@ -533,16 +547,17 @@
 # Source: argo-workflows/templates/controller/workflow-role.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow
   namespace: default
 rules:
   - apiGroups:
       - ""
@@ -591,16 +606,17 @@
 # Source: argo-workflows/templates/controller/workflow-role.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow
   namespace: argo
 rules:
   - apiGroups:
       - ""
@@ -649,16 +665,17 @@
 # Source: argo-workflows/templates/controller/workflow-rb.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow
   namespace: default
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: Role
@@ -671,16 +688,17 @@
 # Source: argo-workflows/templates/controller/workflow-rb.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow
   namespace: argo
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: Role
@@ -693,17 +711,18 @@
 # Source: argo-workflows/templates/controller/workflow-controller-service.yaml
 apiVersion: v1
 kind: Service
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    app.kubernetes.io/version: v3.5.5
-    helm.sh/chart: argo-workflows-0.40.14
+    app.kubernetes.io/version: v3.5.11
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow-controller
   namespace: argo
 spec:
   ports:
     - name: metrics
@@ -719,17 +738,18 @@
 # Source: argo-workflows/templates/server/server-service.yaml
 apiVersion: v1
 kind: Service
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    app.kubernetes.io/version: v3.5.5
-    helm.sh/chart: argo-workflows-0.40.14
+    app.kubernetes.io/version: v3.5.11
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-server
   namespace: argo
 spec:
   ports:
     - port: 2746
@@ -743,17 +763,18 @@
 # Source: argo-workflows/templates/controller/workflow-controller-deployment.yaml
 apiVersion: apps/v1
 kind: Deployment
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    app.kubernetes.io/version: v3.5.5
-    helm.sh/chart: argo-workflows-0.40.14
+    app.kubernetes.io/version: v3.5.11
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-workflow-controller
   namespace: argo
 spec:
   replicas: 1
   revisionHistoryLimit: 10
@@ -762,24 +783,25 @@
       app.kubernetes.io/instance: argo-workflows
       app.kubernetes.io/name: argo-workflows-workflow-controller
   template:
     metadata:
       labels:
+        app: workflow-controller
         app.kubernetes.io/component: workflow-controller
         app.kubernetes.io/instance: argo-workflows
         app.kubernetes.io/managed-by: Helm
         app.kubernetes.io/name: argo-workflows-workflow-controller
         app.kubernetes.io/part-of: argo-workflows
-        app.kubernetes.io/version: v3.5.5
-        helm.sh/chart: argo-workflows-0.40.14
+        app.kubernetes.io/version: v3.5.11
+        helm.sh/chart: argo-workflows-0.42.5
     spec:
       containers:
         - args:
             - --configmap
             - argo-workflows-workflow-controller-configmap
             - --executor-image
-            - quay.io/argoproj/argoexec:v3.5.5
+            - quay.io/argoproj/argoexec:v3.5.11
             - --loglevel
             - info
             - --gloglevel
             - "0"
             - --log-format
@@ -805,11 +827,11 @@
                 fieldRef:
                   apiVersion: v1
                   fieldPath: metadata.name
             - name: LEADER_ELECTION_DISABLE
               value: "true"
-          image: quay.io/argoproj/workflow-controller:v3.5.5
+          image: quay.io/argoproj/workflow-controller:v3.5.11
           imagePullPolicy: Always
           livenessProbe:
             failureThreshold: 3
             httpGet:
               path: /healthz
@@ -843,17 +865,18 @@
 # Source: argo-workflows/templates/server/server-deployment.yaml
 apiVersion: apps/v1
 kind: Deployment
 metadata:
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    app.kubernetes.io/version: v3.5.5
-    helm.sh/chart: argo-workflows-0.40.14
+    app.kubernetes.io/version: v3.5.11
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-server
   namespace: argo
 spec:
   replicas: 1
   revisionHistoryLimit: 10
@@ -862,17 +885,18 @@
       app.kubernetes.io/instance: argo-workflows
       app.kubernetes.io/name: argo-workflows-server
   template:
     metadata:
       labels:
+        app: server
         app.kubernetes.io/component: server
         app.kubernetes.io/instance: argo-workflows
         app.kubernetes.io/managed-by: Helm
         app.kubernetes.io/name: argo-workflows-server
         app.kubernetes.io/part-of: argo-workflows
-        app.kubernetes.io/version: v3.5.5
-        helm.sh/chart: argo-workflows-0.40.14
+        app.kubernetes.io/version: v3.5.11
+        helm.sh/chart: argo-workflows-0.42.5
     spec:
       containers:
         - args:
             - server
             - --configmap=argo-workflows-workflow-controller-configmap
@@ -894,11 +918,11 @@
                 fieldRef:
                   apiVersion: v1
                   fieldPath: metadata.namespace
             - name: BASE_HREF
               value: /
-          image: quay.io/argoproj/argocli:v3.5.5
+          image: quay.io/argoproj/argocli:v3.5.11
           imagePullPolicy: Always
           name: argo-server
           ports:
             - containerPort: 2746
               name: web
@@ -938,16 +962,17 @@
 kind: Ingress
 metadata:
   annotations:
     cert-manager.io/cluster-issuer: letsencrypt
   labels:
+    app: server
     app.kubernetes.io/component: server
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-server
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
   name: argo-workflows-server
   namespace: argo
 spec:
   ingressClassName: nginx
   rules:
@@ -969,22 +994,24 @@
 # Source: argo-workflows/templates/controller/workflow-controller-servicemonitor.yaml
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
   labels:
+    app: workflow-controller
     app.kubernetes.io/component: workflow-controller
     app.kubernetes.io/instance: argo-workflows
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: argo-workflows-workflow-controller
     app.kubernetes.io/part-of: argo-workflows
-    helm.sh/chart: argo-workflows-0.40.14
+    helm.sh/chart: argo-workflows-0.42.5
     instance: primary
   name: argo-workflows-workflow-controller
   namespace: argo
 spec:
   endpoints:
-    - interval: 30s
+    - honorLabels: true
+      interval: 30s
       path: /metrics
       port: metrics
   namespaceSelector:
     matchNames:
       - argo

infrastructure/base/dcgm-exporter/helm.yaml

--- /dev/fd/63	2024-10-07 07:09:37.619952273 +0000
+++ /dev/fd/62	2024-10-07 07:09:37.619952273 +0000
@@ -1 +1,287 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  labels:
+    app.kubernetes.io/component: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: 3.5.0
+    helm.sh/chart: dcgm-exporter-3.5.0
+  name: dcgm-exporter
+  namespace: dcgm-exporter
+---
+# Source: dcgm-exporter/templates/metrics-configmap.yaml
+apiVersion: v1
+data:
+  metrics: |
+    # Format
+    # If line starts with a '#' it is considered a comment
+    # DCGM FIELD, Prometheus metric type, help message
 
+    # Clocks
+    DCGM_FI_DEV_SM_CLOCK,  gauge, SM clock frequency (in MHz).
+    DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
+
+    # Temperature
+    DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
+    DCGM_FI_DEV_GPU_TEMP,    gauge, GPU temperature (in C).
+
+    # Power
+    DCGM_FI_DEV_POWER_USAGE,              gauge, Power draw (in W).
+    DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
+
+    # PCIE
+    # DCGM_FI_PROF_PCIE_TX_BYTES,  counter, Total number of bytes transmitted through PCIe TX via NVML.
+    # DCGM_FI_PROF_PCIE_RX_BYTES,  counter, Total number of bytes received through PCIe RX via NVML.
+    DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
+
+    # Utilization (the sample period varies depending on the product)
+    DCGM_FI_DEV_GPU_UTIL,      gauge, GPU utilization (in %).
+    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
+    DCGM_FI_DEV_ENC_UTIL,      gauge, Encoder utilization (in %).
+    DCGM_FI_DEV_DEC_UTIL ,     gauge, Decoder utilization (in %).
+
+    # Errors and violations
+    DCGM_FI_DEV_XID_ERRORS,            gauge,   Value of the last XID error encountered.
+    # DCGM_FI_DEV_POWER_VIOLATION,       counter, Throttling duration due to power constraints (in us).
+    # DCGM_FI_DEV_THERMAL_VIOLATION,     counter, Throttling duration due to thermal constraints (in us).
+    # DCGM_FI_DEV_SYNC_BOOST_VIOLATION,  counter, Throttling duration due to sync-boost constraints (in us).
+    # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
+    # DCGM_FI_DEV_LOW_UTIL_VIOLATION,    counter, Throttling duration due to low utilization (in us).
+    # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
+
+    # Memory usage
+    DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
+    DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
+
+    # ECC
+    # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
+    # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
+    # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
+    # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
+
+    # Retired pages
+    # DCGM_FI_DEV_RETIRED_SBE,     counter, Total number of retired pages due to single-bit errors.
+    # DCGM_FI_DEV_RETIRED_DBE,     counter, Total number of retired pages due to double-bit errors.
+    # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
+
+    # NVLink
+    # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
+    # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
+    # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,   counter, Total number of NVLink retries.
+    # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
+    DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,            counter, Total number of NVLink bandwidth counters for all lanes.
+    # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0,               counter, The number of bytes of active NVLink rx or tx data including both header and payload.
+
+    # VGPU License status
+    DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
+
+    # Remapped rows
+    DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
+    DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS,   counter, Number of remapped rows for correctable errors
+    DCGM_FI_DEV_ROW_REMAP_FAILURE,           gauge,   Whether remapping of rows has failed
+
+    # DCP metrics
+    DCGM_FI_PROF_GR_ENGINE_ACTIVE,   gauge, Ratio of time the graphics engine is active.
+    # DCGM_FI_PROF_SM_ACTIVE,          gauge, The ratio of cycles an SM has at least 1 warp assigned.
+    # DCGM_FI_PROF_SM_OCCUPANCY,       gauge, The ratio of number of warps resident on an SM.
+    DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active.
+    DCGM_FI_PROF_DRAM_ACTIVE,        gauge, Ratio of cycles the device memory interface is active sending or receiving data.
+    # DCGM_FI_PROF_PIPE_FP64_ACTIVE,   gauge, Ratio of cycles the fp64 pipes are active.
+    # DCGM_FI_PROF_PIPE_FP32_ACTIVE,   gauge, Ratio of cycles the fp32 pipes are active.
+    # DCGM_FI_PROF_PIPE_FP16_ACTIVE,   gauge, Ratio of cycles the fp16 pipes are active.
+    DCGM_FI_PROF_PCIE_TX_BYTES,      counter, The number of bytes of active pcie tx data including both header and payload.
+    DCGM_FI_PROF_PCIE_RX_BYTES,      counter, The number of bytes of active pcie rx data including both header and payload.
+kind: ConfigMap
+metadata:
+  name: exporter-metrics-config-map
+  namespace: dcgm-exporter
+---
+# Source: dcgm-exporter/templates/role.yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  labels:
+    app.kubernetes.io/component: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: 3.5.0
+    helm.sh/chart: dcgm-exporter-3.5.0
+  name: dcgm-exporter-read-cm
+  namespace: dcgm-exporter
+rules:
+  - apiGroups:
+      - ""
+    resourceNames:
+      - exporter-metrics-config-map
+    resources:
+      - configmaps
+    verbs:
+      - get
+---
+# Source: dcgm-exporter/templates/rolebinding.yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/component: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: 3.5.0
+    helm.sh/chart: dcgm-exporter-3.5.0
+  name: dcgm-exporter
+  namespace: dcgm-exporter
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: dcgm-exporter-read-cm
+subjects:
+  - kind: ServiceAccount
+    name: dcgm-exporter
+    namespace: dcgm-exporter
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app.kubernetes.io/component: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: 3.5.0
+    helm.sh/chart: dcgm-exporter-3.5.0
+  name: dcgm-exporter
+  namespace: dcgm-exporter
+spec:
+  ports:
+    - name: metrics
+      port: 9400
+      protocol: TCP
+      targetPort: 9400
+  selector:
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/name: dcgm-exporter
+  type: ClusterIP
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  labels:
+    app.kubernetes.io/component: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: 3.5.0
+    helm.sh/chart: dcgm-exporter-3.5.0
+  name: dcgm-exporter
+  namespace: dcgm-exporter
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: dcgm-exporter
+      app.kubernetes.io/instance: dcgm-exporter
+      app.kubernetes.io/name: dcgm-exporter
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: dcgm-exporter
+        app.kubernetes.io/instance: dcgm-exporter
+        app.kubernetes.io/name: dcgm-exporter
+    spec:
+      containers:
+        - args:
+            - -f
+            - /etc/dcgm-exporter/dcp-metrics-included.csv
+          env:
+            - name: DCGM_EXPORTER_KUBERNETES
+              value: "true"
+            - name: DCGM_EXPORTER_LISTEN
+              value: :9400
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.7-3.5.0-ubuntu22.04
+          imagePullPolicy: IfNotPresent
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 9400
+            initialDelaySeconds: 45
+            periodSeconds: 5
+          name: exporter
+          ports:
+            - containerPort: 9400
+              name: metrics
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 9400
+            initialDelaySeconds: 45
+          resources:
+            limits:
+              memory: 256Mi
+            requests:
+              cpu: 100m
+              memory: 256Mi
+          securityContext:
+            capabilities:
+              add:
+                - SYS_ADMIN
+            runAsNonRoot: false
+            runAsUser: 0
+          volumeMounts:
+            - mountPath: /var/lib/kubelet/pod-resources
+              name: pod-gpu-resources
+              readOnly: true
+      nodeSelector:
+        nvidia.com/gpu: "true"
+      priorityClassName: infrastructure
+      serviceAccountName: dcgm-exporter
+      tolerations:
+        - key: CriticalAddonsOnly
+          operator: Exists
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      volumes:
+        - hostPath:
+            path: /var/lib/kubelet/pod-resources
+          name: pod-gpu-resources
+  updateStrategy:
+    rollingUpdate:
+      maxSurge: 0
+      maxUnavailable: 25%
+    type: RollingUpdate
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    app.kubernetes.io/component: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: 3.5.0
+    helm.sh/chart: dcgm-exporter-3.5.0
+    instance: primary
+  name: dcgm-exporter
+  namespace: dcgm-exporter
+spec:
+  endpoints:
+    - honorLabels: false
+      interval: 120s
+      path: /metrics
+      port: metrics
+      relabelings: []
+  namespaceSelector:
+    matchNames:
+      - dcgm-exporter
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: dcgm-exporter
+      app.kubernetes.io/instance: dcgm-exporter
+      app.kubernetes.io/name: dcgm-exporter

infrastructure/base/metaflow/helm.yaml

--- /dev/fd/63	2024-10-07 07:09:38.884941632 +0000
+++ /dev/fd/62	2024-10-07 07:09:38.885941617 +0000
@@ -3,11 +3,11 @@
 metadata:
   labels:
     app.kubernetes.io/instance: metaflow
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: metaflow-service
-    app.kubernetes.io/version: 2.2.4
+    app.kubernetes.io/version: v2.3.6
     helm.sh/chart: metaflow-service-0.2.0
   name: metaflow-service
 spec:
   ports:
     - name: metadata
@@ -29,12 +29,12 @@
 metadata:
   labels:
     app.kubernetes.io/instance: metaflow
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: metaflow-ui
-    app.kubernetes.io/version: 2.1.0
-    helm.sh/chart: metaflow-ui-0.1.0
+    app.kubernetes.io/version: v2.3.6
+    helm.sh/chart: metaflow-ui-0.1.1
   name: metaflow-ui
 spec:
   ports:
     - name: http
       port: 8083
@@ -51,12 +51,12 @@
 metadata:
   labels:
     app.kubernetes.io/instance: metaflow
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: metaflow-ui-static
-    app.kubernetes.io/version: 2.1.0
-    helm.sh/chart: metaflow-ui-0.1.0
+    app.kubernetes.io/version: v2.3.6
+    helm.sh/chart: metaflow-ui-0.1.1
   name: metaflow-ui-static
 spec:
   ports:
     - name: http
       port: 3000
@@ -73,11 +73,11 @@
 metadata:
   labels:
     app.kubernetes.io/instance: metaflow
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: metaflow-service
-    app.kubernetes.io/version: 2.2.4
+    app.kubernetes.io/version: v2.3.6
     helm.sh/chart: metaflow-service-0.2.0
   name: metaflow-service
 spec:
   replicas: 2
   selector:
@@ -104,11 +104,11 @@
               value: metaflow
             - name: MF_METADATA_DB_USER
               value: metaflow
             - name: MF_METADATA_DB_HOST
               value: metaflow-postgresql
-          image: public.ecr.aws/outerbounds/metaflow_metadata_service:2.2.4
+          image: public.ecr.aws/outerbounds/metaflow_metadata_service:v2.3.6
           imagePullPolicy: IfNotPresent
           livenessProbe:
             httpGet:
               path: /ping
               port: http
@@ -137,11 +137,11 @@
               value: metaflow
             - name: MF_METADATA_DB_USER
               value: metaflow
             - name: MF_METADATA_DB_HOST
               value: metaflow-postgresql
-          image: public.ecr.aws/outerbounds/metaflow_metadata_service:2.2.4
+          image: public.ecr.aws/outerbounds/metaflow_metadata_service:v2.3.6
           name: db-migrations
       securityContext: {}
       serviceAccountName: metaflow
 ---
 # Source: metaflow/charts/metaflow-ui/templates/backend_deployment.yaml
@@ -150,12 +150,12 @@
 metadata:
   labels:
     app.kubernetes.io/instance: metaflow
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: metaflow-ui
-    app.kubernetes.io/version: 2.1.0
-    helm.sh/chart: metaflow-ui-0.1.0
+    app.kubernetes.io/version: v2.3.6
+    helm.sh/chart: metaflow-ui-0.1.1
   name: metaflow-ui
 spec:
   replicas: 2
   selector:
     matchLabels:
@@ -197,11 +197,11 @@
               value: metaflow
             - name: MF_METADATA_DB_USER
               value: metaflow
             - name: MF_METADATA_DB_HOST
               value: metaflow-postgresql
-          image: netflixoss/metaflow_metadata_service:2.1.0
+          image: public.ecr.aws/outerbounds/metaflow_metadata_service:v2.3.6
           imagePullPolicy: IfNotPresent
           livenessProbe:
             httpGet:
               path: /api/ping
               port: http
@@ -225,12 +225,12 @@
 metadata:
   labels:
     app.kubernetes.io/instance: metaflow
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: metaflow-ui-static
-    app.kubernetes.io/version: 2.1.0
-    helm.sh/chart: metaflow-ui-0.1.0
+    app.kubernetes.io/version: v2.3.6
+    helm.sh/chart: metaflow-ui-0.1.1
   name: metaflow-ui-static
 spec:
   replicas: 2
   selector:
     matchLabels:
@@ -241,11 +241,11 @@
       labels:
         app.kubernetes.io/instance: metaflow
         app.kubernetes.io/name: metaflow-ui-static
     spec:
       containers:
-        - image: public.ecr.aws/outerbounds/metaflow_ui:v1.0.1
+        - image: public.ecr.aws/outerbounds/metaflow_ui:v1.1.4
           imagePullPolicy: IfNotPresent
           livenessProbe:
             httpGet:
               path: /
               port: http
@@ -271,11 +271,11 @@
     helm.sh/hook: test
   labels:
     app.kubernetes.io/instance: metaflow
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: metaflow-service
-    app.kubernetes.io/version: 2.2.4
+    app.kubernetes.io/version: v2.3.6
     helm.sh/chart: metaflow-service-0.2.0
   name: metaflow-service-test-connection
 spec:
   containers:
     - args:
@@ -294,12 +294,12 @@
     helm.sh/hook: test
   labels:
     app.kubernetes.io/instance: metaflow
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: metaflow-ui
-    app.kubernetes.io/version: 2.1.0
-    helm.sh/chart: metaflow-ui-0.1.0
+    app.kubernetes.io/version: v2.3.6
+    helm.sh/chart: metaflow-ui-0.1.1
   name: metaflow-ui-test-connection
 spec:
   containers:
     - args:
         - metaflow-ui:8083

infrastructure/base/nvidia-device-plugin/helm.yaml

--- /dev/fd/63	2024-10-07 07:09:39.315935458 +0000
+++ /dev/fd/62	2024-10-07 07:09:39.316935444 +0000
@@ -1,14 +1,70 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  labels:
+    app.kubernetes.io/instance: nvidia-device-plugin
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: nvidia-device-plugin
+    app.kubernetes.io/version: 0.15.0
+    helm.sh/chart: nvidia-device-plugin-0.15.0
+  name: nvidia-device-plugin-service-account
+  namespace: nvidia-device-plugin
+---
+# Source: nvidia-device-plugin/templates/role.yml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/instance: nvidia-device-plugin
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: nvidia-device-plugin
+    app.kubernetes.io/version: 0.15.0
+    helm.sh/chart: nvidia-device-plugin-0.15.0
+  name: nvidia-device-plugin-role
+  namespace: nvidia-device-plugin
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+    verbs:
+      - get
+      - list
+      - watch
+---
+# Source: nvidia-device-plugin/templates/role-binding.yml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/instance: nvidia-device-plugin
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: nvidia-device-plugin
+    app.kubernetes.io/version: 0.15.0
+    helm.sh/chart: nvidia-device-plugin-0.15.0
+  name: nvidia-device-plugin-role-binding
+  namespace: nvidia-device-plugin
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: nvidia-device-plugin-role
+subjects:
+  - kind: ServiceAccount
+    name: nvidia-device-plugin-service-account
+    namespace: nvidia-device-plugin
+---
+# Source: nvidia-device-plugin/templates/daemonset-device-plugin.yml
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
   labels:
     app.kubernetes.io/instance: nvidia-device-plugin
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: nvidia-device-plugin
-    app.kubernetes.io/version: 0.14.5
-    helm.sh/chart: nvidia-device-plugin-0.14.5
+    app.kubernetes.io/version: 0.15.0
+    helm.sh/chart: nvidia-device-plugin-0.15.0
   name: nvidia-device-plugin
   namespace: nvidia-device-plugin
 spec:
   selector:
     matchLabels:
@@ -19,15 +75,42 @@
       annotations: {}
       labels:
         app.kubernetes.io/instance: nvidia-device-plugin
         app.kubernetes.io/name: nvidia-device-plugin
     spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: feature.node.kubernetes.io/pci-10de.present
+                    operator: In
+                    values:
+                      - "true"
+              - matchExpressions:
+                  - key: feature.node.kubernetes.io/cpu-model.vendor_id
+                    operator: In
+                    values:
+                      - NVIDIA
+              - matchExpressions:
+                  - key: nvidia.com/gpu.present
+                    operator: In
+                    values:
+                      - "true"
       containers:
-        - env:
+        - command:
+            - nvidia-device-plugin
+          env:
+            - name: MPS_ROOT
+              value: /run/nvidia/mps
             - name: NVIDIA_MIG_MONITOR_DEVICES
               value: all
-          image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: all
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: compute,utility
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
           imagePullPolicy: IfNotPresent
           name: nvidia-device-plugin-ctr
           resources:
             limits:
               memory: 64Mi
@@ -39,10 +122,17 @@
               add:
                 - SYS_ADMIN
           volumeMounts:
             - mountPath: /var/lib/kubelet/device-plugins
               name: device-plugin
+            # The MPS /dev/shm is needed to allow for MPS daemon health-checking.
+            - mountPath: /dev/shm
+              name: mps-shm
+            - mountPath: /mps
+              name: mps-root
+            - mountPath: /var/run/cdi
+              name: cdi-root
       nodeSelector:
         nvidia.com/gpu: "true"
       priorityClassName: system-node-critical
       securityContext: {}
       tolerations:
@@ -62,10 +152,151 @@
           operator: Exists
       volumes:
         - hostPath:
             path: /var/lib/kubelet/device-plugins
           name: device-plugin
+        - hostPath:
+            path: /run/nvidia/mps
+            type: DirectoryOrCreate
+          name: mps-root
+        - hostPath:
+            path: /run/nvidia/mps/shm
+          name: mps-shm
+        - hostPath:
+            path: /var/run/cdi
+            type: DirectoryOrCreate
+          name: cdi-root
   updateStrategy:
     rollingUpdate:
       maxUnavailable: 25%
     type: RollingUpdate
 ---
+# Source: nvidia-device-plugin/templates/daemonset-mps-control-daemon.yml
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  labels:
+    app.kubernetes.io/instance: nvidia-device-plugin
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: nvidia-device-plugin
+    app.kubernetes.io/version: 0.15.0
+    helm.sh/chart: nvidia-device-plugin-0.15.0
+  name: nvidia-device-plugin-mps-control-daemon
+  namespace: nvidia-device-plugin
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: nvidia-device-plugin
+      app.kubernetes.io/name: nvidia-device-plugin
+  template:
+    metadata:
+      annotations: {}
+      labels:
+        app.kubernetes.io/instance: nvidia-device-plugin
+        app.kubernetes.io/name: nvidia-device-plugin
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: feature.node.kubernetes.io/pci-10de.present
+                    operator: In
+                    values:
+                      - "true"
+              - matchExpressions:
+                  - key: feature.node.kubernetes.io/cpu-model.vendor_id
+                    operator: In
+                    values:
+                      - NVIDIA
+              - matchExpressions:
+                  - key: nvidia.com/gpu.present
+                    operator: In
+                    values:
+                      - "true"
+      containers:
+        - command:
+            - mps-control-daemon
+          env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: spec.nodeName
+            - name: NVIDIA_MIG_MONITOR_DEVICES
+              value: all
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: all
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: compute,utility
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          imagePullPolicy: IfNotPresent
+          name: mps-control-daemon-ctr
+          resources:
+            limits:
+              memory: 64Mi
+            requests:
+              cpu: 10m
+              memory: 64Mi
+          securityContext:
+            privileged: true
+          volumeMounts:
+            - mountPath: /dev/shm
+              name: mps-shm
+            - mountPath: /mps
+              name: mps-root
+      initContainers:
+        - command:
+            - mps-control-daemon
+            - mount-shm
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          name: mps-control-daemon-mounts
+          resources:
+            limits:
+              memory: 64Mi
+            requests:
+              cpu: 10m
+              memory: 64Mi
+          securityContext:
+            privileged: true
+          volumeMounts:
+            - mountPath: /mps
+              mountPropagation: Bidirectional
+              name: mps-root
+      nodeSelector:
+        nvidia.com/gpu: "true"
+        # We only deploy this pod if the following sharing label is applied.
+        nvidia.com/mps.capable: "true"
+      priorityClassName: system-node-critical
+      securityContext: {}
+      tolerations:
+        - key: CriticalAddonsOnly
+          operator: Exists
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+        - effect: NoSchedule
+          key: veo.co/nodegroup-purpose
+          operator: Exists
+        - effect: NoSchedule
+          key: veo.co/processing-type
+          operator: Exists
+        - effect: NoSchedule
+          key: ${flux_toleration_key}
+          operator: Exists
+      volumes:
+        - hostPath:
+            path: /run/nvidia/mps
+            type: DirectoryOrCreate
+          name: mps-root
+        - hostPath:
+            path: /run/nvidia/mps/shm
+          name: mps-shm
+  updateStrategy:
+    rollingUpdate:
+      maxUnavailable: 25%
+    type: RollingUpdate
+---
+
+---
+
+---

infrastructure/base/weave-gitops-helm2oci/helm.yaml

No changes

infrastructure/base/weave-gitops-helmrepo/helm.yaml

--- /dev/fd/63	2024-10-07 07:09:42.280964967 +0000
+++ /dev/fd/62	2024-10-07 07:09:42.280964967 +0000
@@ -1,7 +1,7 @@
-Digest: sha256:ecf9b11fb56392b0b3dd1dc16e90529ece808e33711f83a14f3b1aa65dcad5c1
-Pulled: ghcr.io/weaveworks/charts/weave-gitops:4.0.31
+Digest: sha256:2e2460c3a971dcc034aea485586a69d7c14e1d4f064619d77492375f7d3ba0c4
+Pulled: ghcr.io/weaveworks/charts/weave-gitops:4.0.33
 ---
 # Source: weave-gitops/templates/network-policy.yaml
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
@@ -39,12 +39,12 @@
 metadata:
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.35.0
+    helm.sh/chart: weave-gitops-4.0.33
   name: weave-gitops
 ---
 # Source: weave-gitops/templates/admin-user-creds.yaml
 apiVersion: v1
 data:
@@ -212,12 +212,12 @@
 metadata:
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.35.0
+    helm.sh/chart: weave-gitops-4.0.33
   name: weave-gitops
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: weave-gitops
@@ -314,12 +314,12 @@
   annotations:
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.35.0
+    helm.sh/chart: weave-gitops-4.0.33
   name: weave-gitops
 spec:
   ports:
     - name: http
       port: 9001
@@ -337,12 +337,12 @@
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
     app.kubernetes.io/part-of: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.35.0
+    helm.sh/chart: weave-gitops-4.0.33
     weave.works/app: weave-gitops-oss
   name: weave-gitops
 spec:
   replicas: 1
   selector:
@@ -365,11 +365,11 @@
           env:
             - name: WEAVE_GITOPS_FEATURE_TENANCY
               value: "true"
             - name: WEAVE_GITOPS_FEATURE_CLUSTER
               value: "false"
-          image: ghcr.io/weaveworks/wego-app:v0.33.0
+          image: ghcr.io/weaveworks/wego-app:v0.35.0
           imagePullPolicy: IfNotPresent
           livenessProbe:
             httpGet:
               path: /
               port: http
@@ -403,12 +403,12 @@
     cert-manager.io/cluster-issuer: letsencrypt
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.35.0
+    helm.sh/chart: weave-gitops-4.0.33
   name: weave-gitops
 spec:
   ingressClassName: nginx
   rules:
     - host: gitops.${flux_base_domain_name}
@@ -423,28 +423,5 @@
             pathType: Prefix
   tls:
     - hosts:
         - gitops.${flux_base_domain_name}
       secretName: gitops-tls
----
-# Source: weave-gitops/templates/tests/test-connection.yaml
-apiVersion: v1
-kind: Pod
-metadata:
-  annotations:
-    helm.sh/hook: test
-  labels:
-    app.kubernetes.io/instance: weave-gitops
-    app.kubernetes.io/managed-by: Helm
-    app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
-  name: weave-gitops-test-connection
-spec:
-  containers:
-    - args:
-        - weave-gitops:9001
-      command:
-        - wget
-      image: busybox
-      name: wget
-  restartPolicy: Never

infrastructure/base/weave-gitops-ocirepo/helm.yaml

--- /dev/fd/63	2024-10-07 07:09:43.828984181 +0000
+++ /dev/fd/62	2024-10-07 07:09:43.828984181 +0000
@@ -1,7 +1,7 @@
-Digest: sha256:ecf9b11fb56392b0b3dd1dc16e90529ece808e33711f83a14f3b1aa65dcad5c1
-Pulled: ghcr.io/weaveworks/charts/weave-gitops:4.0.31
+Digest: sha256:68e0ca052b23f0fd11c4923192cec0d65f8129553eb44605f117f9824dd0dae8
+Pulled: ghcr.io/weaveworks/charts/weave-gitops:4.0.32
 ---
 # Source: weave-gitops/templates/network-policy.yaml
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
@@ -39,12 +39,12 @@
 metadata:
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.34.0
+    helm.sh/chart: weave-gitops-4.0.32
   name: weave-gitops
 ---
 # Source: weave-gitops/templates/admin-user-creds.yaml
 apiVersion: v1
 data:
@@ -212,12 +212,12 @@
 metadata:
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.34.0
+    helm.sh/chart: weave-gitops-4.0.32
   name: weave-gitops
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: weave-gitops
@@ -314,12 +314,12 @@
   annotations:
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.34.0
+    helm.sh/chart: weave-gitops-4.0.32
   name: weave-gitops
 spec:
   ports:
     - name: http
       port: 9001
@@ -337,12 +337,12 @@
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
     app.kubernetes.io/part-of: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.34.0
+    helm.sh/chart: weave-gitops-4.0.32
     weave.works/app: weave-gitops-oss
   name: weave-gitops
 spec:
   replicas: 1
   selector:
@@ -365,11 +365,11 @@
           env:
             - name: WEAVE_GITOPS_FEATURE_TENANCY
               value: "true"
             - name: WEAVE_GITOPS_FEATURE_CLUSTER
               value: "false"
-          image: ghcr.io/weaveworks/wego-app:v0.33.0
+          image: ghcr.io/weaveworks/wego-app:v0.34.0
           imagePullPolicy: IfNotPresent
           livenessProbe:
             httpGet:
               path: /
               port: http
@@ -403,12 +403,12 @@
     cert-manager.io/cluster-issuer: letsencrypt
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.34.0
+    helm.sh/chart: weave-gitops-4.0.32
   name: weave-gitops
 spec:
   ingressClassName: nginx
   rules:
     - host: gitops.${flux_base_domain_name}
@@ -434,12 +434,12 @@
     helm.sh/hook: test
   labels:
     app.kubernetes.io/instance: weave-gitops
     app.kubernetes.io/managed-by: Helm
     app.kubernetes.io/name: weave-gitops
-    app.kubernetes.io/version: v0.33.0
-    helm.sh/chart: weave-gitops-4.0.31
+    app.kubernetes.io/version: v0.34.0
+    helm.sh/chart: weave-gitops-4.0.32
   name: weave-gitops-test-connection
 spec:
   containers:
     - args:
         - weave-gitops:9001

@abstrask abstrask merged commit 28adb85 into dev Oct 7, 2024
1 check passed
@abstrask abstrask deleted the add-gitrepo-support branch October 7, 2024 07:11
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant