From 5dbdd66b27d72a7cad58ed7f017eaa1575e8cc06 Mon Sep 17 00:00:00 2001 From: Alejandro Escobar Date: Thu, 16 Jan 2020 11:59:26 -0800 Subject: [PATCH] [nvidia] chore: chart improvements (#357) * [nvidia] chore: chart improvements * fixed requirements file * fixed missing field in chart file * values file fixed --- staging/nvidia/Chart.yaml | 13 ++++-- staging/nvidia/charts/grafana/Chart.yaml | 2 +- .../templates/configmap-dashboards.yaml | 40 +++++++++--------- staging/nvidia/charts/grafana/values.yaml | 5 --- .../charts/nvidia-dcgm-exporter/Chart.yaml | 4 +- .../templates/daemonset.yaml | 42 ++++++++++--------- .../templates/tests/test-connection.yaml | 2 +- .../charts/nvidia-dcgm-exporter/values.yaml | 28 ------------- .../charts/nvidia-device-plugin/Chart.yaml | 2 +- .../nvidia-device-plugin/templates/NOTES.txt | 21 ---------- .../templates/daemonset.yaml | 37 ++++++++-------- .../templates/tests/test-connection.yaml | 2 +- .../charts/nvidia-device-plugin/values.yaml | 33 +-------------- .../nvidia/charts/nvidia-driver/Chart.yaml | 2 +- .../charts/nvidia-driver/templates/NOTES.txt | 21 ---------- .../nvidia-driver/templates/daemonset.yaml | 28 +++++++------ .../templates/tests/test-connection.yaml | 2 +- .../nvidia/charts/nvidia-driver/values.yaml | 30 ++----------- staging/nvidia/requirements.yaml | 13 ++++++ staging/nvidia/values.yaml | 27 ++++++++++-- 20 files changed, 132 insertions(+), 222 deletions(-) delete mode 100644 staging/nvidia/charts/nvidia-device-plugin/templates/NOTES.txt delete mode 100644 staging/nvidia/charts/nvidia-driver/templates/NOTES.txt create mode 100644 staging/nvidia/requirements.yaml diff --git a/staging/nvidia/Chart.yaml b/staging/nvidia/Chart.yaml index d28db505c..e35aaa2f9 100644 --- a/staging/nvidia/Chart.yaml +++ b/staging/nvidia/Chart.yaml @@ -1,8 +1,13 @@ apiVersion: v1 name: nvidia description: Nvidia GPU driver and device plugin for running Nvidia GPU -home: https://docs.nvidia.com/datacenter/kubernetes/kubernetes-upstream/ -version: 0.2.0 -maintainers: -- name: gilbert88 +keywords: + - gpu + - nvidia +version: 0.3.0 appVersion: "0.2.0" +home: http://github.com/mesosphere/charts/staging +sources: + - http://github.com/mesosphere/charts/staging +maintainers: + - name: gilbert88 diff --git a/staging/nvidia/charts/grafana/Chart.yaml b/staging/nvidia/charts/grafana/Chart.yaml index f924ed0f4..2ef6e1663 100644 --- a/staging/nvidia/charts/grafana/Chart.yaml +++ b/staging/nvidia/charts/grafana/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v1 description: Nvidia GPU Grafana Dashboard name: grafana -version: 0.1.0 +version: 0.1.1 appVersion: "0.1.0" diff --git a/staging/nvidia/charts/grafana/templates/configmap-dashboards.yaml b/staging/nvidia/charts/grafana/templates/configmap-dashboards.yaml index 5a76ca63e..ad58d5f1a 100644 --- a/staging/nvidia/charts/grafana/templates/configmap-dashboards.yaml +++ b/staging/nvidia/charts/grafana/templates/configmap-dashboards.yaml @@ -1,12 +1,8 @@ -# Generated from 'nodes' from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml -# Do not change in-place! In order to change this file first read following link: -# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack -{{- if .Values.enabled }} apiVersion: v1 kind: ConfigMap metadata: name: {{ template "grafana.fullname" . }} - namespace: {{ $.Values.namespace }} + namespace: {{ .Release.Namespace }} labels: {{- if $.Values.dashboards.label }} {{ $.Values.dashboards.label }}: "1" @@ -93,7 +89,7 @@ data: "step": 50 } ], - "title": "GPU SM Clocks", + "title": "SM Clock Frequency", "tooltip": { "msResolution": false, "shared": true, @@ -169,7 +165,7 @@ data: "step": 50 } ], - "title": "GPU Memory Clocks", + "title": "Memory Clock Frequency", "tooltip": { "msResolution": false, "shared": true, @@ -256,7 +252,7 @@ data: "step": 50 } ], - "title": "GPU Utilization", + "title": "Utilization", "tooltip": { "msResolution": false, "shared": true, @@ -334,7 +330,7 @@ data: "step": 50 } ], - "title": "GPU Mem Cpy Utilization", + "title": "Memory Copy Utilization", "tooltip": { "msResolution": false, "shared": true, @@ -422,7 +418,7 @@ data: "refId": "A" } ], - "title": "GPU Power Usage", + "title": "Power Usage", "tooltip": { "msResolution": false, "shared": true, @@ -509,7 +505,7 @@ data: } ], "thresholds": "1800, 2200", - "title": "GPU Power Total", + "title": "Power Total", "transparent": false, "type": "singlestat", "valueFontSize": "80%", @@ -581,7 +577,7 @@ data: "refId": "A" } ], - "title": "GPU Temperature", + "title": "Temperature", "tooltip": { "msResolution": false, "shared": true, @@ -669,7 +665,7 @@ data: } ], "thresholds": "83, 87", - "title": "GPU Avg. Temp", + "title": "Average Temperature", "transparent": false, "type": "singlestat", "valueFontSize": "80%", @@ -741,7 +737,7 @@ data: "refId": "A" } ], - "title": "GPU Memory Temperature", + "title": "Memory Temperature", "tooltip": { "msResolution": false, "shared": true, @@ -829,7 +825,7 @@ data: } ], "thresholds": "83, 87", - "title": "GPU Avg. Memory Temp", + "title": "Average Memory Temperature", "transparent": false, "type": "singlestat", "valueFontSize": "80%", @@ -902,7 +898,7 @@ data: "step": 50 } ], - "title": "GPU Framebuffer Mem Used", + "title": "Framebuffer Memory Used", "tooltip": { "msResolution": false, "shared": true, @@ -980,7 +976,7 @@ data: "step": 50 } ], - "title": "GPU Framebuffer Mem Free", + "title": "Framebuffer Memory Free", "tooltip": { "msResolution": false, "shared": true, @@ -1016,7 +1012,10 @@ data: "schemaVersion": 14, "sharedCrosshair": false, "style": "dark", - "tags": [], + "tags": [ + "Nvidia", + "GPU" + ], "templating": { "list": [ { @@ -1119,7 +1118,6 @@ data: ] }, "timezone": "browser", - "title": "GPU", + "title": "GPUs / Nvidia", "version": 2 - } -{{- end }} \ No newline at end of file + } \ No newline at end of file diff --git a/staging/nvidia/charts/grafana/values.yaml b/staging/nvidia/charts/grafana/values.yaml index 9fa59f037..8389ad27a 100644 --- a/staging/nvidia/charts/grafana/values.yaml +++ b/staging/nvidia/charts/grafana/values.yaml @@ -1,7 +1,2 @@ -# -# grafana dashboards -# -enabled: true -namespace: kubeaddons dashboards: label: grafana_dashboard diff --git a/staging/nvidia/charts/nvidia-dcgm-exporter/Chart.yaml b/staging/nvidia/charts/nvidia-dcgm-exporter/Chart.yaml index 35fafd497..3190c62e3 100644 --- a/staging/nvidia/charts/nvidia-dcgm-exporter/Chart.yaml +++ b/staging/nvidia/charts/nvidia-dcgm-exporter/Chart.yaml @@ -2,4 +2,6 @@ apiVersion: v1 appVersion: "0.1.0" description: Nvidia GPU Metrics Exporter name: nvidia-dcgm-exporter -version: 0.1.0 +version: 0.1.1 +maintainers: + - name: gilbert88 \ No newline at end of file diff --git a/staging/nvidia/charts/nvidia-dcgm-exporter/templates/daemonset.yaml b/staging/nvidia/charts/nvidia-dcgm-exporter/templates/daemonset.yaml index 98715f9fb..55eae887c 100644 --- a/staging/nvidia/charts/nvidia-dcgm-exporter/templates/daemonset.yaml +++ b/staging/nvidia/charts/nvidia-dcgm-exporter/templates/daemonset.yaml @@ -1,9 +1,8 @@ -{{- if .Values.enabled }} apiVersion: apps/v1 kind: DaemonSet metadata: name: {{ template "nvidia-dcgm-exporter.fullname" . }} - namespace: {{ $.Values.namespace }} + namespace: {{ .Release.Namespace }} spec: selector: matchLabels: @@ -19,12 +18,23 @@ spec: {{- end }} {{- if .Values.tolerations }} tolerations: -{{ toYaml .Values.tolerations | indent 8 }} +{{- toYaml .Values.tolerations | indent 8 }} {{- end }} +{{- if .Values.nodeSelector }} + nodeSelector: +{{- toYaml .Values.nodeSelector | indent 8 }} +{{- end }} +{{- if .Values.priorityClassName }} priorityClassName: {{ .Values.priorityClassName }} +{{- end }} containers: - - image: "{{ .Values.podNvidiaGpuMetricsExporter.image.repository }}:{{ .Values.podNvidiaGpuMetricsExporter.image.tag }}" - name: pod-nvidia-gpu-metrics-exporter + - name: pod-nvidia-gpu-metrics-exporter + image: "{{ .Values.podNvidiaGpuMetricsExporter.image.repository }}:{{ .Values.podNvidiaGpuMetricsExporter.image.tag }}" + imagePullPolicy: {{ .Values.podNvidiaGpuMetricsExporter.image.pullPolicy }} +{{- if .Values.podNvidiaGpuMetricsExporter.resources }} + resources: +{{ toYaml .Values.podNvidiaGpuMetricsExporter.resources | indent 12 }} +{{- end }} ports: - name: gpu-metrics containerPort: 9400 @@ -32,9 +42,6 @@ spec: securityContext: runAsNonRoot: false runAsUser: 0 - imagePullPolicy: {{ .Values.podNvidiaGpuMetricsExporter.image.pullPolicy }} - resources: -{{ toYaml .Values.podNvidiaGpuMetricsExporter.resources | indent 12 }} volumeMounts: - name: pod-gpu-resources readOnly: true @@ -42,26 +49,23 @@ spec: - name: device-metrics readOnly: true mountPath: /run/prometheus - - image: "{{ .Values.nvidiaDcgmExporter.image.repository }}:{{ .Values.nvidiaDcgmExporter.image.tag }}" - name: nvidia-dcgm-exporter + - name: nvidia-dcgm-exporter + image: "{{ .Values.nvidiaDcgmExporter.image.repository }}:{{ .Values.nvidiaDcgmExporter.image.tag }}" + imagePullPolicy: {{ .Values.nvidiaDcgmExporter.image.pullPolicy }} +{{- if .Values.nvidiaDcgmExporter.resources }} + resources: +{{- toYaml .Values.nvidiaDcgmExporter.resources | indent 12 }} +{{- end }} securityContext: runAsNonRoot: false runAsUser: 0 - imagePullPolicy: {{ .Values.nvidiaDcgmExporter.image.pullPolicy }} - resources: -{{ toYaml .Values.nvidiaDcgmExporter.resources | indent 12 }} volumeMounts: - name: device-metrics mountPath: /run/prometheus -{{- if .Values.nodeSelector }} - nodeSelector: -{{ toYaml .Values.nodeSelector | indent 8 }} -{{- end }} volumes: - name: pod-gpu-resources hostPath: path: /var/lib/kubelet/pod-resources - name: device-metrics emptyDir: - medium: Memory -{{- end }} \ No newline at end of file + medium: Memory \ No newline at end of file diff --git a/staging/nvidia/charts/nvidia-dcgm-exporter/templates/tests/test-connection.yaml b/staging/nvidia/charts/nvidia-dcgm-exporter/templates/tests/test-connection.yaml index 4b090ab2f..e5633cdd3 100644 --- a/staging/nvidia/charts/nvidia-dcgm-exporter/templates/tests/test-connection.yaml +++ b/staging/nvidia/charts/nvidia-dcgm-exporter/templates/tests/test-connection.yaml @@ -11,5 +11,5 @@ spec: - name: wget image: busybox command: ['wget'] - args: ['{{ include "nvidia-dcgm-exporter.fullname" . }}:{{ .Values.service.port }}'] + args: ['{{ include "nvidia-dcgm-exporter.fullname" . }}:80'] restartPolicy: Never diff --git a/staging/nvidia/charts/nvidia-dcgm-exporter/values.yaml b/staging/nvidia/charts/nvidia-dcgm-exporter/values.yaml index 65f12f4b9..a7368a132 100644 --- a/staging/nvidia/charts/nvidia-dcgm-exporter/values.yaml +++ b/staging/nvidia/charts/nvidia-dcgm-exporter/values.yaml @@ -1,13 +1,3 @@ -# Default values for nvidia-dcgm-exporter. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -enabled: true - -namespace: kubeaddons - -replicaCount: 1 - podNvidiaGpuMetricsExporter: image: repository: nvidia/pod-gpu-metrics-exporter @@ -22,24 +12,6 @@ nvidiaDcgmExporter: pullPolicy: IfNotPresent resources: {} -service: - type: ClusterIP - port: 80 - -ingress: - enabled: false - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - hosts: - - host: chart-example.local - paths: [] - - tls: [] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - nodeSelector: {} priorityClassName: "" diff --git a/staging/nvidia/charts/nvidia-device-plugin/Chart.yaml b/staging/nvidia/charts/nvidia-device-plugin/Chart.yaml index 0d1490d00..6d47d876e 100644 --- a/staging/nvidia/charts/nvidia-device-plugin/Chart.yaml +++ b/staging/nvidia/charts/nvidia-device-plugin/Chart.yaml @@ -3,6 +3,6 @@ name: nvidia-device-plugin home: https://github.com/mesosphere/charts appVersion: "1.0" description: Nvidia Device Plugin for konvoy -version: 0.1.1 +version: 0.1.2 maintainers: - name: gilbert88 diff --git a/staging/nvidia/charts/nvidia-device-plugin/templates/NOTES.txt b/staging/nvidia/charts/nvidia-device-plugin/templates/NOTES.txt deleted file mode 100644 index ac3675b6b..000000000 --- a/staging/nvidia/charts/nvidia-device-plugin/templates/NOTES.txt +++ /dev/null @@ -1,21 +0,0 @@ -1. Get the application URL by running these commands: -{{- if .Values.ingress.enabled }} -{{- range $host := .Values.ingress.hosts }} - {{- range .paths }} - http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ . }} - {{- end }} -{{- end }} -{{- else if contains "NodePort" .Values.service.type }} - export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "nvidia-device-plugin.fullname" . }}) - export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") - echo http://$NODE_IP:$NODE_PORT -{{- else if contains "LoadBalancer" .Values.service.type }} - NOTE: It may take a few minutes for the LoadBalancer IP to be available. - You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "nvidia-device-plugin.fullname" . }}' - export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "nvidia-device-plugin.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') - echo http://$SERVICE_IP:{{ .Values.service.port }} -{{- else if contains "ClusterIP" .Values.service.type }} - export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "nvidia-device-plugin.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") - echo "Visit http://127.0.0.1:8080 to use your application" - kubectl port-forward $POD_NAME 8080:80 -{{- end }} diff --git a/staging/nvidia/charts/nvidia-device-plugin/templates/daemonset.yaml b/staging/nvidia/charts/nvidia-device-plugin/templates/daemonset.yaml index 922b44fc0..d2191fdec 100644 --- a/staging/nvidia/charts/nvidia-device-plugin/templates/daemonset.yaml +++ b/staging/nvidia/charts/nvidia-device-plugin/templates/daemonset.yaml @@ -1,51 +1,48 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: nvidia-device-plugin-daemonset + name: {{ template "nvidia-device-plugin.fullname" . }} namespace: kube-system spec: selector: matchLabels: - name: nvidia-device-plugin-ds + name: {{ template "nvidia-device-plugin.fullname" . }} updateStrategy: type: RollingUpdate template: metadata: - # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler - # reserves resources for critical add-on pods so that they can be rescheduled after - # a failure. This annotation works in tandem with the toleration below. - annotations: - scheduler.alpha.kubernetes.io/critical-pod: "" labels: - name: nvidia-device-plugin-ds + name: {{ template "nvidia-device-plugin.fullname" . }} spec: {{- if .Values.initContainers }} initContainers: - {{- toYaml .Values.initContainers | nindent 8 }} +{{- toYaml .Values.initContainers | nindent 8 }} {{- end }} {{- if .Values.tolerations }} tolerations: -{{ toYaml .Values.tolerations | indent 8 }} +{{- toYaml .Values.tolerations | indent 8 }} {{- end }} priorityClassName: {{ .Values.priorityClassName }} containers: - - image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" - name: nvidia-device-plugin-ctr + - name: nvidia-device-plugin-ctr + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} securityContext: allowPrivilegeEscalation: false capabilities: drop: ["ALL"] - imagePullPolicy: {{ .Values.image.pullPolicy }} +{{- if .Values.resources }} resources: -{{ toYaml .Values.resources | indent 12 }} +{{- toYaml .Values.resources | indent 12 }} +{{- end }} volumeMounts: - - name: device-plugin - mountPath: /var/lib/kubelet/device-plugins + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins {{- if .Values.nodeSelector }} nodeSelector: -{{ toYaml .Values.nodeSelector | indent 8 }} +{{- toYaml .Values.nodeSelector | indent 8 }} {{- end }} volumes: - - name: device-plugin - hostPath: - path: /var/lib/kubelet/device-plugins + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/staging/nvidia/charts/nvidia-device-plugin/templates/tests/test-connection.yaml b/staging/nvidia/charts/nvidia-device-plugin/templates/tests/test-connection.yaml index 4b05868f6..cbd92625d 100644 --- a/staging/nvidia/charts/nvidia-device-plugin/templates/tests/test-connection.yaml +++ b/staging/nvidia/charts/nvidia-device-plugin/templates/tests/test-connection.yaml @@ -11,5 +11,5 @@ spec: - name: wget image: busybox command: ['wget'] - args: ['{{ include "nvidia-device-plugin.fullname" . }}:{{ .Values.service.port }}'] + args: ['{{ include "nvidia-device-plugin.fullname" . }}:80'] restartPolicy: Never diff --git a/staging/nvidia/charts/nvidia-device-plugin/values.yaml b/staging/nvidia/charts/nvidia-device-plugin/values.yaml index c60b0529f..5a98b703d 100644 --- a/staging/nvidia/charts/nvidia-device-plugin/values.yaml +++ b/staging/nvidia/charts/nvidia-device-plugin/values.yaml @@ -1,9 +1,3 @@ -# Default values for nvidia-device-plugin. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -replicaCount: 1 - image: repository: nvidia/k8s-device-plugin tag: 1.0.0-beta4 @@ -13,24 +7,6 @@ imagePullSecrets: [] nameOverride: "" fullnameOverride: "" -service: - type: ClusterIP - port: 80 - -ingress: - enabled: false - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - hosts: - - host: chart-example.local - paths: [] - - tls: [] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - resources: {} # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little @@ -47,14 +23,7 @@ nodeSelector: {} priorityClassName: system-node-critical -tolerations: - # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. - # This, along with the annotation above marks this pod as a critical add-on. - - key: CriticalAddonsOnly - operator: Exists - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule +tolerations: [] affinity: {} diff --git a/staging/nvidia/charts/nvidia-driver/Chart.yaml b/staging/nvidia/charts/nvidia-driver/Chart.yaml index 6b9ee0084..7e7cfcbf2 100644 --- a/staging/nvidia/charts/nvidia-driver/Chart.yaml +++ b/staging/nvidia/charts/nvidia-driver/Chart.yaml @@ -3,6 +3,6 @@ name: nvidia-driver home: https://github.com/mesosphere/charts appVersion: "1.0" description: Nvidia Driver for konvoy -version: 0.1.1 +version: 0.1.2 maintainers: - name: gilbert88 diff --git a/staging/nvidia/charts/nvidia-driver/templates/NOTES.txt b/staging/nvidia/charts/nvidia-driver/templates/NOTES.txt deleted file mode 100644 index 7c2f42eef..000000000 --- a/staging/nvidia/charts/nvidia-driver/templates/NOTES.txt +++ /dev/null @@ -1,21 +0,0 @@ -1. Get the application URL by running these commands: -{{- if .Values.ingress.enabled }} -{{- range $host := .Values.ingress.hosts }} - {{- range .paths }} - http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ . }} - {{- end }} -{{- end }} -{{- else if contains "NodePort" .Values.service.type }} - export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "nvidia-driver.fullname" . }}) - export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") - echo http://$NODE_IP:$NODE_PORT -{{- else if contains "LoadBalancer" .Values.service.type }} - NOTE: It may take a few minutes for the LoadBalancer IP to be available. - You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "nvidia-driver.fullname" . }}' - export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "nvidia-driver.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') - echo http://$SERVICE_IP:{{ .Values.service.port }} -{{- else if contains "ClusterIP" .Values.service.type }} - export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "nvidia-driver.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") - echo "Visit http://127.0.0.1:8080 to use your application" - kubectl port-forward $POD_NAME 8080:80 -{{- end }} diff --git a/staging/nvidia/charts/nvidia-driver/templates/daemonset.yaml b/staging/nvidia/charts/nvidia-driver/templates/daemonset.yaml index 5710975b5..b2f38996f 100644 --- a/staging/nvidia/charts/nvidia-driver/templates/daemonset.yaml +++ b/staging/nvidia/charts/nvidia-driver/templates/daemonset.yaml @@ -1,46 +1,48 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: nvidia-driver-daemonset + name: {{ template "nvidia-driver.fullname" . }} namespace: kube-system spec: selector: matchLabels: - name: nvidia-driver-ds + name: {{ template "nvidia-driver.fullname" . }} template: metadata: labels: - name: nvidia-driver-ds + name: {{ template "nvidia-driver.fullname" . }} spec: +{{- if .Values.nodeSelector }} + nodeSelector: +{{- toYaml .Values.nodeSelector | indent 8 }} +{{- end }} {{- if .Values.initContainers }} initContainers: - {{- toYaml .Values.initContainers | nindent 8 }} +{{- toYaml .Values.initContainers | nindent 8 }} {{- end }} {{- if .Values.tolerations }} tolerations: -{{ toYaml .Values.tolerations | indent 8 }} +{{- toYaml .Values.tolerations | indent 8 }} {{- end }} priorityClassName: {{ .Values.priorityClassName }} hostPID: true containers: - - image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" - name: nvidia-driver + - name: nvidia-driver + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} args: ["--accept-license"] securityContext: privileged: true - imagePullPolicy: {{ .Values.image.pullPolicy }} +{{- if .Values.resources }} resources: -{{ toYaml .Values.resources | indent 12 }} +{{- toYaml .Values.resources | indent 12 }} +{{- end }} volumeMounts: - name: update-driver mountPath: /run/kernel/postinst.d - name: driver-path mountPropagation: Bidirectional mountPath: /run/nvidia -{{- if .Values.nodeSelector }} - nodeSelector: -{{ toYaml .Values.nodeSelector | indent 8 }} -{{- end }} volumes: - name: update-driver hostPath: diff --git a/staging/nvidia/charts/nvidia-driver/templates/tests/test-connection.yaml b/staging/nvidia/charts/nvidia-driver/templates/tests/test-connection.yaml index c13d31e26..ebf56ba97 100644 --- a/staging/nvidia/charts/nvidia-driver/templates/tests/test-connection.yaml +++ b/staging/nvidia/charts/nvidia-driver/templates/tests/test-connection.yaml @@ -11,5 +11,5 @@ spec: - name: wget image: busybox command: ['wget'] - args: ['{{ include "nvidia-driver.fullname" . }}:{{ .Values.service.port }}'] + args: ['{{ include "nvidia-driver.fullname" . }}:80'] restartPolicy: Never diff --git a/staging/nvidia/charts/nvidia-driver/values.yaml b/staging/nvidia/charts/nvidia-driver/values.yaml index 2fec0188f..ce218c3e7 100644 --- a/staging/nvidia/charts/nvidia-driver/values.yaml +++ b/staging/nvidia/charts/nvidia-driver/values.yaml @@ -1,36 +1,12 @@ -# Default values for nvidia-driver. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -replicaCount: 1 - image: - repository: "nvidia/driver" - tag: "418.87.01-ubuntu16.04" - pullPolicy: "IfNotPresent" + repository: nvidia/driver + tag: 418.87.01-ubuntu16.04 + pullPolicy: IfNotPresent imagePullSecrets: [] nameOverride: "" fullnameOverride: "" -service: - type: ClusterIP - port: 80 - -ingress: - enabled: false - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - hosts: - - host: chart-example.local - paths: [] - - tls: [] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - resources: {} # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little diff --git a/staging/nvidia/requirements.yaml b/staging/nvidia/requirements.yaml new file mode 100644 index 000000000..93147552c --- /dev/null +++ b/staging/nvidia/requirements.yaml @@ -0,0 +1,13 @@ +dependencies: + - name: grafana + version: 0.1.1 + condition: grafana.enabled + - name: nvidia-dcgm-exporter + version: 0.1.1 + condition: nvidia-dcgm-exporter.enabled + - name: nvidia-device-plugin + version: 0.1.2 + condition: nvidia-device-plugin.enabled + - name: nvidia-driver + version: 0.1.2 + condition: nvidia-driver.enabled \ No newline at end of file diff --git a/staging/nvidia/values.yaml b/staging/nvidia/values.yaml index 18818f806..c6f866818 100644 --- a/staging/nvidia/values.yaml +++ b/staging/nvidia/values.yaml @@ -1,4 +1,23 @@ -global: - serviceLabels: {} -serving: - domain: example.com +grafana: + enabled: true + +nvidia-dcgm-exporter: + enabled: true + resources: {} + nodeSelector: {} + tolerations: [] + initContainers: [] + +nvidia-device-plugin: + enabled: true + resources: {} + nodeSelector: {} + tolerations: [] + initContainers: [] + +nvidia-driver: + enabled: true + resources: {} + nodeSelector: {} + tolerations: [] + initContainers: []