Skip to content

Commit

Permalink
[nvidia] chore: chart improvements (#357)
Browse files Browse the repository at this point in the history
* [nvidia] chore: chart improvements

* fixed requirements file

* fixed missing field in chart file

* values file fixed
  • Loading branch information
alejandroEsc authored Jan 16, 2020
1 parent 1efeb5c commit 5dbdd66
Show file tree
Hide file tree
Showing 20 changed files with 132 additions and 222 deletions.
13 changes: 9 additions & 4 deletions staging/nvidia/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
apiVersion: v1
name: nvidia
description: Nvidia GPU driver and device plugin for running Nvidia GPU
home: https://docs.nvidia.com/datacenter/kubernetes/kubernetes-upstream/
version: 0.2.0
maintainers:
- name: gilbert88
keywords:
- gpu
- nvidia
version: 0.3.0
appVersion: "0.2.0"
home: http://github.com/mesosphere/charts/staging
sources:
- http://github.com/mesosphere/charts/staging
maintainers:
- name: gilbert88
2 changes: 1 addition & 1 deletion staging/nvidia/charts/grafana/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
apiVersion: v1
description: Nvidia GPU Grafana Dashboard
name: grafana
version: 0.1.0
version: 0.1.1
appVersion: "0.1.0"
40 changes: 19 additions & 21 deletions staging/nvidia/charts/grafana/templates/configmap-dashboards.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
# Generated from 'nodes' from https://raw.githubusercontent.com/coreos/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml
# Do not change in-place! In order to change this file first read following link:
# https://github.com/helm/charts/tree/master/stable/prometheus-operator/hack
{{- if .Values.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ template "grafana.fullname" . }}
namespace: {{ $.Values.namespace }}
namespace: {{ .Release.Namespace }}
labels:
{{- if $.Values.dashboards.label }}
{{ $.Values.dashboards.label }}: "1"
Expand Down Expand Up @@ -93,7 +89,7 @@ data:
"step": 50
}
],
"title": "GPU SM Clocks",
"title": "SM Clock Frequency",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -169,7 +165,7 @@ data:
"step": 50
}
],
"title": "GPU Memory Clocks",
"title": "Memory Clock Frequency",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -256,7 +252,7 @@ data:
"step": 50
}
],
"title": "GPU Utilization",
"title": "Utilization",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -334,7 +330,7 @@ data:
"step": 50
}
],
"title": "GPU Mem Cpy Utilization",
"title": "Memory Copy Utilization",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -422,7 +418,7 @@ data:
"refId": "A"
}
],
"title": "GPU Power Usage",
"title": "Power Usage",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -509,7 +505,7 @@ data:
}
],
"thresholds": "1800, 2200",
"title": "GPU Power Total",
"title": "Power Total",
"transparent": false,
"type": "singlestat",
"valueFontSize": "80%",
Expand Down Expand Up @@ -581,7 +577,7 @@ data:
"refId": "A"
}
],
"title": "GPU Temperature",
"title": "Temperature",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -669,7 +665,7 @@ data:
}
],
"thresholds": "83, 87",
"title": "GPU Avg. Temp",
"title": "Average Temperature",
"transparent": false,
"type": "singlestat",
"valueFontSize": "80%",
Expand Down Expand Up @@ -741,7 +737,7 @@ data:
"refId": "A"
}
],
"title": "GPU Memory Temperature",
"title": "Memory Temperature",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -829,7 +825,7 @@ data:
}
],
"thresholds": "83, 87",
"title": "GPU Avg. Memory Temp",
"title": "Average Memory Temperature",
"transparent": false,
"type": "singlestat",
"valueFontSize": "80%",
Expand Down Expand Up @@ -902,7 +898,7 @@ data:
"step": 50
}
],
"title": "GPU Framebuffer Mem Used",
"title": "Framebuffer Memory Used",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -980,7 +976,7 @@ data:
"step": 50
}
],
"title": "GPU Framebuffer Mem Free",
"title": "Framebuffer Memory Free",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -1016,7 +1012,10 @@ data:
"schemaVersion": 14,
"sharedCrosshair": false,
"style": "dark",
"tags": [],
"tags": [
"Nvidia",
"GPU"
],
"templating": {
"list": [
{
Expand Down Expand Up @@ -1119,7 +1118,6 @@ data:
]
},
"timezone": "browser",
"title": "GPU",
"title": "GPUs / Nvidia",
"version": 2
}
{{- end }}
}
5 changes: 0 additions & 5 deletions staging/nvidia/charts/grafana/values.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,2 @@
#
# grafana dashboards
#
enabled: true
namespace: kubeaddons
dashboards:
label: grafana_dashboard
4 changes: 3 additions & 1 deletion staging/nvidia/charts/nvidia-dcgm-exporter/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ apiVersion: v1
appVersion: "0.1.0"
description: Nvidia GPU Metrics Exporter
name: nvidia-dcgm-exporter
version: 0.1.0
version: 0.1.1
maintainers:
- name: gilbert88
42 changes: 23 additions & 19 deletions staging/nvidia/charts/nvidia-dcgm-exporter/templates/daemonset.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
{{- if .Values.enabled }}
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ template "nvidia-dcgm-exporter.fullname" . }}
namespace: {{ $.Values.namespace }}
namespace: {{ .Release.Namespace }}
spec:
selector:
matchLabels:
Expand All @@ -19,49 +18,54 @@ spec:
{{- end }}
{{- if .Values.tolerations }}
tolerations:
{{ toYaml .Values.tolerations | indent 8 }}
{{- toYaml .Values.tolerations | indent 8 }}
{{- end }}
{{- if .Values.nodeSelector }}
nodeSelector:
{{- toYaml .Values.nodeSelector | indent 8 }}
{{- end }}
{{- if .Values.priorityClassName }}
priorityClassName: {{ .Values.priorityClassName }}
{{- end }}
containers:
- image: "{{ .Values.podNvidiaGpuMetricsExporter.image.repository }}:{{ .Values.podNvidiaGpuMetricsExporter.image.tag }}"
name: pod-nvidia-gpu-metrics-exporter
- name: pod-nvidia-gpu-metrics-exporter
image: "{{ .Values.podNvidiaGpuMetricsExporter.image.repository }}:{{ .Values.podNvidiaGpuMetricsExporter.image.tag }}"
imagePullPolicy: {{ .Values.podNvidiaGpuMetricsExporter.image.pullPolicy }}
{{- if .Values.podNvidiaGpuMetricsExporter.resources }}
resources:
{{ toYaml .Values.podNvidiaGpuMetricsExporter.resources | indent 12 }}
{{- end }}
ports:
- name: gpu-metrics
containerPort: 9400
hostPort: 9400
securityContext:
runAsNonRoot: false
runAsUser: 0
imagePullPolicy: {{ .Values.podNvidiaGpuMetricsExporter.image.pullPolicy }}
resources:
{{ toYaml .Values.podNvidiaGpuMetricsExporter.resources | indent 12 }}
volumeMounts:
- name: pod-gpu-resources
readOnly: true
mountPath: /var/lib/kubelet/pod-resources
- name: device-metrics
readOnly: true
mountPath: /run/prometheus
- image: "{{ .Values.nvidiaDcgmExporter.image.repository }}:{{ .Values.nvidiaDcgmExporter.image.tag }}"
name: nvidia-dcgm-exporter
- name: nvidia-dcgm-exporter
image: "{{ .Values.nvidiaDcgmExporter.image.repository }}:{{ .Values.nvidiaDcgmExporter.image.tag }}"
imagePullPolicy: {{ .Values.nvidiaDcgmExporter.image.pullPolicy }}
{{- if .Values.nvidiaDcgmExporter.resources }}
resources:
{{- toYaml .Values.nvidiaDcgmExporter.resources | indent 12 }}
{{- end }}
securityContext:
runAsNonRoot: false
runAsUser: 0
imagePullPolicy: {{ .Values.nvidiaDcgmExporter.image.pullPolicy }}
resources:
{{ toYaml .Values.nvidiaDcgmExporter.resources | indent 12 }}
volumeMounts:
- name: device-metrics
mountPath: /run/prometheus
{{- if .Values.nodeSelector }}
nodeSelector:
{{ toYaml .Values.nodeSelector | indent 8 }}
{{- end }}
volumes:
- name: pod-gpu-resources
hostPath:
path: /var/lib/kubelet/pod-resources
- name: device-metrics
emptyDir:
medium: Memory
{{- end }}
medium: Memory
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ spec:
- name: wget
image: busybox
command: ['wget']
args: ['{{ include "nvidia-dcgm-exporter.fullname" . }}:{{ .Values.service.port }}']
args: ['{{ include "nvidia-dcgm-exporter.fullname" . }}:80']
restartPolicy: Never
28 changes: 0 additions & 28 deletions staging/nvidia/charts/nvidia-dcgm-exporter/values.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,3 @@
# Default values for nvidia-dcgm-exporter.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

enabled: true

namespace: kubeaddons

replicaCount: 1

podNvidiaGpuMetricsExporter:
image:
repository: nvidia/pod-gpu-metrics-exporter
Expand All @@ -22,24 +12,6 @@ nvidiaDcgmExporter:
pullPolicy: IfNotPresent
resources: {}

service:
type: ClusterIP
port: 80

ingress:
enabled: false
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
hosts:
- host: chart-example.local
paths: []

tls: []
# - secretName: chart-example-tls
# hosts:
# - chart-example.local

nodeSelector: {}

priorityClassName: ""
Expand Down
2 changes: 1 addition & 1 deletion staging/nvidia/charts/nvidia-device-plugin/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ name: nvidia-device-plugin
home: https://github.com/mesosphere/charts
appVersion: "1.0"
description: Nvidia Device Plugin for konvoy
version: 0.1.1
version: 0.1.2
maintainers:
- name: gilbert88
21 changes: 0 additions & 21 deletions staging/nvidia/charts/nvidia-device-plugin/templates/NOTES.txt

This file was deleted.

Loading

0 comments on commit 5dbdd66

Please sign in to comment.