Skip to content

Commit

Permalink
feat(t8s-cluster/management-cluster): add support for gpu nodes (#499)
Browse files Browse the repository at this point in the history
  • Loading branch information
cwrau authored Aug 25, 2023
1 parent 10759bf commit 7552b66
Show file tree
Hide file tree
Showing 18 changed files with 175 additions and 66 deletions.
5 changes: 5 additions & 0 deletions charts/t8s-cluster/ci/gpu-flavor-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CI values: a single worker pool ("test") whose flavor name contains "gpu".
workers:
  test:
    replicas: 1
    availabilityZone: Zone1
    flavor: standard-gpu.2.1905
12 changes: 11 additions & 1 deletion charts/t8s-cluster/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,14 @@ sourceRef:
kind: HelmRepository
name: {{ printf "%s-%s" .context.Release.Name .repo | quote }}
namespace: {{ .context.Release.Namespace }}
{{- end -}}
{{- end -}}

{{- /*
t8s-cluster.hasGPUNodes

Renders the string "true" when at least one entry of .Values.workers has a
flavor whose lower-cased name contains "gpu"; renders "false" otherwise.

Expects the chart root context. Because a named template can only produce
text, callers compare the rendered output against the string "true".
*/ -}}
{{- define "t8s-cluster.hasGPUNodes" -}}
{{- $gpuFound := false -}}
{{- range $_, $worker := .Values.workers -}}
  {{- if $worker.flavor | lower | contains "gpu" -}}
    {{- $gpuFound = true -}}
  {{- end -}}
{{- end -}}
{{- $gpuFound -}}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ spec:
ref:
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: KubeadmConfigTemplate
name: {{ printf "%s-worker" $.Release.Name }}
{{- /* Lower-case the flavor before matching so the referenced name agrees with the case-insensitive GPU checks that decide which KubeadmConfigTemplates are rendered. */}}
name: {{ printf "%s-%s-worker" $.Release.Name ($machineDeploymentClass.flavor | lower | contains "gpu" | ternary "gpu" "standard") }}
infrastructure:
ref:
apiVersion: {{ include "t8s-cluster.clusterClass.infrastructureApiVersion" $ }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,31 @@
{{- /*
t8s-cluster.clusterClass.containerdConfig.plugins

Renders the containerd "plugins" TOML section.

Expects a dict with:
  - context: the chart root context; its .Values is copied onto the dict so
             the body can read .Values directly
  - gpu:     when truthy, sets "runc" as the default runtime and adds the
             "runc" and "nvidia" runtime tables

The registry config_path entry is only emitted when
.Values.containerRegistryProxy.proxyRegistryEndpoint is set.
*/ -}}
{{- define "t8s-cluster.clusterClass.containerdConfig.plugins" -}}
[plugins]
[plugins."io.containerd.grpc.v1.cri"]
{{- $_ := set . "Values" .context.Values -}}
{{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = "/etc/containerd/registries.conf.d"
{{- end }}
{{- if .gpu }}
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "runc"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
# TODO: this is only needed because of https://github.com/containerd/containerd/issues/5837
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = true
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime"
{{- end -}}
{{- end -}}

{{- define "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" -}}
- content: |-
[plugins]
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = "/etc/containerd/registries.conf.d"
path: /etc/containerd/conf.d/teuto-mirror.toml
{{- $_ := set . "Values" .context.Values -}}
{{- $defaultProxiedRegistries := list
"docker.io"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{{- /*
t8s-cluster.clusterClass.kubeadmConfigTemplate.spec

Renders the spec body of a worker KubeadmConfigTemplate: the join
configuration, the files written to the node, and the pre/post kubeadm
commands.

Expects a dict with:
  - context: the chart root context; its .Values is copied onto the dict
  - gpu:     forwarded to the containerd plugins template

The CA bundle file and the update-ca-certificates pre-command are only
emitted when .Values.global.injectedCertificateAuthorities is set.
*/ -}}
{{- define "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" -}}
{{- $_ := set . "Values" .context.Values -}}
{{- $ctx := .context -}}
joinConfiguration:
  nodeRegistration:
    kubeletExtraArgs: {{- include "t8s-cluster.clusterClass.kubeletExtraArgs" $ctx | nindent 6 }}
    {{- /* The inner braces are emitted literally; they are not evaluated by Helm. */}}
    name: '{{ `{{ local_hostname }}` }}'
  patches:
    directory: /etc/kubernetes/patches
files: {{- include "t8s-cluster.patches.kubelet.patches" $ctx | nindent 2 }}
  {{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
  {{- include "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" (dict "context" $ctx) | nindent 2 }}
  {{- end }}
  - content: |- {{- include "t8s-cluster.clusterClass.containerdConfig.plugins" (dict "context" $ctx "gpu" .gpu) | nindent 6 }}
    path: /etc/containerd/conf.d/plugins.toml
  {{- with .Values.global.injectedCertificateAuthorities }}
  - content: |- {{- . | nindent 6 }}
    path: /usr/local/share/ca-certificates/injected-ca-certs.crt
  {{- end }}
{{ if .Values.global.injectedCertificateAuthorities }}
preKubeadmCommands:
  - update-ca-certificates
{{- end }}
postKubeadmCommands: {{- include "t8s-cluster.clusterClass.postKubeadmCommands" (dict) | nindent 2 }}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{{- /*
t8s-cluster.clusterClass.kubeadmConfigTemplate

Renders a complete KubeadmConfigTemplate manifest for worker nodes.

Expects a dict with:
  - context: the chart root context (release name/namespace, labels)
  - gpu:     boolean; selects the "<release>-gpu-worker" name when true and
             "<release>-standard-worker" otherwise, and is forwarded to the
             spec template
*/ -}}
{{- define "t8s-cluster.clusterClass.kubeadmConfigTemplate" -}}
{{- $workerKind := ternary "gpu" "standard" .gpu -}}
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: KubeadmConfigTemplate
metadata:
  name: {{ printf "%s-%s-worker" .context.Release.Name $workerKind }}
  namespace: {{ .context.Release.Namespace }}
  labels: {{- include "common.labels.standard" .context | nindent 4 }}
spec:
  template:
    spec: {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" (dict "context" .context "gpu" .gpu) | nindent 6 }}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{{- /* Render the GPU worker KubeadmConfigTemplate only when some worker pool uses a GPU flavor. */ -}}
{{- if include "t8s-cluster.hasGPUNodes" $ | eq "true" }}
{{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate" (dict "context" $ "gpu" true) -}}
{{- end }}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{{- /*
Render the standard (non-GPU) worker KubeadmConfigTemplate only when at least
one worker pool has a flavor whose lower-cased name does not contain "gpu".
*/ -}}
{{- $needsStandardTemplate := false -}}
{{- range $_, $worker := .Values.workers }}
  {{- if $worker.flavor | lower | contains "gpu" | not -}}
    {{- $needsStandardTemplate = true -}}
  {{- end -}}
{{- end }}
{{- if $needsStandardTemplate }}
{{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate" (dict "context" $ "gpu" false) -}}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ clusterConfiguration:
authorization-always-allow-paths: /healthz,/readyz,/livez,/metrics
bind-address: 0.0.0.0
profiling: 'false'
files: {{- include "t8s-cluster.patches.kubelet.patches" . | nindent 2 }}
files: {{- include "t8s-cluster.patches.kubelet.patches" $ | nindent 2 }}
- content: |- {{- .Files.Get "files/admission-control-config.yaml" | nindent 6 }}
path: *admissionControlConfigFilePath
- content: |- {{- .Files.Get "files/event-rate-limit-config.yaml" | nindent 6 }}
Expand All @@ -49,6 +49,8 @@ files: {{- include "t8s-cluster.patches.kubelet.patches" . | nindent 2 }}
{{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
{{- include "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" (dict "context" $) | nindent 2 }}
{{- end }}
- content: |- {{- include "t8s-cluster.clusterClass.containerdConfig.plugins" (dict "context" $ "gpu" false) | nindent 6 }}
path: /etc/containerd/conf.d/plugins.toml
{{- if .Values.global.injectedCertificateAuthorities }}
- content: |- {{- .Values.global.injectedCertificateAuthorities | nindent 6 }}
path: /usr/local/share/ca-certificates/injected-ca-certs.crt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@
{{- $patches := list (include "t8s-cluster.patches.kubelet.default" . | fromYaml) -}}
{{- if and (eq (int .Values.version.major) 1) (ge (int .Values.version.minor) 27) (gt (int .Values.global.kubeletExtraConfig.maxParallelImagePulls) 1) -}}
{{- $patches = append $patches (include "t8s-cluster.patches.kubelet.imagePulls" . | fromYaml) -}}
{{- end }}
{{- end -}}
{{- $patches | toYaml -}}
{{- end -}}

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{{- /*
Emit one Flux HelmRepository per entry of .Values.global.helmRepositories.

An entry without a `condition` is always rendered. An entry with a
`condition` is rendered only when that condition, evaluated as a template
against a deep copy of the root context, yields the string "true".
*/ -}}
{{- range $repoName, $repo := .Values.global.helmRepositories }}
{{- $enabled := true -}}
{{- with $repo.condition -}}
  {{- $enabled = include "common.tplvalues.render" (dict "value" . "context" (deepCopy $)) | eq "true" -}}
{{- end -}}
{{- if $enabled -}}
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
  name: {{ printf "%s-%s" $.Release.Name $repoName | quote }}
  namespace: {{ $.Release.Namespace }}
  labels: {{- include "common.labels.standard" $ | nindent 4 }}
spec:
  interval: 5m
  url: {{ $repo.url }}
  {{- if hasPrefix "oci://" $repo.url }}
  type: oci
  {{- end }}
---
{{ end -}}
{{- end }}
38 changes: 38 additions & 0 deletions charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{{- /*
Flux HelmRelease that installs the NVIDIA gpu-operator chart into the
workload cluster (via the "<release>-kubeconfig" secret). It is rendered only
when at least one worker pool uses a GPU flavor.

The node-feature-discovery topologyUpdater gets a required node affinity on
node.kubernetes.io/instance-type, limited to the GPU flavors found in
.Values.workers.
*/ -}}
{{- if eq (include "t8s-cluster.hasGPUNodes" .) "true" }}
{{- /* Collect the GPU flavors up front; several pools may share one flavor, so the list is de-duplicated where it is rendered below. */ -}}
{{- $gpuFlavours := list -}}
{{- range $_, $machineDeploymentClass := .Values.workers -}}
  {{- if contains "gpu" (lower $machineDeploymentClass.flavor) -}}
    {{- $gpuFlavours = append $gpuFlavours $machineDeploymentClass.flavor -}}
  {{- end -}}
{{- end }}
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: {{ printf "%s-gpu-operator" .Release.Name }}
  namespace: {{ .Release.Namespace }}
  labels: {{- include "common.labels.standard" . | nindent 4 }}
spec:
  chart:
    spec: {{- include "t8s-cluster.helm.chartSpec" (dict "repo" "nvidia" "chart" "gpu-operator" "context" $) | nindent 6 }}
  interval: 1h
  kubeConfig:
    secretRef:
      name: {{ .Release.Name }}-kubeconfig
  install:
    remediation:
      retries: -1
  storageNamespace: kube-system
  targetNamespace: kube-system
  releaseName: gpu-operator
  values:
    node-feature-discovery:
      topologyUpdater:
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: node.kubernetes.io/instance-type
                      operator: In
                      values: {{- $gpuFlavours | uniq | toYaml | nindent 24 }}
{{- end }}
11 changes: 11 additions & 0 deletions charts/t8s-cluster/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
"additionalProperties": {
"type": "string"
}
},
"condition": {
"$ref": "#/$defs/condition"
}
},
"additionalProperties": false
Expand Down Expand Up @@ -308,6 +311,14 @@
}
},
"additionalProperties": false
},
"condition": {
"type": "string",
"description": "A condition that decides whether the resource is included. The value is rendered as a template and must render to exactly the string true; any other result excludes the resource",
"examples": [
"{{ true }}",
"{{ eq .Values.global.baseDomain \"teuto.net\" }}"
]
}
}
}
10 changes: 10 additions & 0 deletions charts/t8s-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ global:
url: https://helm.cilium.io
charts:
cilium: 1.x.x
condition: '{{ eq .Values.cni "cilium" }}'
nvidia:
url: https://helm.ngc.nvidia.com/nvidia
charts:
gpu-operator: 23.x.x
condition: '{{ eq (include "t8s-cluster.hasGPUNodes" .) "true" }}'
cloud-provider-openstack:
url: https://kubernetes.github.io/cloud-provider-openstack
cetic:
url: https://cetic.github.io/helm-charts
etcd:
image:
registry: registry.k8s.io
Expand Down

0 comments on commit 7552b66

Please sign in to comment.