From 1ac31fb30af91715db5cdf5cfb38ce7eb82ec8c9 Mon Sep 17 00:00:00 2001
From: Chris Werner Rau
Date: Tue, 22 Aug 2023 14:30:51 +0200
Subject: [PATCH 1/4] feat(t8s-cluster/management-cluster): add support for gpu nodes

create new kubeadmConfigTemplate for gpu nodes
add gpu-operator HelmRelease
---
Review notes (text between `---` and the first diff is not part of the commit):
- clusterClass.yaml: the bootstrap ref now lower-cases the flavor before the
  "gpu" substring test, the same way t8s-cluster.hasGPUNodes and the standard
  worker template do. Without it a flavor such as "Standard-GPU.1" renders only
  the gpu KubeadmConfigTemplate while the ClusterClass points at the standard
  one. The blob ids on that diff's `index` line were not regenerated.
- `if (include "t8s-cluster.hasGPUNodes" .)` tests the rendered string
  ("true"/"false"), which is never empty, so the guard always passes in this
  patch; PATCH 2/4 replaces it with an explicit comparison against "true".
- Indentation inside the hunks was reconstructed from the YAML/JSON structure
  and the nindent arguments - TODO confirm against the original commit.

 charts/t8s-cluster/ci/gpu-flavor-values.yaml  |  5 +++
 charts/t8s-cluster/templates/_helpers.tpl     | 12 ++++++-
 .../clusterClass/clusterClass.yaml            |  2 +-
 .../kubeadmConfigTemplate/_helpers.tpl        | 32 ++++++++++++++++---
 .../_kubeadmConfigTemplateSpec.yaml           | 24 ++++++++++++++
 .../gpuWorkerKubeadmConfigTemplate.yaml       | 11 +++++++
 .../kubeadmConfigTemplate.yaml                | 28 ----------------
 .../standardWorkerKubeadmConfigTemplate.yaml  | 17 ++++++++++
 .../_kubeadmControlPlaneTemplateSpec.yaml     |  4 ++-
 .../clusterClass/patches/_kubelet.tpl         |  2 +-
 .../repositories/cetic.yaml                   |  9 ------
 .../cloud-provider-openstack.yaml             |  9 ------
 .../repositories/cni-cilium.yaml              | 11 -------
 .../repositories/helmRepositories.yaml        | 21 ++++++++++++
 .../workload-cluster/gpu-operator.yaml        | 21 ++++++++++++
 charts/t8s-cluster/values.schema.json         | 11 +++++++
 charts/t8s-cluster/values.yaml                | 10 ++++++
 17 files changed, 163 insertions(+), 66 deletions(-)
 create mode 100644 charts/t8s-cluster/ci/gpu-flavor-values.yaml
 create mode 100644 charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_kubeadmConfigTemplateSpec.yaml
 create mode 100644 charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
 delete mode 100644 charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/kubeadmConfigTemplate.yaml
 create mode 100644 charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/standardWorkerKubeadmConfigTemplate.yaml
 delete mode 100644 charts/t8s-cluster/templates/management-cluster/repositories/cetic.yaml
 delete mode 100644 charts/t8s-cluster/templates/management-cluster/repositories/cloud-provider-openstack.yaml
 delete mode 100644 charts/t8s-cluster/templates/management-cluster/repositories/cni-cilium.yaml
 create mode 100644 charts/t8s-cluster/templates/management-cluster/repositories/helmRepositories.yaml
 create mode 100644 charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml

diff --git a/charts/t8s-cluster/ci/gpu-flavor-values.yaml b/charts/t8s-cluster/ci/gpu-flavor-values.yaml
new file mode 100644
index 000000000..0551177a4
--- /dev/null
+++ b/charts/t8s-cluster/ci/gpu-flavor-values.yaml
@@ -0,0 +1,5 @@
+workers:
+  test:
+    replicas: 1
+    availabilityZone: Zone1
+    flavor: standard-gpu.2.1905
diff --git a/charts/t8s-cluster/templates/_helpers.tpl b/charts/t8s-cluster/templates/_helpers.tpl
index 64f5ce1dd..f96ce5b3c 100644
--- a/charts/t8s-cluster/templates/_helpers.tpl
+++ b/charts/t8s-cluster/templates/_helpers.tpl
@@ -19,4 +19,14 @@ sourceRef:
   kind: HelmRepository
   name: {{ printf "%s-%s" .context.Release.Name .repo | quote }}
   namespace: {{ .context.Release.Namespace }}
-{{- end -}}
\ No newline at end of file
+{{- end -}}
+
+{{- define "t8s-cluster.hasGPUNodes" -}}
+{{- $hasGPUFlavor := false -}}
+{{- range $name, $machineDeploymentClass := .Values.workers -}}
+  {{- if contains "gpu" (lower $machineDeploymentClass.flavor) -}}
+    {{- $hasGPUFlavor = true -}}
+  {{- end -}}
+{{- end -}}
+{{- $hasGPUFlavor -}}
+{{- end -}}
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/clusterClass.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/clusterClass.yaml
index ca6072361..7ab586c88 100644
--- a/charts/t8s-cluster/templates/management-cluster/clusterClass/clusterClass.yaml
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/clusterClass.yaml
@@ -178,7 +178,7 @@ spec:
             ref:
               apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
               kind: KubeadmConfigTemplate
-              name: {{ printf "%s-worker" $.Release.Name }}
+              name: {{ printf "%s-%s-worker" $.Release.Name (contains "gpu" (lower $machineDeploymentClass.flavor) | ternary "gpu" "standard") }}
           infrastructure:
             ref:
               apiVersion: {{ include "t8s-cluster.clusterClass.infrastructureApiVersion" $ }}
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_helpers.tpl b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_helpers.tpl
index e90e414ca..a8b2007a2 100644
--- a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_helpers.tpl
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_helpers.tpl
@@ -1,9 +1,31 @@
+{{- define "t8s-cluster.clusterClass.containerdConfig.plugins" -}}
+[plugins]
+  [plugins."io.containerd.grpc.v1.cri"]
+    {{- $_ := set . "Values" .context.Values -}}
+    {{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
+    [plugins."io.containerd.grpc.v1.cri".registry]
+      config_path = "/etc/containerd/registries.conf.d"
+    {{- end }}
+    {{- if .gpu }}
+    [plugins."io.containerd.grpc.v1.cri".containerd]
+      default_runtime_name = "runc"
+      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
+        # TODO: this is only needed because of https://github.com/containerd/containerd/issues/5837
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
+          runtime_type = "io.containerd.runc.v2"
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+            SystemdCgroup = true
+        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
+          privileged_without_host_devices = false
+          runtime_engine = ""
+          runtime_root = ""
+          runtime_type = "io.containerd.runc.v2"
+          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
+            BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime"
+    {{- end -}}
+{{- end -}}
+
 {{- define "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" -}}
-- content: |-
-    [plugins]
-      [plugins."io.containerd.grpc.v1.cri".registry]
-        config_path = "/etc/containerd/registries.conf.d"
-  path: /etc/containerd/conf.d/teuto-mirror.toml
   {{- $_ := set . "Values" .context.Values -}}
   {{- $defaultProxiedRegistries := list
     "docker.io"
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_kubeadmConfigTemplateSpec.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_kubeadmConfigTemplateSpec.yaml
new file mode 100644
index 000000000..413406d95
--- /dev/null
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_kubeadmConfigTemplateSpec.yaml
@@ -0,0 +1,24 @@
+{{- define "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" -}}
+  {{- $_ := set . "Values" .context.Values -}}
+joinConfiguration:
+  nodeRegistration:
+    kubeletExtraArgs: {{- include "t8s-cluster.clusterClass.kubeletExtraArgs" .context | nindent 6 }}
+    name: '{{ `{{ local_hostname }}` }}'
+  patches:
+    directory: /etc/kubernetes/patches
+files: {{- include "t8s-cluster.patches.kubelet.patches" .context | nindent 2 }}
+  {{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
+    {{- include "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" (dict "context" .context) | nindent 2 }}
+  {{- end }}
+  - content: |- {{- include "t8s-cluster.clusterClass.containerdConfig.plugins" (dict "context" .context "gpu" .gpu) | nindent 6 }}
+    path: /etc/containerd/conf.d/plugins.toml
+  {{- if .Values.global.injectedCertificateAuthorities }}
+  - content: |- {{- .Values.global.injectedCertificateAuthorities | nindent 6 }}
+    path: /usr/local/share/ca-certificates/injected-ca-certs.crt
+  {{- end }}
+  {{ if .Values.global.injectedCertificateAuthorities }}
+preKubeadmCommands:
+  - update-ca-certificates
+  {{- end }}
+postKubeadmCommands: {{- include "t8s-cluster.clusterClass.postKubeadmCommands" (dict) | nindent 2 }}
+{{- end -}}
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
new file mode 100644
index 000000000..396210832
--- /dev/null
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
@@ -0,0 +1,11 @@
+{{- if (include "t8s-cluster.hasGPUNodes" .) }}
+apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
+kind: KubeadmConfigTemplate
+metadata:
+  name: {{ printf "%s-gpu-worker" $.Release.Name }}
+  namespace: {{ $.Release.Namespace }}
+  labels: {{- include "common.labels.standard" $ | nindent 4 }}
+spec:
+  template:
+    spec: {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" (dict "gpu" true "context" $) | nindent 6 }}
+{{- end }}
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/kubeadmConfigTemplate.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/kubeadmConfigTemplate.yaml
deleted file mode 100644
index c978cc685..000000000
--- a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/kubeadmConfigTemplate.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
-kind: KubeadmConfigTemplate
-metadata:
-  name: {{ printf "%s-worker" $.Release.Name }}
-  namespace: {{ $.Release.Namespace }}
-  labels: {{- include "common.labels.standard" $ | nindent 4 }}
-spec:
-  template:
-    spec:
-      joinConfiguration:
-        nodeRegistration:
-          kubeletExtraArgs: {{- include "t8s-cluster.clusterClass.kubeletExtraArgs" $ | nindent 12 }}
-          name: '{{ `{{ local_hostname }}` }}'
-        patches:
-          directory: /etc/kubernetes/patches
-      files: {{- include "t8s-cluster.patches.kubelet.patches" . | nindent 8 }}
-        {{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
-          {{- include "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" (dict "context" $) | nindent 8 }}
-        {{- end }}
-        {{- if .Values.global.injectedCertificateAuthorities }}
-        - content: |- {{- .Values.global.injectedCertificateAuthorities | nindent 12 }}
-          path: /usr/local/share/ca-certificates/injected-ca-certs.crt
-        {{- end }}
-      {{ if .Values.global.injectedCertificateAuthorities -}}
-      preKubeadmCommands:
-        - update-ca-certificates
-      {{- end }}
-      postKubeadmCommands: {{- include "t8s-cluster.clusterClass.postKubeadmCommands" (dict) | nindent 8 }}
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/standardWorkerKubeadmConfigTemplate.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/standardWorkerKubeadmConfigTemplate.yaml
new file mode 100644
index 000000000..c467cb49e
--- /dev/null
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/standardWorkerKubeadmConfigTemplate.yaml
@@ -0,0 +1,17 @@
+{{- $hasStandardFlavor := false -}}
+{{- range $name, $machineDeploymentClass := .Values.workers }}
+  {{- if not (contains "gpu" (lower $machineDeploymentClass.flavor)) -}}
+    {{- $hasStandardFlavor = true -}}
+  {{- end -}}
+{{- end }}
+{{- if $hasStandardFlavor }}
+apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
+kind: KubeadmConfigTemplate
+metadata:
+  name: {{ printf "%s-standard-worker" $.Release.Name }}
+  namespace: {{ $.Release.Namespace }}
+  labels: {{- include "common.labels.standard" $ | nindent 4 }}
+spec:
+  template:
+    spec: {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" (dict "gpu" false "context" $) | nindent 6 }}
+{{- end }}
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmnControlPlaneTemplate/_kubeadmControlPlaneTemplateSpec.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmnControlPlaneTemplate/_kubeadmControlPlaneTemplateSpec.yaml
index afb4b7c44..37db3986f 100644
--- a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmnControlPlaneTemplate/_kubeadmControlPlaneTemplateSpec.yaml
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmnControlPlaneTemplate/_kubeadmControlPlaneTemplateSpec.yaml
@@ -36,7 +36,7 @@ clusterConfiguration:
       authorization-always-allow-paths: /healthz,/readyz,/livez,/metrics
       bind-address: 0.0.0.0
       profiling: 'false'
-files: {{- include "t8s-cluster.patches.kubelet.patches" . | nindent 2 }}
+files: {{- include "t8s-cluster.patches.kubelet.patches" $ | nindent 2 }}
   - content: |- {{- .Files.Get "files/admission-control-config.yaml" | nindent 6 }}
     path: *admissionControlConfigFilePath
   - content: |- {{- .Files.Get "files/event-rate-limit-config.yaml" | nindent 6 }}
@@ -49,6 +49,8 @@ files: {{- include "t8s-cluster.patches.kubelet.patches" . | nindent 2 }}
   {{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
     {{- include "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" (dict "context" $) | nindent 2 }}
   {{- end }}
+  - content: |- {{- include "t8s-cluster.clusterClass.containerdConfig.plugins" (dict "context" $ "gpu" false) | nindent 6 }}
+    path: /etc/containerd/conf.d/plugins.toml
   {{- if .Values.global.injectedCertificateAuthorities }}
   - content: |- {{- .Values.global.injectedCertificateAuthorities | nindent 6 }}
     path: /usr/local/share/ca-certificates/injected-ca-certs.crt
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/patches/_kubelet.tpl b/charts/t8s-cluster/templates/management-cluster/clusterClass/patches/_kubelet.tpl
index d3d793de8..2e53994fc 100644
--- a/charts/t8s-cluster/templates/management-cluster/clusterClass/patches/_kubelet.tpl
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/patches/_kubelet.tpl
@@ -16,6 +16,6 @@
   {{- $patches := list (include "t8s-cluster.patches.kubelet.default" . | fromYaml) -}}
   {{- if and (eq (int .Values.version.major) 1) (ge (int .Values.version.minor) 27) (gt (int .Values.global.kubeletExtraConfig.maxParallelImagePulls) 1) -}}
     {{- $patches = append $patches (include "t8s-cluster.patches.kubelet.imagePulls" . | fromYaml) -}}
-  {{- end }}
+  {{- end -}}
   {{- $patches | toYaml -}}
 {{- end -}}
diff --git a/charts/t8s-cluster/templates/management-cluster/repositories/cetic.yaml b/charts/t8s-cluster/templates/management-cluster/repositories/cetic.yaml
deleted file mode 100644
index c544bc788..000000000
--- a/charts/t8s-cluster/templates/management-cluster/repositories/cetic.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-apiVersion: source.toolkit.fluxcd.io/v1beta2
-kind: HelmRepository
-metadata:
-  name: {{ printf "%s-cetic" .Release.Name }}
-  namespace: {{ .Release.Namespace}}
-  labels: {{- include "common.labels.standard" . | nindent 4 }}
-spec:
-  interval: 5m
-  url: https://cetic.github.io/helm-charts
\ No newline at end of file
diff --git a/charts/t8s-cluster/templates/management-cluster/repositories/cloud-provider-openstack.yaml b/charts/t8s-cluster/templates/management-cluster/repositories/cloud-provider-openstack.yaml
deleted file mode 100644
index 5e5a3c0af..000000000
--- a/charts/t8s-cluster/templates/management-cluster/repositories/cloud-provider-openstack.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-apiVersion: source.toolkit.fluxcd.io/v1beta1
-kind: HelmRepository
-metadata:
-  name: {{ printf "%s-cloud-provider-openstack" .Release.Name }}
-  namespace: {{ .Release.Namespace}}
-  labels: {{- include "common.labels.standard" . | nindent 4 }}
-spec:
-  interval: 5m
-  url: https://kubernetes.github.io/cloud-provider-openstack
diff --git a/charts/t8s-cluster/templates/management-cluster/repositories/cni-cilium.yaml b/charts/t8s-cluster/templates/management-cluster/repositories/cni-cilium.yaml
deleted file mode 100644
index ec830b0b3..000000000
--- a/charts/t8s-cluster/templates/management-cluster/repositories/cni-cilium.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-{{- if eq .Values.cni "cilium" }}
-apiVersion: source.toolkit.fluxcd.io/v1beta2
-kind: HelmRepository
-metadata:
-  name: {{ printf "%s-cilium" .Release.Name }}
-  namespace: {{ .Release.Namespace}}
-  labels: {{- include "common.labels.standard" . | nindent 4 }}
-spec:
-  interval: 5m
-  url: https://helm.cilium.io
-{{- end }}
\ No newline at end of file
diff --git a/charts/t8s-cluster/templates/management-cluster/repositories/helmRepositories.yaml b/charts/t8s-cluster/templates/management-cluster/repositories/helmRepositories.yaml
new file mode 100644
index 000000000..f504aa1bc
--- /dev/null
+++ b/charts/t8s-cluster/templates/management-cluster/repositories/helmRepositories.yaml
@@ -0,0 +1,21 @@
+{{- range $name, $config := .Values.global.helmRepositories }}
+{{- $create := true -}}
+{{- if $config.condition -}}
+  {{- $create = eq (include "common.tplvalues.render" (dict "value" $config.condition "context" (deepCopy $))) "true" -}}
+{{- end -}}
+{{- if $create -}}
+apiVersion: source.toolkit.fluxcd.io/v1beta2
+kind: HelmRepository
+metadata:
+  name: {{ printf "%s-%s" $.Release.Name $name | quote }}
+  namespace: {{ $.Release.Namespace }}
+  labels: {{- include "common.labels.standard" $ | nindent 4 }}
+spec:
+  interval: 5m
+  url: {{ $config.url }}
+  {{- if $config.url | hasPrefix "oci://" }}
+  type: oci
+  {{- end }}
+---
+{{ end -}}
+{{- end }}
diff --git a/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml b/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
new file mode 100644
index 000000000..d3731998f
--- /dev/null
+++ b/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
@@ -0,0 +1,21 @@
+{{- if (include "t8s-cluster.hasGPUNodes" .) }}
+apiVersion: helm.toolkit.fluxcd.io/v2beta1
+kind: HelmRelease
+metadata:
+  name: {{ printf "%s-gpu-operator" .Release.Name }}
+  namespace: {{ .Release.Namespace}}
+  labels: {{- include "common.labels.standard" . | nindent 4 }}
+spec:
+  chart:
+    spec: {{- include "t8s-cluster.helm.chartSpec" (dict "repo" "nvidia" "chart" "gpu-operator" "context" $) | nindent 6 }}
+  interval: 1h
+  kubeConfig:
+    secretRef:
+      name: {{ .Release.Name }}-kubeconfig
+  install:
+    remediation:
+      retries: -1
+  storageNamespace: kube-system
+  targetNamespace: kube-system
+  releaseName: gpu-operator
+{{- end }}
diff --git a/charts/t8s-cluster/values.schema.json b/charts/t8s-cluster/values.schema.json
index 4aa21b293..38591cfd3 100644
--- a/charts/t8s-cluster/values.schema.json
+++ b/charts/t8s-cluster/values.schema.json
@@ -21,6 +21,9 @@
               "additionalProperties": {
                 "type": "string"
               }
+            },
+            "condition": {
+              "$ref": "#/$defs/condition"
             }
           },
           "additionalProperties": false
@@ -308,6 +311,14 @@
         }
       },
       "additionalProperties": false
+    },
+    "condition": {
+      "type": "string",
+      "description": "A condition with which to decide to include the resource. This will be templated. Must return a truthy value",
+      "examples": [
+        "{{ true }}",
+        "{{ eq .Values.global.baseDomain \"teuto.net\" }}"
+      ]
     }
   }
 }
diff --git a/charts/t8s-cluster/values.yaml b/charts/t8s-cluster/values.yaml
index b594f9406..273717592 100644
--- a/charts/t8s-cluster/values.yaml
+++ b/charts/t8s-cluster/values.yaml
@@ -5,6 +5,16 @@ global:
       url: https://helm.cilium.io
       charts:
         cilium: 1.x.x
+      condition: '{{ eq .Values.cni "cilium" }}'
+    nvidia:
+      url: https://helm.ngc.nvidia.com/nvidia
+      charts:
+        gpu-operator: 23.x.x
+      condition: '{{ include "t8s-cluster.hasGPUNodes" . }}'
+    cloud-provider-openstack:
+      url: https://kubernetes.github.io/cloud-provider-openstack
+    cetic:
+      url: https://cetic.github.io/helm-charts
   etcd:
     image:
       registry: registry.k8s.io

From ffb6085bd08b64a7b723de3cd1961479e2e81e63 Mon Sep 17 00:00:00 2001
From: Chris Werner Rau
Date: Wed, 23 Aug 2023 09:49:55 +0200
Subject: [PATCH 2/4] fix(t8s-cluster/management-cluster): wrong check for gpu instances

---
 .../kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml   | 2 +-
 charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml | 2 +-
 charts/t8s-cluster/values.yaml                                  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
index 396210832..595043bff 100644
--- a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
@@ -1,4 +1,4 @@
-{{- if (include "t8s-cluster.hasGPUNodes" .) }}
+{{- if eq (include "t8s-cluster.hasGPUNodes" .) "true" }}
 apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
 kind: KubeadmConfigTemplate
 metadata:
diff --git a/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml b/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
index d3731998f..be581ad07 100644
--- a/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
+++ b/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
@@ -1,4 +1,4 @@
-{{- if (include "t8s-cluster.hasGPUNodes" .) }}
+{{- if eq (include "t8s-cluster.hasGPUNodes" .) "true" }}
 apiVersion: helm.toolkit.fluxcd.io/v2beta1
 kind: HelmRelease
 metadata:
diff --git a/charts/t8s-cluster/values.yaml b/charts/t8s-cluster/values.yaml
index 273717592..14f1d7c62 100644
--- a/charts/t8s-cluster/values.yaml
+++ b/charts/t8s-cluster/values.yaml
@@ -10,7 +10,7 @@ global:
       url: https://helm.ngc.nvidia.com/nvidia
       charts:
         gpu-operator: 23.x.x
-      condition: '{{ include "t8s-cluster.hasGPUNodes" . }}'
+      condition: '{{ eq (include "t8s-cluster.hasGPUNodes" .) "true" }}'
     cloud-provider-openstack:
       url: https://kubernetes.github.io/cloud-provider-openstack
     cetic:

From bc91c76b3e7b76f8784f8187125fb4a192d6d974 Mon Sep 17 00:00:00 2001
From: Chris Werner Rau
Date: Wed, 23 Aug 2023 13:39:51 +0200
Subject: [PATCH 3/4] chore(t8s-cluster/management-cluster): deduplicate kubeadmConfigTemplate

---
 .../_workerKubeadmConfigTemplate.yaml         | 11 +++++++++++
 .../gpuWorkerKubeadmConfigTemplate.yaml       | 10 +---------
 .../standardWorkerKubeadmConfigTemplate.yaml  | 10 +---------
 3 files changed, 13 insertions(+), 18 deletions(-)
 create mode 100644 charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_workerKubeadmConfigTemplate.yaml

diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_workerKubeadmConfigTemplate.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_workerKubeadmConfigTemplate.yaml
new file mode 100644
index 000000000..0a6471f31
--- /dev/null
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/_workerKubeadmConfigTemplate.yaml
@@ -0,0 +1,11 @@
+{{- define "t8s-cluster.clusterClass.kubeadmConfigTemplate" -}}
+apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
+kind: KubeadmConfigTemplate
+metadata:
+  name: {{ printf "%s-%s-worker" .context.Release.Name (.gpu | ternary "gpu" "standard") }}
+  namespace: {{ .context.Release.Namespace }}
+  labels: {{- include "common.labels.standard" .context | nindent 4 }}
+spec:
+  template:
+    spec: {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" (dict "gpu" .gpu "context" .context) | nindent 6 }}
+{{- end -}}
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
index 595043bff..a5e88977b 100644
--- a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/gpuWorkerKubeadmConfigTemplate.yaml
@@ -1,11 +1,3 @@
 {{- if eq (include "t8s-cluster.hasGPUNodes" .) "true" }}
-apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
-kind: KubeadmConfigTemplate
-metadata:
-  name: {{ printf "%s-gpu-worker" $.Release.Name }}
-  namespace: {{ $.Release.Namespace }}
-  labels: {{- include "common.labels.standard" $ | nindent 4 }}
-spec:
-  template:
-    spec: {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" (dict "gpu" true "context" $) | nindent 6 }}
+  {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate" (dict "gpu" true "context" $) -}}
 {{- end }}
diff --git a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/standardWorkerKubeadmConfigTemplate.yaml b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/standardWorkerKubeadmConfigTemplate.yaml
index c467cb49e..d5d1bc3c6 100644
--- a/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/standardWorkerKubeadmConfigTemplate.yaml
+++ b/charts/t8s-cluster/templates/management-cluster/clusterClass/kubeadmConfigTemplate/standardWorkerKubeadmConfigTemplate.yaml
@@ -5,13 +5,5 @@
   {{- end -}}
 {{- end }}
 {{- if $hasStandardFlavor }}
-apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
-kind: KubeadmConfigTemplate
-metadata:
-  name: {{ printf "%s-standard-worker" $.Release.Name }}
-  namespace: {{ $.Release.Namespace }}
-  labels: {{- include "common.labels.standard" $ | nindent 4 }}
-spec:
-  template:
-    spec: {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" (dict "gpu" false "context" $) | nindent 6 }}
+  {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate" (dict "gpu" false "context" $) -}}
 {{- end }}

From b044afa24e01999bbe1e68b55d028a9feb3f9737 Mon Sep 17 00:00:00 2001
From: Chris Werner Rau
Date: Wed, 23 Aug 2023 14:33:25 +0200
Subject: [PATCH 4/4] feat(t8s-cluster/workload-cluster): only run node-feature-discovery on gpu nodes

there will never be gpus on other nodes, so we can save some resources
this way
---
Review notes (text between `---` and the first diff is not part of the commit):
- NOTE(review): the node affinity is set under
  node-feature-discovery.topologyUpdater only. Confirm against the chart's
  values whether the component that should be limited to gpu nodes is
  configured through a different key.
- The `values:` list keeps each flavor's original spelling; only the "gpu"
  substring test is done on the lower-cased name.

 .../workload-cluster/gpu-operator.yaml        | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml b/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
index be581ad07..080f30f51 100644
--- a/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
+++ b/charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
@@ -18,4 +18,21 @@ spec:
   storageNamespace: kube-system
   targetNamespace: kube-system
   releaseName: gpu-operator
+  values:
+    node-feature-discovery:
+      topologyUpdater:
+        affinity:
+          nodeAffinity:
+            requiredDuringSchedulingIgnoredDuringExecution:
+              nodeSelectorTerms:
+                - matchExpressions:
+                    - key: node.kubernetes.io/instance-type
+                      operator: In
+                      {{- $gpuFlavours := list }}
+                      {{- range $_, $machineDeploymentClass := .Values.workers -}}
+                        {{- if contains "gpu" (lower $machineDeploymentClass.flavor) -}}
+                          {{- $gpuFlavours = append $gpuFlavours $machineDeploymentClass.flavor -}}
+                        {{- end -}}
+                      {{- end }}
+                      values: {{- $gpuFlavours | toYaml | nindent 24 }}
 {{- end }}