Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(t8s-cluster/management-cluster): add support for gpu nodes #499

Merged
merged 4 commits into from
Aug 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions charts/t8s-cluster/ci/gpu-flavor-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CI test values: a single worker pool ("test") that requests a GPU flavor, so
# the chart's GPU-specific templates (gated on a flavor containing "gpu") render.
workers:
test:
replicas: 1
availabilityZone: Zone1
flavor: standard-gpu.2.1905
12 changes: 11 additions & 1 deletion charts/t8s-cluster/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,14 @@ sourceRef:
kind: HelmRepository
name: {{ printf "%s-%s" .context.Release.Name .repo | quote }}
namespace: {{ .context.Release.Namespace }}
{{- end -}}
{{- end -}}

{{- /*
t8s-cluster.hasGPUNodes reports whether any worker pool requests a GPU flavor.

Input:  the chart root context; every entry of `.Values.workers` is inspected.
Output: the literal text "true" or "false" (named templates can only emit
        text), so callers compare the result of `include` against "true".
A flavor counts as a GPU flavor when its lower-cased name contains "gpu".
*/ -}}
{{- define "t8s-cluster.hasGPUNodes" -}}
  {{- $gpuFound := false -}}
  {{- range $poolName, $pool := .Values.workers -}}
    {{- if $pool.flavor | lower | contains "gpu" -}}
      {{- $gpuFound = true -}}
    {{- end -}}
  {{- end -}}
  {{- print $gpuFound -}}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ spec:
ref:
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: KubeadmConfigTemplate
name: {{ printf "%s-worker" $.Release.Name }}
{{- /*
Pick the "gpu" or "standard" worker KubeadmConfigTemplate for this machine
deployment class. The flavor is lower-cased before matching so this reference
uses the same GPU test as "t8s-cluster.hasGPUNodes" and the templates that
create the two KubeadmConfigTemplates; a case-sensitive match here could point
at a template that was never rendered.
*/}}
name: {{ printf "%s-%s-worker" $.Release.Name ($machineDeploymentClass.flavor | lower | contains "gpu" | ternary "gpu" "standard") }}
This conversation was marked as resolved.
Show resolved Hide resolved
infrastructure:
ref:
apiVersion: {{ include "t8s-cluster.clusterClass.infrastructureApiVersion" $ }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,31 @@
{{- /*
t8s-cluster.clusterClass.containerdConfig.plugins renders the containerd CRI
plugin section as TOML.

Expects a dict with:
  - context: the chart root context; its `.Values` is copied onto the passed
             dict under the key "Values" (the dict is mutated via `set`).
  - gpu:     when truthy, additionally configures "runc" as the default runtime
             and registers an "nvidia" runtime whose binary is
             /usr/local/nvidia/toolkit/nvidia-container-runtime.

When `.Values.containerRegistryProxy.proxyRegistryEndpoint` is set, the registry
`config_path` is pointed at /etc/containerd/registries.conf.d.
*/ -}}
{{- define "t8s-cluster.clusterClass.containerdConfig.plugins" -}}
[plugins]
[plugins."io.containerd.grpc.v1.cri"]
{{- $_ := set . "Values" .context.Values -}}
{{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = "/etc/containerd/registries.conf.d"
{{- end }}
{{- if .gpu }}
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "runc"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
# TODO: this is only needed because of https://github.com/containerd/containerd/issues/5837
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = true
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/local/nvidia/toolkit/nvidia-container-runtime"
{{- end -}}
{{- end -}}

{{- define "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" -}}
- content: |-
[plugins]
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = "/etc/containerd/registries.conf.d"
path: /etc/containerd/conf.d/teuto-mirror.toml
{{- $_ := set . "Values" .context.Values -}}
{{- $defaultProxiedRegistries := list
"docker.io"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{{- /*
t8s-cluster.clusterClass.kubeadmConfigTemplate.spec renders the body of a worker
kubeadm config template spec.

Expects a dict with:
  - context: the chart root context; its `.Values` is copied onto the passed
             dict under the key "Values" (the dict is mutated via `set`).
  - gpu:     forwarded to the containerd plugins helper to select the GPU
             runtime configuration.

Rendered sections:
  - joinConfiguration: kubelet extra args, the node name and the kubelet
    patches directory. The node name is emitted as the literal text
    `{{ local_hostname }}` (raw string, not evaluated by Helm) — presumably
    substituted at bootstrap time; confirm against the bootstrap provider.
  - files: kubelet patches, the registry proxy configs (only when
    `containerRegistryProxy.proxyRegistryEndpoint` is set), the containerd
    plugins.toml, and the injected CA bundle (only when
    `global.injectedCertificateAuthorities` is set).
  - preKubeadmCommands: `update-ca-certificates`, only when injected CAs are set.
  - postKubeadmCommands: output of the postKubeadmCommands helper.
*/ -}}
{{- define "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" -}}
{{- $_ := set . "Values" .context.Values -}}
joinConfiguration:
nodeRegistration:
kubeletExtraArgs: {{- include "t8s-cluster.clusterClass.kubeletExtraArgs" .context | nindent 6 }}
name: '{{ `{{ local_hostname }}` }}'
patches:
directory: /etc/kubernetes/patches
files: {{- include "t8s-cluster.patches.kubelet.patches" .context | nindent 2 }}
{{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
{{- include "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" (dict "context" .context) | nindent 2 }}
{{- end }}
- content: |- {{- include "t8s-cluster.clusterClass.containerdConfig.plugins" (dict "context" .context "gpu" .gpu) | nindent 6 }}
path: /etc/containerd/conf.d/plugins.toml
{{- if .Values.global.injectedCertificateAuthorities }}
- content: |- {{- .Values.global.injectedCertificateAuthorities | nindent 6 }}
path: /usr/local/share/ca-certificates/injected-ca-certs.crt
{{- end }}
{{ if .Values.global.injectedCertificateAuthorities }}
preKubeadmCommands:
- update-ca-certificates
{{- end }}
postKubeadmCommands: {{- include "t8s-cluster.clusterClass.postKubeadmCommands" (dict) | nindent 2 }}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{{- /*
t8s-cluster.clusterClass.kubeadmConfigTemplate renders a complete
KubeadmConfigTemplate manifest for worker nodes.

Expects a dict with:
  - context: the chart root context (used for release name, namespace, labels).
  - gpu:     boolean; selects the name suffix — "<release>-gpu-worker" when
             true, "<release>-standard-worker" otherwise — and is forwarded to
             the spec helper.
*/ -}}
{{- define "t8s-cluster.clusterClass.kubeadmConfigTemplate" -}}
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: KubeadmConfigTemplate
metadata:
name: {{ printf "%s-%s-worker" .context.Release.Name (.gpu | ternary "gpu" "standard") }}
namespace: {{ .context.Release.Namespace }}
labels: {{- include "common.labels.standard" .context | nindent 4 }}
spec:
template:
spec: {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate.spec" (dict "gpu" .gpu "context" .context) | nindent 6 }}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{{- /*
Emit the GPU worker KubeadmConfigTemplate only when at least one worker pool
uses a GPU flavor. "t8s-cluster.hasGPUNodes" yields the text "true" or "false",
hence the string comparison.
*/ -}}
{{- if include "t8s-cluster.hasGPUNodes" $ | eq "true" }}
  {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate" (dict "context" $ "gpu" true) -}}
{{- end }}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{{- /*
Emit the standard (non-GPU) worker KubeadmConfigTemplate only when at least one
worker pool uses a flavor whose lower-cased name does not contain "gpu".
*/ -}}
{{- $needsStandardTemplate := false -}}
{{- range $poolName, $pool := .Values.workers }}
  {{- if $pool.flavor | lower | contains "gpu" | not -}}
    {{- $needsStandardTemplate = true -}}
  {{- end -}}
{{- end }}
{{- if $needsStandardTemplate }}
  {{- include "t8s-cluster.clusterClass.kubeadmConfigTemplate" (dict "context" $ "gpu" false) -}}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ clusterConfiguration:
authorization-always-allow-paths: /healthz,/readyz,/livez,/metrics
bind-address: 0.0.0.0
profiling: 'false'
files: {{- include "t8s-cluster.patches.kubelet.patches" . | nindent 2 }}
files: {{- include "t8s-cluster.patches.kubelet.patches" $ | nindent 2 }}
- content: |- {{- .Files.Get "files/admission-control-config.yaml" | nindent 6 }}
path: *admissionControlConfigFilePath
- content: |- {{- .Files.Get "files/event-rate-limit-config.yaml" | nindent 6 }}
Expand All @@ -49,6 +49,8 @@ files: {{- include "t8s-cluster.patches.kubelet.patches" . | nindent 2 }}
{{- if .Values.containerRegistryProxy.proxyRegistryEndpoint }}
{{- include "t8s-cluster.clusterClass.containerdConfig.containerRegistryProxyConfigs" (dict "context" $) | nindent 2 }}
{{- end }}
- content: |- {{- include "t8s-cluster.clusterClass.containerdConfig.plugins" (dict "context" $ "gpu" false) | nindent 6 }}
path: /etc/containerd/conf.d/plugins.toml
{{- if .Values.global.injectedCertificateAuthorities }}
- content: |- {{- .Values.global.injectedCertificateAuthorities | nindent 6 }}
path: /usr/local/share/ca-certificates/injected-ca-certs.crt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@
{{- $patches := list (include "t8s-cluster.patches.kubelet.default" . | fromYaml) -}}
{{- if and (eq (int .Values.version.major) 1) (ge (int .Values.version.minor) 27) (gt (int .Values.global.kubeletExtraConfig.maxParallelImagePulls) 1) -}}
{{- $patches = append $patches (include "t8s-cluster.patches.kubelet.imagePulls" . | fromYaml) -}}
{{- end }}
{{- end -}}
{{- $patches | toYaml -}}
{{- end -}}

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{{- /*
Renders one Flux HelmRepository per entry of `.Values.global.helmRepositories`,
named "<release>-<repository key>".

An entry may carry an optional `condition` string. It is rendered through
"common.tplvalues.render" against a deep copy of the root context, and the
repository is only created when the rendered result is exactly "true".
Entries without a condition are always created.

`type: oci` is added when the repository URL starts with "oci://".
Every rendered document is followed by a `---` separator.
*/ -}}
{{- range $name, $config := .Values.global.helmRepositories }}
{{- $create := true -}}
{{- if $config.condition -}}
{{- $create = eq (include "common.tplvalues.render" (dict "value" $config.condition "context" (deepCopy $))) "true" -}}
{{- end -}}
{{- if $create -}}
apiVersion: source.toolkit.fluxcd.io/v1beta2
kind: HelmRepository
metadata:
name: {{ printf "%s-%s" $.Release.Name $name | quote }}
namespace: {{ $.Release.Namespace }}
labels: {{- include "common.labels.standard" $ | nindent 4 }}
spec:
interval: 5m
url: {{ $config.url }}
{{- if $config.url | hasPrefix "oci://" }}
type: oci
{{- end }}
---
{{ end -}}
{{- end }}
38 changes: 38 additions & 0 deletions charts/t8s-cluster/templates/workload-cluster/gpu-operator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{{- /*
Renders the Flux HelmRelease "<release>-gpu-operator", only when
"t8s-cluster.hasGPUNodes" reports "true" (at least one worker flavor contains
"gpu").

The release uses the kubeconfig secret "<release>-kubeconfig" and installs the
"gpu-operator" chart from the "nvidia" repository into kube-system under the
release name "gpu-operator".

Chart values: the node-feature-discovery topologyUpdater gets a required node
affinity on `node.kubernetes.io/instance-type`, restricted to the flavors of all
worker pools whose lower-cased flavor contains "gpu". The list is built by the
range loop below and may contain the same flavor more than once when several
pools share it.
*/ -}}
{{- if eq (include "t8s-cluster.hasGPUNodes" .) "true" }}
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: {{ printf "%s-gpu-operator" .Release.Name }}
namespace: {{ .Release.Namespace}}
labels: {{- include "common.labels.standard" . | nindent 4 }}
spec:
chart:
spec: {{- include "t8s-cluster.helm.chartSpec" (dict "repo" "nvidia" "chart" "gpu-operator" "context" $) | nindent 6 }}
interval: 1h
kubeConfig:
secretRef:
name: {{ .Release.Name }}-kubeconfig
install:
remediation:
retries: -1
storageNamespace: kube-system
targetNamespace: kube-system
releaseName: gpu-operator
values:
node-feature-discovery:
topologyUpdater:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node.kubernetes.io/instance-type
operator: In
{{- $gpuFlavours := list }}
{{- range $_, $machineDeploymentClass := .Values.workers -}}
{{- if contains "gpu" (lower $machineDeploymentClass.flavor) -}}
{{- $gpuFlavours = append $gpuFlavours $machineDeploymentClass.flavor -}}
{{- end -}}
{{- end }}
values: {{- $gpuFlavours | toYaml | nindent 24 }}
{{- end }}
11 changes: 11 additions & 0 deletions charts/t8s-cluster/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
"additionalProperties": {
"type": "string"
}
},
"condition": {
"$ref": "#/$defs/condition"
}
},
"additionalProperties": false
Expand Down Expand Up @@ -308,6 +311,14 @@
}
},
"additionalProperties": false
},
"condition": {
"type": "string",
"description": "A condition with which to decide to include the resource. This will be templated. Must return a truthy value",
"examples": [
"{{ true }}",
"{{ eq .Values.global.baseDomain \"teuto.net\" }}"
]
}
}
}
10 changes: 10 additions & 0 deletions charts/t8s-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ global:
url: https://helm.cilium.io
charts:
cilium: 1.x.x
condition: '{{ eq .Values.cni "cilium" }}'
nvidia:
url: https://helm.ngc.nvidia.com/nvidia
charts:
gpu-operator: 23.x.x
condition: '{{ eq (include "t8s-cluster.hasGPUNodes" .) "true" }}'
cloud-provider-openstack:
url: https://kubernetes.github.io/cloud-provider-openstack
cetic:
url: https://cetic.github.io/helm-charts
etcd:
image:
registry: registry.k8s.io
Expand Down