docs and yamls for dependencies
This adds more documentation and yamls for getting dependencies
up and running.

Signed-off-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com>
uniemimu committed Jun 18, 2021
1 parent ab25796 commit 3c514af
Showing 22 changed files with 586 additions and 17 deletions.
8 changes: 4 additions & 4 deletions gpu-aware-scheduling/README.md
Original file line number Diff line number Diff line change
@@ -36,7 +36,7 @@ Note: a shell script that shows these steps can be found [here](deploy/extender-

The extender configuration files can be found under deploy/extender-configuration.
GAS Scheduler Extender needs to be registered with the Kubernetes Scheduler. In order to do this, a configmap should be created like the one below:
-````
+```
apiVersion: v1
kind: ConfigMap
metadata:
@@ -72,14 +72,14 @@ data:
]
}
-````
+```

A similar file can be found [in the deploy folder](./deploy/extender-configuration/scheduler-extender-configmap.yaml). This configmap can be created with ``kubectl apply -f ./deploy/extender-configuration/scheduler-extender-configmap.yaml``.
The scheduler requires flags passed to it in order to know the location of this config map. The flags are:
-````
+```
- --policy-configmap=scheduler-extender-policy
- --policy-configmap-namespace=kube-system
-````
+```

If the scheduler is running as a service, these can be added as flags to the binary. If the scheduler is running as a container - as in kubeadm - these args can be passed in the deployment file.
Note: for kubeadm setups some additional steps may be needed.
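For kubeadm clusters, a sketch of how those flags could be wired into the kube-scheduler static pod manifest (the file path and surrounding fields are illustrative; only the two flags come from the text above):

```
# /etc/kubernetes/manifests/kube-scheduler.yaml (excerpt, illustrative)
spec:
  containers:
  - command:
    - kube-scheduler
    - --policy-configmap=scheduler-extender-policy
    - --policy-configmap-namespace=kube-system
```

After editing the static pod manifest, the kubelet restarts the scheduler automatically.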
13 changes: 13 additions & 0 deletions gpu-aware-scheduling/docs/example/README.md
@@ -0,0 +1,13 @@
This folder has a simple example pod which uses Kubernetes extended resources.

To deploy, you can run in this folder:

```
kubectl apply -f .
```

Then you can check the GPU devices of the first pod in the deployment with:

```
kubectl exec -it deploy/bb-example -- ls /dev/dri
```
23 changes: 23 additions & 0 deletions gpu-aware-scheduling/docs/example/bb_example.yaml
@@ -0,0 +1,23 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: bb-example
spec:
replicas: 1
selector:
matchLabels:
app: bb-example
template:
metadata:
labels:
app: bb-example
spec:
containers:
- name: gpu-resource-request
image: busybox:1.33.1
command: ['sh', '-c', 'echo The gpu resource request app is running! && sleep 6000']
resources:
limits:
gpu.intel.com/i915: 1
gpu.intel.com/millicores: 100
gpu.intel.com/memory.max: 1G
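As a rough sketch of what the millicores request above means, assuming the GAS convention that each physical GPU exposes 1000 millicores (that number is an assumption for illustration, not taken from this commit):

```shell
# Assumed: one GPU card exposes 1000 gpu.intel.com/millicores.
millicores_per_card=1000
# The example pod above requests 100 millicores.
pod_request=100
# How many such pods could share one card's millicore capacity.
echo $(( millicores_per_card / pod_request ))  # prints 10
```

The printed value is the per-card pod count by millicores alone; the i915 and memory.max limits constrain scheduling further.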
8 changes: 8 additions & 0 deletions gpu-aware-scheduling/docs/gpu_plugin/README.md
@@ -0,0 +1,8 @@
This folder has a simple example of how to deploy the Intel GPU plugin with fractional resource support enabled.

To deploy, you can run in this folder:

```
kubectl apply -k overlays/fractional_resources
```
60 changes: 60 additions & 0 deletions gpu-aware-scheduling/docs/gpu_plugin/base/intel-gpu-plugin.yaml
@@ -0,0 +1,60 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: intel-gpu-plugin
labels:
app: intel-gpu-plugin
spec:
selector:
matchLabels:
app: intel-gpu-plugin
template:
metadata:
labels:
app: intel-gpu-plugin
spec:
initContainers:
- name: intel-gpu-initcontainer
image: intel/intel-gpu-initcontainer:devel
imagePullPolicy: IfNotPresent
securityContext:
readOnlyRootFilesystem: true
volumeMounts:
- mountPath: /etc/kubernetes/node-feature-discovery/source.d/
name: nfd-source-hooks
containers:
- name: intel-gpu-plugin
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
image: intel/intel-gpu-plugin:devel
imagePullPolicy: IfNotPresent
securityContext:
readOnlyRootFilesystem: true
volumeMounts:
- name: devfs
mountPath: /dev/dri
readOnly: true
- name: sysfs
mountPath: /sys/class/drm
readOnly: true
- name: kubeletsockets
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: devfs
hostPath:
path: /dev/dri
- name: sysfs
hostPath:
path: /sys/class/drm
- name: kubeletsockets
hostPath:
path: /var/lib/kubelet/device-plugins
- name: nfd-source-hooks
hostPath:
path: /etc/kubernetes/node-feature-discovery/source.d/
type: DirectoryOrCreate
nodeSelector:
kubernetes.io/arch: amd64
2 changes: 2 additions & 0 deletions gpu-aware-scheduling/docs/gpu_plugin/base/kustomization.yaml
@@ -0,0 +1,2 @@
resources:
- intel-gpu-plugin.yaml
2 changes: 2 additions & 0 deletions gpu-aware-scheduling/docs/gpu_plugin/kustomization.yaml
@@ -0,0 +1,2 @@
bases:
- base
@@ -0,0 +1,12 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: intel-gpu-plugin
spec:
template:
spec:
containers:
- name: intel-gpu-plugin
args:
- "-shared-dev-num=300"
- "-resource-manager"
@@ -0,0 +1,17 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: intel-gpu-plugin
spec:
template:
spec:
containers:
- name: intel-gpu-plugin
volumeMounts:
- name: podresources
mountPath: /var/lib/kubelet/pod-resources
volumes:
- name: podresources
hostPath:
path: /var/lib/kubelet/pod-resources

@@ -0,0 +1,8 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: intel-gpu-plugin
spec:
template:
spec:
serviceAccountName: resource-reader-sa
@@ -0,0 +1,10 @@
bases:
- ../../base
resources:
- resource-cluster-role-binding.yaml
- resource-cluster-role.yaml
- resource-reader-sa.yaml
patches:
- add-serviceaccount.yaml
- add-podresource-mount.yaml
- add-args.yaml
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: resource-reader-rb
subjects:
- kind: ServiceAccount
name: resource-reader-sa
namespace: default
roleRef:
kind: ClusterRole
name: resource-reader
apiGroup: rbac.authorization.k8s.io
@@ -0,0 +1,8 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: resource-reader
rules:
- apiGroups: [""]
resources: ["pods"]
verbs: ["list"]
@@ -0,0 +1,4 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: resource-reader-sa
8 changes: 8 additions & 0 deletions gpu-aware-scheduling/docs/nfd/README.md
@@ -0,0 +1,8 @@
This folder has a simple example of how to deploy NFD so that it can create extended resources for GPU Aware Scheduling.

To deploy, you can run in this folder:

```
kubectl apply -k .
```
19 changes: 19 additions & 0 deletions gpu-aware-scheduling/docs/nfd/kustom/env_vars.yaml
@@ -0,0 +1,19 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nfd-worker
spec:
template:
spec:
containers:
- env:
# GPU_MEMORY_OVERRIDE is the value used for GPUs which do not advertise their memory amount via the driver
- name: GPU_MEMORY_OVERRIDE
value: "4000000000"
# GPU_MEMORY_RESERVED is the amount of memory scoped out from k8s for those GPUs which
# do advertise their memory amount via the driver
# - name: GPU_MEMORY_RESERVED
# value: "294967295"
name: nfd-worker

# the env var values propagate to the nfd extension hook (gpu nfd hook, installed by gpu plugin initcontainer)
13 changes: 13 additions & 0 deletions gpu-aware-scheduling/docs/nfd/kustom/external_resources.yaml
@@ -0,0 +1,13 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: nfd-master
spec:
template:
spec:
containers:
- name: nfd-master
command:
- "nfd-master"
- "--resource-labels=gpu.intel.com/memory.max,gpu.intel.com/millicores"
- "--extra-label-ns=gpu.intel.com"
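With these flags, nfd-master turns the matching labels into extended resources in the node status; a sketch of the result (the quantities are illustrative, not from this commit):

```
# Node object excerpt (illustrative values)
status:
  capacity:
    gpu.intel.com/millicores: "1000"
    gpu.intel.com/memory.max: "4000000000"
```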
18 changes: 18 additions & 0 deletions gpu-aware-scheduling/docs/nfd/kustom/rbac.yaml
@@ -0,0 +1,18 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: nfd-master
rules:
- apiGroups:
- ""
resources:
- nodes
# since we are using command line flag --resource-labels to create extended resources
# this kustomize patch uncomments "- nodes/status"
- nodes/status
verbs:
- get
- patch
- update
# List only needed for --prune
- list
7 changes: 7 additions & 0 deletions gpu-aware-scheduling/docs/nfd/kustomization.yaml
@@ -0,0 +1,7 @@
resources:
- v0.7.0/nfd-master.yaml.template
- v0.7.0/nfd-worker-daemonset.yaml.template
patchesStrategicMerge:
- kustom/external_resources.yaml
- kustom/env_vars.yaml
- kustom/rbac.yaml