Skip to content

Commit

Permalink
Merge pull request #3181 from chengcongdu/storage
Browse files Browse the repository at this point in the history
add training example for gke parallelstore blueprint
  • Loading branch information
chengcongdu authored Nov 1, 2024
2 parents cbe7ffc + 431f05a commit 38f5344
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 21 deletions.
50 changes: 32 additions & 18 deletions examples/gke-storage-parallelstore.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ deployment_groups:
sc_volume_binding_mode: Immediate
sc_reclaim_policy: Delete # Use Retain if you want the volume and the parallelstore resource to remain after deletion
sc_topology_zones: [$(vars.zone)]
pvc_count: 2
pvc_count: 1
capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB

- id: sample-pool
Expand All @@ -76,32 +76,46 @@ deployment_groups:
settings:
name: sample-pool
zones: [$(vars.zone)]
machine_type: n2-standard-4
machine_type: n2-standard-16

### Parallelstore enabled Job ###
# Train a TensorFlow model with Keras and Parallelstore on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample

- id: parallelstore-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- parallelstore-setup
settings:
image: busybox
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
security_context: # ensures the job has sufficient access to execute its work and read/write from parallelstore
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bin/sh
- bash
- -c
- |
echo "Set up job folders"
shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])}
mkdir /data/parallelstore-pvc-0/${JOB}/ -p;
mkdir /data/parallelstore-pvc-1/${JOB}/ -p;
echo "Writing seed data to Parallelstore volumes"
dd if=/dev/urandom of=/data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000
dd if=/dev/urandom of=/data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000
# echo "Hash file and write between the 2 hyperdisk balanced volumes"
# md5sum /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.md5
# md5sum /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.md5
node_count: 5
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/parallelstore-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]
1 change: 1 addition & 0 deletions modules/compute/gke-job-template/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ No modules.
| <a name="input_random_name_sufix"></a> [random\_name\_sufix](#input\_random\_name\_sufix) | Appends a random suffix to the job name to avoid clashes. | `bool` | `true` | no |
| <a name="input_requested_cpu_per_pod"></a> [requested\_cpu\_per\_pod](#input\_requested\_cpu\_per\_pod) | The requested cpu per pod. If null, allocatable\_cpu\_per\_node will be used to claim whole nodes. If provided will override allocatable\_cpu\_per\_node. | `number` | `-1` | no |
| <a name="input_restart_policy"></a> [restart\_policy](#input\_restart\_policy) | Job restart policy. Only a RestartPolicy equal to `Never` or `OnFailure` is allowed. | `string` | `"Never"` | no |
| <a name="input_security_context"></a> [security\_context](#input\_security\_context) | The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ | <pre>list(object({<br/> key = string<br/> value = string<br/> }))</pre> | `[]` | no |
| <a name="input_tolerations"></a> [tolerations](#input\_tolerations) | Tolerations allow the scheduler to schedule pods with matching taints. Generally populated from gke-node-pool via `use` field. | <pre>list(object({<br/> key = string<br/> operator = string<br/> value = string<br/> effect = string<br/> }))</pre> | <pre>[<br/> {<br/> "effect": "NoSchedule",<br/> "key": "user-workload",<br/> "operator": "Equal",<br/> "value": "true"<br/> }<br/>]</pre> | no |

## Outputs
Expand Down
1 change: 1 addition & 0 deletions modules/compute/gke-job-template/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ locals {
restart_policy = var.restart_policy
backoff_limit = var.backoff_limit
tolerations = distinct(var.tolerations)
security_context = var.security_context
labels = local.labels

empty_dir_volumes = local.empty_dir_volumes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ spec:
gke-gcsfuse/volumes: "true"
%{~ endif ~}
spec:
%{~ if length(security_context) > 0 ~}
securityContext:
%{~ for context in security_context ~}
${context.key}: ${context.value}
%{~ endfor ~}
%{~ endif ~}
%{~ if k8s_service_account_name != null ~}
serviceAccountName: ${k8s_service_account_name}
%{~ endif ~}
Expand Down
9 changes: 9 additions & 0 deletions modules/compute/gke-job-template/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,15 @@ variable "tolerations" {
]
}

variable "security_context" {
description = "The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/"
type = list(object({
key = string
value = string
}))
default = []
}

variable "machine_family" {
description = "The machine family to use in the node selector (example: `n2`). If null then machine family will not be used as selector criteria."
type = string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
- name: Execute the job
delegate_to: localhost
ansible.builtin.shell: |
jobs=({{ workspace }}/{{ deployment_name }}/primary/my-job*)
jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*)
for job in "${jobs[@]}"; do
kubectl create -f "$job"
done
Expand All @@ -30,10 +30,10 @@
- name: Wait for job to complete
delegate_to: localhost
ansible.builtin.command: |
kubectl get job --field-selector status.successful=5
kubectl get job --field-selector status.successful=1
register: job_completion
until: job_completion.stdout_lines | length > 1
retries: 40
retries: 80
delay: 15

- name: Print job_completion debug output
Expand Down

0 comments on commit 38f5344

Please sign in to comment.