Skip to content

Commit

Permalink
Merge pull request #3181 from chengcongdu/storage
Browse files Browse the repository at this point in the history
add training example for gke parallelstore blueprint
  • Loading branch information
chengcongdu authored Nov 1, 2024
2 parents cbe7ffc + 431f05a commit 38f5344
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 21 deletions.
50 changes: 32 additions & 18 deletions examples/gke-storage-parallelstore.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ deployment_groups:
sc_volume_binding_mode: Immediate
sc_reclaim_policy: Delete # Use Retain if you want the volume and the parallelstore resource to remain after deletion
sc_topology_zones: [$(vars.zone)]
pvc_count: 2
pvc_count: 1
capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB

- id: sample-pool
Expand All @@ -76,32 +76,46 @@ deployment_groups:
settings:
name: sample-pool
zones: [$(vars.zone)]
machine_type: n2-standard-4
machine_type: n2-standard-16

### Parallelstore enabled Job ###
# Train a TensorFlow model with Keras and Parallelstore on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample

- id: parallelstore-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- parallelstore-setup
settings:
image: busybox
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
security_context: # ensures the job has sufficient access to execute its work and read/write from parallelstore
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bin/sh
- bash
- -c
- |
echo "Set up job folders"
shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])}
mkdir /data/parallelstore-pvc-0/${JOB}/ -p;
mkdir /data/parallelstore-pvc-1/${JOB}/ -p;
echo "Writing seed data to Parallelstore volumes"
dd if=/dev/urandom of=/data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000
dd if=/dev/urandom of=/data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000
# echo "Hash file and write between the 2 hyperdisk balanced volumes"
# md5sum /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.md5
# md5sum /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.md5
node_count: 5
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/parallelstore-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]
1 change: 1 addition & 0 deletions modules/compute/gke-job-template/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ No modules.
| <a name="input_random_name_sufix"></a> [random\_name\_sufix](#input\_random\_name\_sufix) | Appends a random suffix to the job name to avoid clashes. | `bool` | `true` | no |
| <a name="input_requested_cpu_per_pod"></a> [requested\_cpu\_per\_pod](#input\_requested\_cpu\_per\_pod) | The requested cpu per pod. If null, allocatable\_cpu\_per\_node will be used to claim whole nodes. If provided will override allocatable\_cpu\_per\_node. | `number` | `-1` | no |
| <a name="input_restart_policy"></a> [restart\_policy](#input\_restart\_policy) | Job restart policy. Only a RestartPolicy equal to `Never` or `OnFailure` is allowed. | `string` | `"Never"` | no |
| <a name="input_security_context"></a> [security\_context](#input\_security\_context) | The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ | <pre>list(object({<br/> key = string<br/> value = string<br/> }))</pre> | `[]` | no |
| <a name="input_tolerations"></a> [tolerations](#input\_tolerations) | Tolerations allow the scheduler to schedule pods with matching taints. Generally populated from gke-node-pool via `use` field. | <pre>list(object({<br/> key = string<br/> operator = string<br/> value = string<br/> effect = string<br/> }))</pre> | <pre>[<br/> {<br/> "effect": "NoSchedule",<br/> "key": "user-workload",<br/> "operator": "Equal",<br/> "value": "true"<br/> }<br/>]</pre> | no |

## Outputs
Expand Down
1 change: 1 addition & 0 deletions modules/compute/gke-job-template/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ locals {
restart_policy = var.restart_policy
backoff_limit = var.backoff_limit
tolerations = distinct(var.tolerations)
security_context = var.security_context
labels = local.labels

empty_dir_volumes = local.empty_dir_volumes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ spec:
gke-gcsfuse/volumes: "true"
%{~ endif ~}
spec:
%{~ if length(security_context) > 0 ~}
securityContext:
%{~ for context in security_context ~}
${context.key}: ${context.value}
%{~ endfor ~}
%{~ endif ~}
%{~ if k8s_service_account_name != null ~}
serviceAccountName: ${k8s_service_account_name}
%{~ endif ~}
Expand Down
9 changes: 9 additions & 0 deletions modules/compute/gke-job-template/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,15 @@ variable "tolerations" {
]
}

variable "security_context" {
description = "The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/"
type = list(object({
key = string
value = string
}))
default = []
}

variable "machine_family" {
description = "The machine family to use in the node selector (example: `n2`). If null then machine family will not be used as selector criteria."
type = string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
- name: Execute the job
delegate_to: localhost
ansible.builtin.shell: |
jobs=({{ workspace }}/{{ deployment_name }}/primary/my-job*)
jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*)
for job in "${jobs[@]}"; do
kubectl create -f "$job"
done
Expand All @@ -30,10 +30,10 @@
- name: Wait for job to complete
delegate_to: localhost
ansible.builtin.command: |
kubectl get job --field-selector status.successful=5
kubectl get job --field-selector status.successful=1
register: job_completion
until: job_completion.stdout_lines | length > 1
retries: 40
retries: 80
delay: 15

- name: Print job_completion debug output
Expand Down

0 comments on commit 38f5344

Please sign in to comment.