From 7885a22d758847205416902096dfa1c578f3552b Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Mon, 28 Oct 2024 23:54:42 +0000 Subject: [PATCH 1/3] add training example for gke parallelstore blueprint --- examples/gke-storage-parallelstore.yaml | 54 ++++++++++++------- modules/compute/gke-job-template/README.md | 1 + modules/compute/gke-job-template/main.tf | 1 + .../templates/gke-job-base.yaml.tftpl | 6 +++ modules/compute/gke-job-template/variables.tf | 9 ++++ .../test-gke-storage-parallelstore.yml | 6 +-- 6 files changed, 54 insertions(+), 23 deletions(-) diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 413e523da7..45b51d6c74 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -blueprint_name: gke-storage-parallelstore +blueprint_name: gke-storage-parallelstore-dev vars: project_id: ## Set GCP Project ID Here ## - deployment_name: gke-storage-parallelstore + deployment_name: gke-storage-parallelstore-dev region: us-central1 zone: us-central1-c @@ -67,7 +67,7 @@ deployment_groups: sc_volume_binding_mode: Immediate sc_reclaim_policy: Delete # Use Retain if you want to volume and parallelstore resource will remain after sc_topology_zones: [$(vars.zone)] - pvc_count: 2 + pvc_count: 1 capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB - id: sample-pool @@ -76,9 +76,10 @@ deployment_groups: settings: name: sample-pool zones: [$(vars.zone)] - machine_type: n2-standard-4 + machine_type: n2-standard-16 - ### Parallelstore enabled Job ### + # Train a TensorFlow model with Keras and Parallelstore on GKE + # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample - id: parallelstore-job source: modules/compute/gke-job-template @@ -86,22 +87,35 @@ deployment_groups: - gke_cluster - parallelstore-setup 
settings: - image: busybox + name: tensorflow + image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d + security_context: + - key: runAsUser + value: 1000 + - key: runAsGroup + value: 100 + - key: fsGroup + value: 100 command: - - bin/sh + - bash - -c - | - echo "Set up job folders" - shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])} - mkdir /data/parallelstore-pvc-0/${JOB}/ -p; - mkdir /data/parallelstore-pvc-1/${JOB}/ -p; - - echo "Writing seed data to Parallelstore volumes" - dd if=/dev/urandom of=/data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 - dd if=/dev/urandom of=/data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 - - # echo "Hash file and write between the 2 hyerpdisk balanced volumes" - # md5sum /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.md5 - # md5sum /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.md5 - node_count: 5 + pip install transformers datasets + python - < [random\_name\_sufix](#input\_random\_name\_sufix) | Appends a random suffix to the job name to avoid clashes. | `bool` | `true` | no | | [requested\_cpu\_per\_pod](#input\_requested\_cpu\_per\_pod) | The requested cpu per pod. If null, allocatable\_cpu\_per\_node will be used to claim whole nodes. If provided will override allocatable\_cpu\_per\_node. | `number` | `-1` | no | | [restart\_policy](#input\_restart\_policy) | Job restart policy. Only a RestartPolicy equal to `Never` or `OnFailure` is allowed. | `string` | `"Never"` | no | +| [security\_context](#input\_security\_context) | The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ |
list(object({
key = string
value = string
}))
| `[]` | no | | [tolerations](#input\_tolerations) | Tolerations allow the scheduler to schedule pods with matching taints. Generally populated from gke-node-pool via `use` field. |
list(object({
key = string
operator = string
value = string
effect = string
}))
|
[
{
"effect": "NoSchedule",
"key": "user-workload",
"operator": "Equal",
"value": "true"
}
]
| no | ## Outputs diff --git a/modules/compute/gke-job-template/main.tf b/modules/compute/gke-job-template/main.tf index cded3fbb1d..2e21c7c394 100644 --- a/modules/compute/gke-job-template/main.tf +++ b/modules/compute/gke-job-template/main.tf @@ -129,6 +129,7 @@ locals { restart_policy = var.restart_policy backoff_limit = var.backoff_limit tolerations = distinct(var.tolerations) + security_context = var.security_context labels = local.labels empty_dir_volumes = local.empty_dir_volumes diff --git a/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl b/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl index 61c34f8b25..431a519b9c 100644 --- a/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl +++ b/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl @@ -18,6 +18,12 @@ spec: gke-gcsfuse/volumes: "true" %{~ endif ~} spec: + %{~ if length(security_context) > 0 ~} + securityContext: + %{~ for context in security_context ~} + ${context.key}: ${context.value} + %{~ endfor ~} + %{~ endif ~} %{~ if k8s_service_account_name != null ~} serviceAccountName: ${k8s_service_account_name} %{~ endif ~} diff --git a/modules/compute/gke-job-template/variables.tf b/modules/compute/gke-job-template/variables.tf index 279293cf26..6a37c344c1 100644 --- a/modules/compute/gke-job-template/variables.tf +++ b/modules/compute/gke-job-template/variables.tf @@ -92,6 +92,15 @@ variable "tolerations" { ] } +variable "security_context" { + description = "The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/" + type = list(object({ + key = string + value = string + })) + default = [] +} + variable "machine_family" { description = "The machine family to use in the node selector (example: `n2`). If null then machine family will not be used as selector criteria." 
type = string diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml index 424908f436..adceaa1087 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml @@ -19,7 +19,7 @@ - name: Execute the job delegate_to: localhost ansible.builtin.shell: | - jobs=({{ workspace }}/{{ deployment_name }}/primary/my-job*) + jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*) for job in "${jobs[@]}"; do kubectl create -f "$job" done @@ -30,10 +30,10 @@ - name: Wait for job to complete delegate_to: localhost ansible.builtin.command: | - kubectl get job --field-selector status.successful=5 + kubectl get job --field-selector status.successful=1 register: job_completion until: job_completion.stdout_lines | length > 1 - retries: 40 + retries: 80 delay: 15 - name: Print job_completion debug output From 0b18f4b02164320d8de3555a6c0b9c9e239acaf9 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Mon, 28 Oct 2024 23:57:03 +0000 Subject: [PATCH 2/3] fix blueprint name --- examples/gke-storage-parallelstore.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 45b51d6c74..6b88fd7913 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -blueprint_name: gke-storage-parallelstore-dev +blueprint_name: gke-storage-parallelstore vars: project_id: ## Set GCP Project ID Here ## - deployment_name: gke-storage-parallelstore-dev + deployment_name: gke-storage-parallelstore region: us-central1 zone: us-central1-c From 431f05ab9e67fa8e0e7127345d98c36634fce494 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 1 Nov 2024 17:33:57 +0000 Subject: [PATCH 3/3] fix comment --- examples/gke-storage-parallelstore.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 6b88fd7913..9ffe737e83 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -89,7 +89,7 @@ deployment_groups: settings: name: tensorflow image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d - security_context: + security_context: # ensures the job has sufficient permissions to run and to read/write Parallelstore - key: runAsUser value: 1000 - key: runAsGroup