From 1447c1481ec6e566bceebb9b54a8c262a4bb6a2c Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Thu, 29 Feb 2024 19:25:42 +0000 Subject: [PATCH] Add Slurm v6 version for image builder blueprint --- examples/README.md | 126 ++++++++++++++++++ examples/image-builder-v6.yaml | 119 +++++++++++++++++ .../daily-tests/builds/packer-v6.yaml | 56 ++++++++ .../daily-tests/tests/packer-v6.yml | 27 ++++ 4 files changed, 328 insertions(+) create mode 100644 examples/image-builder-v6.yaml create mode 100644 tools/cloud-build/daily-tests/builds/packer-v6.yaml create mode 100644 tools/cloud-build/daily-tests/tests/packer-v6.yml diff --git a/examples/README.md b/examples/README.md index 3675aa94c5..ba8b784004 100644 --- a/examples/README.md +++ b/examples/README.md @@ -39,6 +39,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-slurm6-tpu.yaml](#hpc-slurm6-tpuyaml--) ![community-badge] ![experimental-badge] * [ml-slurm.yaml](#ml-slurmyaml-) ![core-badge] * [image-builder.yaml](#image-builderyaml-) ![core-badge] + * [image-builder-v6.yaml](#image-builder-v6yaml--) ![core-badge] ![experimental-badge] * [serverless-batch.yaml](#serverless-batchyaml-) ![core-badge] * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge] * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge] @@ -517,6 +518,131 @@ partition is using the custom image. 
Each compute node should contain the For this example the following is needed in the selected region: +* Compute Engine API: Images (global, not regional quota): 1 image per invocation of `packer build` +* Compute Engine API: Persistent Disk SSD (GB): **~50 GB** +* Compute Engine API: Persistent Disk Standard (GB): **~64 GB static + 32 + GB/node** up to 704 GB +* Compute Engine API: N2 CPUs: **4** (for short-lived Packer VM and Slurm login node) +* Compute Engine API: C2 CPUs: **4** for controller node and **60/node** active + in `compute` partition up to 1,204 +* Compute Engine API: Affinity Groups: **one for each job in parallel** - _only + needed for `compute` partition_ +* Compute Engine API: Resource policies: **one for each job in parallel** - + _only needed for `compute` partition_ + +### [image-builder-v6.yaml] ![core-badge] ![experimental-badge] + +This blueprint uses the [Packer template module][pkr] to create a custom VM +image and uses it to provision an HPC cluster using the Slurm scheduler. By +using a custom image, the cluster is able to begin running jobs sooner and more +reliably because there is no need to install applications as VMs boot. This +example takes the following steps: + +1. Creates a network with outbound internet access in which to build the image (see +[Custom Network](#custom-network-deployment-group-1)). +2. Creates a script that will be used to customize the image (see +[Toolkit Runners](#toolkit-runners-deployment-group-1)). +3. Builds a custom Slurm image by executing the script on a standard Slurm image +(see [Packer Template](#packer-template-deployment-group-2)). +4. Deploys a Slurm cluster using the custom image (see +[Slurm Cluster Based on Custom Image](#slurm-cluster-based-on-custom-image-deployment-group-3)). 
+ +#### Building and using the custom image + +Create the deployment folder from the blueprint: + +```text +./ghpc create examples/image-builder-v6.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./ghpc deploy image-builder-v6-001 +``` + +Follow the on-screen prompts to approve the creation of each deployment group. +For example, the network is created in the first deployment group, the VM image +is created in the second group, and the third group uses the image to create an +HPC cluster using the Slurm scheduler. + +When you are done, clean up the resources in reverse order of creation: + +```text +terraform -chdir=image-builder-v6-001/cluster destroy --auto-approve +terraform -chdir=image-builder-v6-001/primary destroy --auto-approve +``` + +Finally, browse to the [Cloud Console][console-images] to delete your custom +image. It will be named beginning with `my-slurm-image` followed by a date and +timestamp for uniqueness. + +[console-images]: https://console.cloud.google.com/compute/images + +#### Why use a custom image? + +Using a custom VM image can be more scalable and reliable than installing +software using boot-time startup scripts because: + +* it avoids reliance on continued availability of package repositories +* VMs will join an HPC cluster and execute workloads more rapidly due to reduced + boot-time configuration +* machines are guaranteed to boot with a static software configuration chosen + when the custom image was created. No potential for some machines to have + different software versions installed due to `apt`/`yum`/`pip` installations + executed after remote repositories have been updated. 
+ +[hpcimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm +[pkr]: ../modules/packer/custom-image/README.md +[image-builder-v6.yaml]: ./image-builder-v6.yaml + +#### Custom Network (deployment group 1) + +A tool called [Packer](https://packer.io) builds custom VM images by creating +short-lived VMs, executing scripts on them, and saving the boot disk as an +image that can be used by future VMs. The short-lived VM typically operates in a +network that has outbound access to the internet for downloading software. + +This deployment group creates a network using [Cloud Nat][cloudnat] and +[Identity-Aware Proxy (IAP)][iap] to allow outbound traffic and inbound SSH +connections without exposing the machine to the internet on a public IP address. + +[cloudnat]: https://cloud.google.com/nat/docs/overview +[iap]: https://cloud.google.com/iap/docs/using-tcp-forwarding + +#### Toolkit Runners (deployment group 1) + +The Toolkit [startup-script](../modules/scripts/startup-script/README.md) +module supports boot-time configuration of VMs using "runners". Runners are +configured as a series of scripts uploaded to Cloud Storage. A simple, standard +[VM startup script][vmstartup] runs at boot-time, downloads the scripts from +Cloud Storage and executes them in sequence. + +The script in this example performs the trivial task of creating a file as a +simple demonstration of functionality. You can use the startup-script module +to address more complex scenarios. + +[vmstartup]: https://cloud.google.com/compute/docs/instances/startup-scripts/linux + +#### Packer Template (deployment group 2) + +The Packer module uses the startup-script module from the first deployment group +and executes the script to produce a custom image. + +#### Slurm Cluster Based on Custom Image (deployment group 3) + +Once the Slurm cluster has been deployed we can test that our Slurm compute +partition is using the custom image. 
Each compute node should contain the +`hello.txt` file added by the startup-script. + +1. SSH into the login node `slurm-image-builder-v6-001-login0`. +2. Run a job that prints the contents of the added file: + + ```bash + $ srun -N 2 cat /home/hello.txt + Hello World + Hello World + ``` + +#### Quota Requirements for image-builder-v6.yaml + +For this example the following is needed in the selected region: + * Compute Engine API: Images (global, not regional quota): 1 image per invocation of `packer build` * Compute Engine API: Persistent Disk SSD (GB): **~50 GB** * Compute Engine API: Persistent Disk Standard (GB): **~64 GB static + 32 diff --git a/examples/image-builder-v6.yaml b/examples/image-builder-v6.yaml new file mode 100644 index 0000000000..1da2052155 --- /dev/null +++ b/examples/image-builder-v6.yaml @@ -0,0 +1,119 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- + +# See instructions at +# https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#image-builder-v6yaml-- + +blueprint_name: image-builder-v6 + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: image-builder-v6-001 + region: us-central1 + zone: us-central1-c + custom_image: + family: my-slurm-image + project: $(vars.project_id) + disk_size: 32 + +# Documentation for each of the modules used below can be found at +# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md + +deployment_groups: +- group: primary + modules: + - id: network + source: modules/network/vpc + + - id: scripts_for_image + source: modules/scripts/startup-script + settings: + runners: + - type: shell + destination: generate_hello.sh + content: | + #!/bin/sh + echo "Hello World" > /home/hello.txt + + - id: builder_sa + source: community/modules/project/service-account + settings: + name: pkr + project_roles: + - compute.instanceAdmin.v1 + - logging.logWriter + - monitoring.metricWriter + - storage.objectViewer + +- group: packer + modules: + - id: custom-image + source: modules/packer/custom-image + kind: packer + use: + - network + - scripts_for_image + - builder_sa + settings: + source_image_project_id: [schedmd-slurm-public] + # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family + source_image_family: slurm-gcp-6-4-hpc-rocky-linux-8 + # You can find the size of the source image by using the following command + # gcloud compute images describe-from-family slurm-gcp-6-4-hpc-rocky-linux-8 --project schedmd-slurm-public + disk_size: $(vars.disk_size) + image_family: $(vars.custom_image.family) + state_timeout: 15m + +- group: cluster + modules: + - id: compute_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + node_count_dynamic_max: 20 + disk_size_gb: $(vars.disk_size) + instance_image: $(vars.custom_image) + instance_image_custom: true + bandwidth_tier: 
gvnic_enabled + + - id: compute_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: [compute_nodeset] + settings: + partition_name: compute + is_default: true + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: [network] + settings: + name_prefix: login + disable_login_public_ips: false + disk_size_gb: $(vars.disk_size) + instance_image: $(vars.custom_image) + instance_image_custom: true + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - network + - compute_partition + - slurm_login + settings: + disable_controller_public_ips: false + disk_size_gb: $(vars.disk_size) + instance_image: $(vars.custom_image) + instance_image_custom: true diff --git a/tools/cloud-build/daily-tests/builds/packer-v6.yaml b/tools/cloud-build/daily-tests/builds/packer-v6.yaml new file mode 100644 index 0000000000..6c0e0c8f00 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/packer-v6.yaml @@ -0,0 +1,56 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +tags: [] +timeout: 14400s # 4hr +steps: +## Test simple golang build +- id: build_ghpc + waitFor: ["-"] + name: "golang:bullseye" + entrypoint: /bin/bash + args: + - -c + - | + cd /workspace + make +- id: fetch_builder + waitFor: ["-"] + name: >- + us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + args: + - -c + - echo "done fetching builder" + +# test image creation by provisioning a new VPC and using Packer to build an +# image in it +- id: packer-v6 + waitFor: ["fetch_builder", "build_ghpc"] + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/packer-v6.yml" diff --git a/tools/cloud-build/daily-tests/tests/packer-v6.yml b/tools/cloud-build/daily-tests/tests/packer-v6.yml new file mode 100644 index 0000000000..b21e84bb8d --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/packer-v6.yml @@ -0,0 +1,27 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- + +test_name: image-builder-v6 +deployment_name: pkrv6{{ build }} +zone: us-central1-c +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/image-builder-v6.yaml" +network: "{{ deployment_name }}-net" +packer_group_name: packer +packer_module_id: custom-image +cli_deployment_vars: + network_name: "{{ network }}" + subnetwork_name: "{{ network }}-sub"