diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml
new file mode 100644
index 0000000000..6a82fe8d9f
--- /dev/null
+++ b/community/examples/hpc-build-slurm-image.yaml
@@ -0,0 +1,119 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+blueprint_name: hpc-build-slurm-image
+
+vars:
+  project_id:  ## Set GCP Project ID Here ##
+  deployment_name: build-slurm-1
+  region: us-central1
+  zone: us-central1-a
+
+  image_build_machine_type: n2d-standard-32
+  build_from_image_family: hpc-rocky-linux-8
+  build_from_image_project: cloud-hpc-image-public
+  built_image_family: my-custom-slurm
+  built_instance_image:
+    family: $(vars.built_image_family)
+    project: $(vars.project_id)
+  instance_image_custom: true
+
+deployment_groups:
+- group: setup
+  modules:
+  - id: network
+    source: modules/network/vpc
+
+  - id: slurm-build-script
+    source: modules/scripts/startup-script
+    settings:
+      # Do not create Ansible virtual env; Install system wide Ansible below.
+      install_ansible: false
+      runners:
+      - type: shell
+        destination: prep-for-slurm-build.sh
+        content: |
+          #!/bin/bash
+          set -e -o pipefail
+          # Slurm build on Rocky8 will upgrade to python38 as part of build
+          # This is not compatible with ansible-local runner
+          dnf install -y python38
+          alternatives --set python3 /usr/bin/python3.8
+          python3 -m pip install pip --upgrade
+          python3 -m pip install ansible==6.7.0
+          python3 -m pip install selinux
+          export PATH=/usr/local/bin:$PATH
+          ansible --version
+          ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents
+      - type: data
+        destination: /var/tmp/slurm_vars.json
+        content: |
+          {
+            "reboot": false,
+            "slurm_version": "23.02.5",
+            "install_cuda": false,
+            "nvidia_version": "latest",
+            "install_ompi": true,
+            "install_lustre": false,
+            "install_gcsfuse": true
+          }
+      - type: shell
+        destination: install_slurm.sh
+        content: |
+          #!/bin/bash
+          set -e -o pipefail
+          ansible-pull \
+            -U https://github.com/GoogleCloudPlatform/slurm-gcp -C master \
+            -i localhost, --limit localhost --connection=local \
+            -e @/var/tmp/slurm_vars.json \
+            ansible/playbook.yml
+
+- group: build-slurm
+  modules:
+  - id: slurm-custom-image
+    source: modules/packer/custom-image
+    kind: packer
+    settings:
+      machine_type: $(vars.image_build_machine_type)
+      source_image_family: $(vars.build_from_image_family)
+      source_image_project_id: [$(vars.build_from_image_project)]
+      image_family: $(vars.built_image_family)
+    use:
+    - network
+    - slurm-build-script
+
+- group: demo-cluster
+  modules:
+
+  - id: debug_nodeset
+    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
+    use: [network]
+    settings:
+      machine_type: n2d-standard-2
+      instance_image: $(vars.built_instance_image)
+
+  - id: debug_partition
+    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
+    use: [debug_nodeset]
+    settings:
+      partition_name: debug
+
+  - id: slurm_controller
+    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
+    use:
+    - network
+    - debug_partition
+    settings:
+      machine_type: n2d-standard-4
+      instance_image: $(vars.built_instance_image)
diff --git a/examples/README.md b/examples/README.md
index 50d928393f..76368a0f05 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -20,6 +20,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /"
   * [serverless-batch-mpi.yaml](#serverless-batch-mpiyaml-) ![core-badge]
   * [pfs-lustre.yaml](#pfs-lustreyaml-) ![core-badge]
   * [cae-slurm.yaml](#cae-slurmyaml-) ![core-badge]
+  * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml-) ![community-badge]
   * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml-) ![community-badge]
   * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge]
   * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge]
@@ -578,6 +579,25 @@ For this example the following is needed in the selected region:
 
 [cae-slurm.yaml]: ../examples/cae/cae-slurm.yaml
 
+### [hpc-build-slurm-image.yaml] ![community-badge]
+
+This blueprint demonstrates how to use HPC Toolkit to build a Slurm image on top
+of an existing image, `hpc-rocky-linux-8` in the case of this example.
+
+The blueprint contains 3 groups:
+
+1. The first group creates a network and generates the scripts that will install
+   Slurm. This uses the Ansible Playbook contained in the
+   [Slurm on GCP](https://github.com/GoogleCloudPlatform/slurm-gcp) repo.
+2. The second group executes the build using Packer to run the scripts from the
+   first group. This can take ~30 min and will generate a custom Slurm image in
+   your project.
+3. The third group deploys a demo cluster that uses the newly built image. For a
+   real world use case the demo cluster can be swapped out for a more powerful
+   Slurm cluster from other examples.
+
+[hpc-build-slurm-image.yaml]: ../community/examples/hpc-build-slurm-image.yaml
+
 ### [hpc-slurm-ubuntu2004.yaml] ![community-badge]
 
 > **Warning**: The variables `enable_reconfigure`,
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
index af3383d517..3245b2a0f5 100644
--- a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
+++ b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
@@ -49,7 +49,7 @@
         set -e -o pipefail
         gcloud compute images delete --project={{ project }} --quiet $(jq -r '.builds[-1].artifact_id' packer-manifest.json | cut -d ":" -f2)
       args:
-        chdir: "{{ workspace }}/{{ deployment_name }}/packer/custom-image"
+        chdir: "{{ workspace }}/{{ deployment_name }}/{{ packer_group_name }}/{{ packer_module_id }}"
         executable: /bin/bash
     - name: Trigger Cloud Build failure
       when: ghpc_destroy.failed or image_deletion.failed
diff --git a/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml
new file mode 100644
index 0000000000..d71788e2a6
--- /dev/null
+++ b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml
@@ -0,0 +1,53 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+timeout: 5400s  # 1.5h
+steps:
+## Test simple golang build
+- id: build_ghpc
+  waitFor: ["-"]
+  name: "golang:bullseye"
+  entrypoint: /bin/bash
+  args:
+  - -c
+  - |
+    cd /workspace
+    make
+- id: fetch_builder
+  waitFor: ["-"]
+  name: >-
+    us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
+  entrypoint: /bin/bash
+  args:
+  - -c
+  - echo "done fetching builder"
+
+- id: hpc-build-slurm-image
+  waitFor: ["fetch_builder", "build_ghpc"]
+  name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
+  entrypoint: /bin/bash
+  env:
+  - "ANSIBLE_HOST_KEY_CHECKING=false"
+  - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
+  args:
+  - -c
+  - |
+    set -x -e
+    BUILD_ID_FULL=$BUILD_ID
+    BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}
+
+    ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \
+      --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
+      --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml"
diff --git a/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml b/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml
new file mode 100644
index 0000000000..deb3d3eedc
--- /dev/null
+++ b/tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml
@@ -0,0 +1,27 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+test_name: hpc-build-slurm-image
+deployment_name: "build-slurm-{{ build }}"
+zone: us-central1-c
+workspace: /workspace
+blueprint_yaml: "{{ workspace }}/community/examples/hpc-build-slurm-image.yaml"
+network: "{{ deployment_name }}-net"
+packer_group_name: build-slurm
+packer_module_id: slurm-custom-image
+cli_deployment_vars:
+  network_name: "{{ network }}"
+  subnetwork_name: "{{ network }}-sub"
diff --git a/tools/cloud-build/daily-tests/tests/packer.yml b/tools/cloud-build/daily-tests/tests/packer.yml
index 54fb9ddd28..70dd6c9597 100644
--- a/tools/cloud-build/daily-tests/tests/packer.yml
+++ b/tools/cloud-build/daily-tests/tests/packer.yml
@@ -20,6 +20,8 @@ zone: us-central1-c
 workspace: /workspace
 blueprint_yaml: "{{ workspace }}/examples/image-builder.yaml"
 network: "{{ deployment_name }}-net"
+packer_group_name: packer
+packer_module_id: custom-image
 cli_deployment_vars:
   network_name: "{{ network }}"
   subnetwork_name: "{{ network }}-sub"