-
Notifications
You must be signed in to change notification settings - Fork 140
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add SlurmGCP v6 example of htc-slurm blueprint and integration test
- Loading branch information
1 parent
19e78f6
commit 896491d
Showing
5 changed files
with
364 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
# Copyright 2024 Google LLC | ||
# Copyright (C) SchedMD LLC. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
--- | ||
|
||
# This blueprint provisions a cluster using the Slurm scheduler configured to | ||
# efficiently run many short duration, loosely-coupled (non-MPI) jobs. See also: | ||
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md | ||
# https://slurm.schedmd.com/high_throughput.html | ||
|
||
blueprint_name: htc-slurm-v6 | ||
|
||
vars: | ||
project_id: ## Set GCP Project ID Here ## | ||
deployment_name: htc-slurm-v6 | ||
region: us-west4 | ||
zone: us-west4-c | ||
# By default, public IPs are set in the login and controller to allow easier | ||
# SSH access. To turn this behavior off, set this to true. | ||
disable_public_ips: false | ||
|
||
# Documentation for each of the modules used below can be found at | ||
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md | ||
|
||
deployment_groups: | ||
- group: primary | ||
modules: | ||
# Source is an embedded module, denoted by "modules/*" without ./, ../, / | ||
# as a prefix. To refer to a local or community module, prefix with ./, ../ or / | ||
# Example - ./modules/network/pre-existing-vpc | ||
- id: network | ||
source: modules/network/vpc | ||
|
||
- id: homefs | ||
source: modules/file-system/filestore | ||
use: [network] | ||
settings: | ||
local_mount: /home | ||
|
||
- id: projectsfs | ||
source: modules/file-system/filestore | ||
use: [network] | ||
settings: | ||
filestore_tier: HIGH_SCALE_SSD | ||
size_gb: 10240 | ||
local_mount: /projects | ||
|
||
# This file system has an associated license cost. | ||
# https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud | ||
- id: scratchfs | ||
source: community/modules/file-system/DDN-EXAScaler | ||
use: [network] | ||
settings: | ||
local_mount: /scratch | ||
|
||
# The compute partition is designed for performance. | ||
# Use: | ||
# `srun -N 4 -p compute <<Command>>` for any node in the partition. | ||
# `srun -N 4 -p compute --mincpus 30 <<Command>>` for node group c2s60. | ||
- id: compute_nodeset_c2s60 | ||
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset | ||
use: [network] | ||
settings: | ||
name: c2s60 | ||
node_count_dynamic_max: 200 | ||
bandwidth_tier: gvnic_enabled | ||
enable_placement: false | ||
|
||
# Second nodeset of the compute partition, built from c2-standard-30 machines. | ||
# Named explicitly, like every other nodeset in this blueprint. | ||
- id: compute_nodeset_c2s30 | ||
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset | ||
use: [network] | ||
settings: | ||
name: c2s30 | ||
node_count_dynamic_max: 200 | ||
machine_type: c2-standard-30 | ||
bandwidth_tier: gvnic_enabled | ||
enable_placement: false | ||
|
||
- id: compute_partition | ||
source: community/modules/compute/schedmd-slurm-gcp-v6-partition | ||
use: | ||
- homefs | ||
- scratchfs | ||
- projectsfs | ||
- compute_nodeset_c2s60 | ||
- compute_nodeset_c2s30 | ||
settings: | ||
partition_name: compute | ||
exclusive: false | ||
|
||
# The lowcost partition is designed to run at a lower cost and without additional quota. | ||
# Use: | ||
# `srun -N 4 <<Command>>` for any node in the partition. | ||
# `srun -N 4 --mincpus 2 <<Command>>` for node group n2s4. | ||
- id: low_cost_nodeset_n2s2 | ||
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset | ||
use: [network] | ||
settings: | ||
name: n2s2 | ||
machine_type: n2-standard-2 | ||
node_count_dynamic_max: 10 | ||
bandwidth_tier: gvnic_enabled | ||
enable_placement: false | ||
|
||
- id: low_cost_nodeset_n2s4 | ||
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset | ||
use: [network] | ||
settings: | ||
name: n2s4 | ||
machine_type: n2-standard-4 | ||
node_count_dynamic_max: 10 | ||
bandwidth_tier: gvnic_enabled | ||
enable_placement: false | ||
|
||
- id: low_cost_partition | ||
source: community/modules/compute/schedmd-slurm-gcp-v6-partition | ||
use: | ||
- homefs | ||
- scratchfs | ||
- projectsfs | ||
- low_cost_nodeset_n2s2 | ||
- low_cost_nodeset_n2s4 | ||
settings: | ||
is_default: true | ||
partition_name: lowcost | ||
exclusive: false | ||
|
||
- id: slurm_login | ||
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login | ||
use: [network] | ||
settings: | ||
name_prefix: login | ||
machine_type: n2-standard-4 | ||
disable_login_public_ips: $(vars.disable_public_ips) | ||
|
||
- id: slurm_controller | ||
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller | ||
use: | ||
- network | ||
- homefs | ||
- scratchfs | ||
- projectsfs | ||
- low_cost_partition | ||
- compute_partition | ||
- slurm_login | ||
settings: | ||
machine_type: c2-standard-8 | ||
disable_controller_public_ips: $(vars.disable_public_ips) | ||
slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl | ||
slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl | ||
|
||
- id: hpc_dashboard | ||
source: modules/monitoring/dashboard | ||
outputs: [instructions] |
67 changes: 67 additions & 0 deletions
67
community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# slurm.conf | ||
# https://slurm.schedmd.com/high_throughput.html | ||
|
||
ProctrackType=proctrack/cgroup | ||
SlurmctldPidFile=/var/run/slurm/slurmctld.pid | ||
SlurmdPidFile=/var/run/slurm/slurmd.pid | ||
TaskPlugin=task/affinity,task/cgroup | ||
MaxArraySize=10001 | ||
MaxJobCount=500000 | ||
MaxNodeCount=100000 | ||
MinJobAge=60 | ||
|
||
# | ||
# | ||
# SCHEDULING | ||
SchedulerType=sched/backfill | ||
SelectType=select/cons_tres | ||
SelectTypeParameters=CR_Core_Memory | ||
|
||
# | ||
# | ||
# LOGGING AND ACCOUNTING | ||
SlurmctldDebug=error | ||
SlurmdDebug=error | ||
|
||
# | ||
# | ||
# TIMERS | ||
MessageTimeout=60 | ||
|
||
################################################################################ | ||
# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # | ||
################################################################################ | ||
|
||
SlurmctldHost={control_host}({control_addr}) | ||
|
||
AuthType=auth/munge | ||
AuthInfo=cred_expire=120 | ||
AuthAltTypes=auth/jwt | ||
CredType=cred/munge | ||
MpiDefault={mpi_default} | ||
ReturnToService=2 | ||
SlurmctldPort={control_host_port} | ||
SlurmdPort=6818 | ||
SlurmdSpoolDir=/var/spool/slurmd | ||
SlurmUser=slurm | ||
StateSaveLocation={state_save} | ||
|
||
# | ||
# | ||
# LOGGING AND ACCOUNTING | ||
AccountingStorageType=accounting_storage/slurmdbd | ||
AccountingStorageHost={control_host} | ||
ClusterName={name} | ||
SlurmctldLogFile={slurmlog}/slurmctld.log | ||
SlurmdLogFile={slurmlog}/slurmd-%n.log | ||
|
||
# | ||
# | ||
# GENERATED CLOUD CONFIGURATIONS | ||
include cloud.conf | ||
|
||
################################################################################ | ||
# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # | ||
################################################################################ | ||
|
||
SchedulerParameters=defer,salloc_wait_nodes,batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000 |
34 changes: 34 additions & 0 deletions
34
community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# slurmdbd.conf | ||
# https://slurm.schedmd.com/slurmdbd.conf.html | ||
|
||
DebugLevel=info | ||
PidFile=/var/run/slurm/slurmdbd.pid | ||
|
||
# https://slurm.schedmd.com/slurmdbd.conf.html#OPT_CommitDelay | ||
CommitDelay=1 | ||
|
||
################################################################################ | ||
# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # | ||
################################################################################ | ||
|
||
AuthType=auth/munge | ||
AuthAltTypes=auth/jwt | ||
AuthAltParameters=jwt_key={state_save}/jwt_hs256.key | ||
|
||
DbdHost={control_host} | ||
|
||
LogFile={slurmlog}/slurmdbd.log | ||
|
||
SlurmUser=slurm | ||
|
||
StorageLoc={db_name} | ||
|
||
StorageType=accounting_storage/mysql | ||
StorageHost={db_host} | ||
StoragePort={db_port} | ||
StorageUser={db_user} | ||
StoragePass={db_pass} | ||
|
||
################################################################################ | ||
# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # | ||
################################################################################ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
--- | ||
tags: | ||
- m.filestore | ||
- m.DDN-EXAScaler | ||
- m.schedmd-slurm-gcp-v6-controller | ||
- m.schedmd-slurm-gcp-v6-login | ||
- m.schedmd-slurm-gcp-v6-nodeset | ||
- m.schedmd-slurm-gcp-v6-partition | ||
- m.vpc | ||
- m.dashboard | ||
- slurm6 | ||
|
||
timeout: 14400s # 4hr | ||
steps: | ||
## Test simple golang build | ||
- id: build_ghpc | ||
waitFor: ["-"] | ||
name: "golang:bullseye" | ||
entrypoint: /bin/bash | ||
args: | ||
- -c | ||
- | | ||
cd /workspace | ||
make | ||
- id: fetch_builder | ||
waitFor: ["-"] | ||
name: >- | ||
us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder | ||
entrypoint: /bin/bash | ||
args: | ||
- -c | ||
- echo "done fetching builder" | ||
|
||
# Test htc-slurm deployment. | ||
- id: htc-slurm-v6 | ||
waitFor: ["fetch_builder", "build_ghpc"] | ||
name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder | ||
entrypoint: /bin/bash | ||
env: | ||
- "ANSIBLE_HOST_KEY_CHECKING=false" | ||
- "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" | ||
args: | ||
- -c | ||
- | | ||
set -x -e | ||
BUILD_ID_FULL=$BUILD_ID | ||
BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} | ||
ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ | ||
--user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ | ||
--extra-vars="@tools/cloud-build/daily-tests/tests/htc-slurm-v6.yml" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
--- | ||
|
||
# Extra-vars passed to the slurm integration test playbook for htc-slurm-v6. | ||
test_name: htc-slurm-v6 | ||
# Quoted, like the other templated values below, so the Jinja expression is | ||
# always read as a plain string. | ||
deployment_name: "htcv6{{ build }}" | ||
zone: us-central1-c | ||
workspace: /workspace | ||
blueprint_yaml: "{{ workspace }}/community/examples/htc-slurm-v6.yaml" | ||
network: "{{ deployment_name }}-net" | ||
post_deploy_tests: | ||
- test-validation/test-mounts.yml | ||
- test-validation/test-partitions.yml | ||
custom_vars: | ||
partitions: | ||
- compute | ||
- lowcost | ||
mounts: | ||
- /home | ||
- /projects | ||
- /scratch |