Skip to content

Commit

Permalink
Add SlurmGCP v6 example of htc-slurm blueprint and integration test
Browse files Browse the repository at this point in the history
  • Loading branch information
harshthakkar01 committed Mar 13, 2024
1 parent 19e78f6 commit 896491d
Show file tree
Hide file tree
Showing 5 changed files with 364 additions and 0 deletions.
165 changes: 165 additions & 0 deletions community/examples/htc-slurm-v6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Copyright 2024 Google LLC
# Copyright (C) SchedMD LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# This blueprint provisions a cluster using the Slurm scheduler configured to
# efficiently run many short duration, loosely-coupled (non-MPI) jobs. See also:
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md
# https://slurm.schedmd.com/high_throughput.html

blueprint_name: htc-slurm-v6

vars:
  project_id:  ## Set GCP Project ID Here ##
  deployment_name: htc-slurm-v6
  region: us-west4
  zone: us-west4-c
  # By default, public IPs are set in the login and controller to allow easier
  # SSH access. To turn this behavior off, set this to true.
  disable_public_ips: false

# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md

deployment_groups:
- group: primary
  modules:
  # Source is an embedded module, denoted by "modules/*" without ./, ../, /
  # as a prefix. To refer to a local or community module, prefix with ./, ../ or /
  # Example - ./modules/network/pre-existing-vpc
  - id: network
    source: modules/network/vpc

  # Three shared file systems; every partition and the controller `use` all of
  # them, so they are mounted at /home, /projects and /scratch respectively.
  - id: homefs
    source: modules/file-system/filestore
    use: [network]
    settings:
      local_mount: /home

  - id: projectsfs
    source: modules/file-system/filestore
    use: [network]
    settings:
      filestore_tier: HIGH_SCALE_SSD
      size_gb: 10240
      local_mount: /projects

  # This file system has an associated license cost.
  # https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud
  - id: scratchfs
    source: community/modules/file-system/DDN-EXAScaler
    use: [network]
    settings:
      local_mount: /scratch

  # The compute partition is designed for performance.
  # Use:
  # `srun -N 4 -p compute <<Command>>` for any node in the partition.
  # `srun -N 4 -p compute --mincpus 30 <<Command>>` for node group c2s60.
  - id: compute_nodeset_c2s60
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      name: c2s60
      # machine_type is not set here, so the nodeset module's default applies
      # (presumably c2-standard-60, going by the nodeset name - confirm).
      node_count_dynamic_max: 200
      bandwidth_tier: gvnic_enabled
      enable_placement: false

  - id: compute_nodeset_c2s30
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      # Named explicitly, like every other nodeset in this blueprint, so node
      # names follow the same short <machine family><size> scheme.
      name: c2s30
      node_count_dynamic_max: 200
      machine_type: c2-standard-30
      bandwidth_tier: gvnic_enabled
      enable_placement: false

  - id: compute_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use:
    - homefs
    - scratchfs
    - projectsfs
    - compute_nodeset_c2s60
    - compute_nodeset_c2s30
    settings:
      partition_name: compute
      exclusive: false

  # The lowcost partition is designed to run at a lower cost and without
  # additional quota.
  # Use:
  # `srun -N 4 <<Command>>` for any node in the partition.
  # `srun -N 4 --mincpus 2 <<Command>>` for node group n2s4.
  - id: low_cost_nodeset_n2s2
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      name: n2s2
      machine_type: n2-standard-2
      node_count_dynamic_max: 10
      bandwidth_tier: gvnic_enabled
      enable_placement: false

  - id: low_cost_nodeset_n2s4
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      name: n2s4
      machine_type: n2-standard-4
      node_count_dynamic_max: 10
      bandwidth_tier: gvnic_enabled
      enable_placement: false

  - id: low_cost_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use:
    - homefs
    - scratchfs
    - projectsfs
    - low_cost_nodeset_n2s2
    - low_cost_nodeset_n2s4
    settings:
      # Default partition: this is why the lowcost srun examples above need no
      # `-p` flag while the compute examples pass `-p compute`.
      is_default: true
      partition_name: lowcost
      exclusive: false

  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
    use: [network]
    settings:
      name_prefix: login
      machine_type: n2-standard-4
      disable_login_public_ips: $(vars.disable_public_ips)

  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
    use:
    - network
    - homefs
    - scratchfs
    - projectsfs
    - low_cost_partition
    - compute_partition
    - slurm_login
    settings:
      machine_type: c2-standard-8
      disable_controller_public_ips: $(vars.disable_public_ips)
      # Point the controller at the HTC-specific slurm.conf / slurmdbd.conf
      # templates shipped with the controller module.
      slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl
      slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl

  - id: hpc_dashboard
    source: modules/monitoring/dashboard
    outputs: [instructions]
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# slurm.conf
# https://slurm.schedmd.com/high_throughput.html
#
# User-tunable portion of the slurm.conf template used by the HTC blueprint.
# The values below favour throughput of many short jobs over per-job
# responsiveness and verbose logging; see the guide linked above.

ProctrackType=proctrack/cgroup
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmdPidFile=/var/run/slurm/slurmd.pid
TaskPlugin=task/affinity,task/cgroup
# Job array task indices must be less than MaxArraySize, so 10001 permits
# indices 0 through 10000.
MaxArraySize=10001
# Upper bound on the number of job records slurmctld keeps at one time.
MaxJobCount=500000
MaxNodeCount=100000
# Seconds a finished job's record is kept before it may be purged; a short
# retention keeps the controller's job table small under heavy job churn.
MinJobAge=60

#
#
# SCHEDULING
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory

#
#
# LOGGING AND ACCOUNTING
# Log errors only, to keep logging overhead low.
SlurmctldDebug=error
SlurmdDebug=error

#
#
# TIMERS
# Seconds allowed for a round-trip message before it times out.
MessageTimeout=60

################################################################################
# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv #
################################################################################

# NOTE(review): the curly-brace fields in this section look like placeholders
# that are substituted when this .tpl file is rendered into the final
# slurm.conf - presumably by the slurm-gcp setup scripts; confirm there.
# Keep literal curly braces out of any comment added to this file in case the
# renderer treats them as fields.

SlurmctldHost={control_host}({control_addr})

AuthType=auth/munge
AuthInfo=cred_expire=120
AuthAltTypes=auth/jwt
CredType=cred/munge
MpiDefault={mpi_default}
ReturnToService=2
SlurmctldPort={control_host_port}
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
StateSaveLocation={state_save}

#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={control_host}
ClusterName={name}
SlurmctldLogFile={slurmlog}/slurmctld.log
SlurmdLogFile={slurmlog}/slurmd-%n.log

#
#
# GENERATED CLOUD CONFIGURATIONS
include cloud.conf

################################################################################
# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ #
################################################################################

# High-throughput scheduler tuning (see the SchedulerParameters entry in the
# slurm.conf man page for the authoritative definitions):
#   defer                        - do not try to schedule each job at submit
#                                  time; leave it to the periodic scheduler.
#   batch_sched_delay=20         - batch job scheduling may lag by up to 20 s.
#   bf_interval=300              - seconds between backfill iterations.
#   bf_yield_interval=1000000    - microseconds between backfill lock yields.
#   sched_min_interval=2000000   - microseconds, minimum gap between main
#                                  scheduling cycles.
#   sched_max_job_start=200      - cap on jobs started per main cycle.
#   partition_job_depth=500      - cap on jobs examined per partition per
#                                  main cycle.
SchedulerParameters=defer,salloc_wait_nodes,batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# slurmdbd.conf
# https://slurm.schedmd.com/slurmdbd.conf.html
#
# slurmdbd.conf template used by the HTC blueprint. Only the settings above
# the warning banner are meant to be edited.

DebugLevel=info
PidFile=/var/run/slurm/slurmdbd.pid

# https://slurm.schedmd.com/slurmdbd.conf.html#OPT_CommitDelay
# Commit accounting records to the database in batches (every 1 second)
# rather than per update. This speeds up inserts under high job turnover, at
# the cost of possibly losing the uncommitted records if slurmdbd exits
# abnormally (see the link above).
CommitDelay=1

################################################################################
# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv #
################################################################################

# NOTE(review): the curly-brace fields below look like placeholders filled in
# when this .tpl file is rendered - presumably by the slurm-gcp setup scripts;
# confirm there. Keep literal curly braces out of comments added to this file.

AuthType=auth/munge
AuthAltTypes=auth/jwt
AuthAltParameters=jwt_key={state_save}/jwt_hs256.key

DbdHost={control_host}

LogFile={slurmlog}/slurmdbd.log

SlurmUser=slurm

StorageLoc={db_name}

StorageType=accounting_storage/mysql
StorageHost={db_host}
StoragePort={db_port}
StorageUser={db_user}
StoragePass={db_pass}

################################################################################
# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ #
################################################################################
65 changes: 65 additions & 0 deletions tools/cloud-build/daily-tests/builds/htc-slurm-v6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# Cloud Build configuration for the daily htc-slurm-v6 integration test.
# The tags list the modules exercised by the test (m.<module>) plus the
# Slurm major version.
tags:
- m.filestore
- m.DDN-EXAScaler
- m.schedmd-slurm-gcp-v6-controller
- m.schedmd-slurm-gcp-v6-login
- m.schedmd-slurm-gcp-v6-nodeset
- m.schedmd-slurm-gcp-v6-partition
- m.vpc
- m.dashboard
- slurm6

timeout: 14400s  # 4hr
steps:
## Test simple golang build
# waitFor: ["-"] starts a step immediately, so the build and the builder-image
# fetch run in parallel.
- id: build_ghpc
  waitFor: ["-"]
  name: "golang:bullseye"
  entrypoint: /bin/bash
  args:
  - -c
  - |
    cd /workspace
    make
- id: fetch_builder
  waitFor: ["-"]
  name: >-
    us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
  entrypoint: /bin/bash
  args:
  - -c
  - echo "done fetching builder"

# Test htc-slurm deployment.
- id: htc-slurm-v6
  waitFor: ["fetch_builder", "build_ghpc"]
  name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
  entrypoint: /bin/bash
  env:
  - "ANSIBLE_HOST_KEY_CHECKING=false"
  - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
  args:
  - -c
  # `$$` escapes the dollar sign so Cloud Build passes `${...}` through to bash
  # instead of treating it as a build substitution; single-`$` names such as
  # $BUILD_ID and ${PROJECT_ID} are substituted by Cloud Build itself.
  - |
    set -x -e
    BUILD_ID_FULL=$BUILD_ID
    BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}
    ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \
      --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
      --extra-vars="@tools/cloud-build/daily-tests/tests/htc-slurm-v6.yml"
33 changes: 33 additions & 0 deletions tools/cloud-build/daily-tests/tests/htc-slurm-v6.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# Extra-vars consumed by the Slurm integration-test playbook for the
# htc-slurm-v6 example blueprint. `build` is supplied on the ansible-playbook
# command line by the Cloud Build step.
test_name: htc-slurm-v6
deployment_name: "htcv6{{ build }}"
# NOTE(review): the blueprint under test defaults to zone us-west4-c, while
# this test uses us-central1-c. Confirm the playbook overrides the blueprint's
# region/zone (or that the mismatch is intended) before relying on this value.
zone: us-central1-c
workspace: /workspace
blueprint_yaml: "{{ workspace }}/community/examples/htc-slurm-v6.yaml"
network: "{{ deployment_name }}-net"
post_deploy_tests:
- test-validation/test-mounts.yml
- test-validation/test-partitions.yml
# Expected partitions and mount points; these match the partition_name and
# local_mount values declared in the blueprint.
custom_vars:
  partitions:
  - compute
  - lowcost
  mounts:
  - /home
  - /projects
  - /scratch

0 comments on commit 896491d

Please sign in to comment.