Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SlurmGCP v6 example of htc-slurm blueprint #2348

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions community/examples/htc-slurm-v6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Copyright 2024 Google LLC
# Copyright (C) SchedMD LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# This blueprint provisions a cluster using the Slurm scheduler configured to
# efficiently run many short duration, loosely-coupled (non-MPI) jobs. See also:
# https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md
# https://slurm.schedmd.com/high_throughput.html

blueprint_name: htc-slurm-v6

vars:
  # GCP project that hosts the deployment; fill this in before deploying.
  project_id: ## Set GCP Project ID Here ##
  deployment_name: htc-slurm-v6
  region: us-west4
  zone: us-west4-c
  # Shared switch read by the login and controller modules. Leave it at false
  # to keep public IPs on those nodes for easier SSH access; set it to true to
  # turn the public IPs off.
  disable_public_ips: false

# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md

# Every module of this blueprint is declared in the single group "primary".
deployment_groups:
- group: primary
modules:
# Source is an embedded module, denoted by "modules/*" without ./, ../, /
# as a prefix. To refer to a local or community module, prefix with ./, ../ or /
# Example - ./modules/network/pre-existing-vpc
- id: network
  # Embedded VPC module; later modules reference it through `use`.
  source: modules/network/vpc

# Filestore share mounted at /home.
- id: homefs
  source: modules/file-system/filestore
  use:
  - network
  settings:
    local_mount: /home

# Filestore share mounted at /projects (HIGH_SCALE_SSD tier, 10240 GB).
- id: projectsfs
  source: modules/file-system/filestore
  use:
  - network
  settings:
    local_mount: /projects
    filestore_tier: HIGH_SCALE_SSD
    size_gb: 10240

# This file system has an associated license cost.
# https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud
- id: scratchfs
  source: community/modules/file-system/DDN-EXAScaler
  use:
  - network
  settings:
    # Mount point of the DDN EXAScaler file system.
    local_mount: /scratch

# The compute partition is designed for performance.
# Use:
# `srun -N 4 -p compute <<Command>>` for any node in the partition.
# `srun -N 4 -p compute --mincpus 30 <<Command>>` for node group c2s60.
- id: compute_nodeset_c2s60
  source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
  use:
  - network
  settings:
    name: c2s60
    # No machine_type is given here, so whatever the nodeset module defaults
    # to is used for this nodeset.
    enable_placement: false
    bandwidth_tier: gvnic_enabled
    node_count_dynamic_max: 200

# Second nodeset of the compute partition, built from c2-standard-30 machines.
- id: compute_nodeset_c2s30
  source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
  use: [network]
  settings:
    # Explicit short name, as on every other nodeset in this blueprint, so the
    # nodeset name does not depend on how the module derives one from its id.
    name: c2s30
    node_count_dynamic_max: 200
    machine_type: c2-standard-30
    bandwidth_tier: gvnic_enabled
    enable_placement: false

# Partition "compute": both c2 nodesets plus the three shared file systems.
- id: compute_partition
  source: community/modules/compute/schedmd-slurm-gcp-v6-partition
  use:
  - homefs
  - scratchfs
  - projectsfs
  - compute_nodeset_c2s60
  - compute_nodeset_c2s30
  settings:
    exclusive: false
    partition_name: compute

# The lowcost partition is designed to run at a lower cost and without additional quota.
# Use:
# `srun -N 4 <<Command>>` for any node in the partition.
# `srun -N 4 --mincpus 2 <<Command>>` for node group n2s4.
# Smaller of the two lowcost nodesets: up to 10 dynamic n2-standard-2 nodes.
- id: low_cost_nodeset_n2s2
  source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
  use:
  - network
  settings:
    name: n2s2
    machine_type: n2-standard-2
    enable_placement: false
    bandwidth_tier: gvnic_enabled
    node_count_dynamic_max: 10

# Larger of the two lowcost nodesets: up to 10 dynamic n2-standard-4 nodes.
- id: low_cost_nodeset_n2s4
  source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
  use:
  - network
  settings:
    name: n2s4
    machine_type: n2-standard-4
    enable_placement: false
    bandwidth_tier: gvnic_enabled
    node_count_dynamic_max: 10

# Partition "lowcost": both n2 nodesets plus the three shared file systems.
- id: low_cost_partition
  source: community/modules/compute/schedmd-slurm-gcp-v6-partition
  use:
  - homefs
  - scratchfs
  - projectsfs
  - low_cost_nodeset_n2s2
  - low_cost_nodeset_n2s4
  settings:
    partition_name: lowcost
    # Marked as the default partition, which is why the usage examples for it
    # omit `-p`.
    is_default: true
    exclusive: false

- id: slurm_login
  source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
  use:
  - network
  settings:
    machine_type: n2-standard-4
    name_prefix: login
    # Follows the blueprint-wide vars.disable_public_ips switch.
    disable_login_public_ips: $(vars.disable_public_ips)

# Controller wired to the network, all three file systems, both partitions and
# the login module.
- id: slurm_controller
  source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
  use:
  - network
  - homefs
  - scratchfs
  - projectsfs
  - low_cost_partition
  - compute_partition
  - slurm_login
  settings:
    machine_type: c2-standard-8
    # Follows the blueprint-wide vars.disable_public_ips switch.
    disable_controller_public_ips: $(vars.disable_public_ips)
    # HTC-specific templates for slurm.conf and slurmdbd.conf.
    slurmdbd_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurmdbd.conf.tpl
    slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/htc-slurm.conf.tpl

# Monitoring dashboard; its `instructions` output is exposed by the blueprint.
- id: hpc_dashboard
  source: modules/monitoring/dashboard
  outputs:
  - instructions
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# slurm.conf
# https://slurm.schedmd.com/high_throughput.html
#
# Settings outside the "DO NOT MODIFY" banners are the hand-tuned part of
# this template.

# Process tracking, PID file locations and task plugins.
ProctrackType=proctrack/cgroup
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmdPidFile=/var/run/slurm/slurmd.pid
TaskPlugin=task/affinity,task/cgroup
# Limits on array size, job count and node count, plus the minimum job age.
MaxArraySize=10001
MaxJobCount=500000
MaxNodeCount=100000
MinJobAge=60
nick-stroud marked this conversation as resolved.
Show resolved Hide resolved

#
#
# SCHEDULING
# Backfill scheduler with the cons_tres select plugin (CR_Core_Memory).
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core_Memory

#
#
# LOGGING AND ACCOUNTING
# Both daemons log at the "error" level.
SlurmctldDebug=error
SlurmdDebug=error

#
#
# TIMERS
# NOTE(review): unit is presumably seconds - confirm in the slurm.conf docs.
MessageTimeout=60

# NOTE(review): the brace-delimited names in the section below look like
# placeholders that are filled in when this template is rendered - confirm
# with the slurm-gcp tooling before changing any of them.
################################################################################
# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv #
################################################################################

SlurmctldHost={control_host}({control_addr})

AuthType=auth/munge
AuthInfo=cred_expire=120
AuthAltTypes=auth/jwt
CredType=cred/munge
MpiDefault={mpi_default}
ReturnToService=2
SlurmctldPort={control_host_port}
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
StateSaveLocation={state_save}

#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={control_host}
ClusterName={name}
SlurmctldLogFile={slurmlog}/slurmctld.log
SlurmdLogFile={slurmlog}/slurmd-%n.log

#
#
# GENERATED CLOUD CONFIGURATIONS
include cloud.conf

################################################################################
# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ #
################################################################################

# Scheduler tuning options, given as a single comma-separated list. The
# meaning of each option is documented under SchedulerParameters in the
# slurm.conf man page.
SchedulerParameters=defer,salloc_wait_nodes,batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# slurmdbd.conf
# https://slurm.schedmd.com/slurmdbd.conf.html
#
# Settings outside the "DO NOT MODIFY" banners are the hand-tuned part of
# this template.

# The database daemon logs at the "info" level and writes its PID file here.
DebugLevel=info
PidFile=/var/run/slurm/slurmdbd.pid

# https://slurm.schedmd.com/slurmdbd.conf.html#OPT_CommitDelay
CommitDelay=1

# NOTE(review): the brace-delimited names in the section below look like
# placeholders that are filled in when this template is rendered - confirm
# with the slurm-gcp tooling before changing any of them. StoragePass would
# then carry the database password, so treat the rendered file as sensitive.
################################################################################
# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv #
################################################################################

AuthType=auth/munge
AuthAltTypes=auth/jwt
AuthAltParameters=jwt_key={state_save}/jwt_hs256.key

DbdHost={control_host}

LogFile={slurmlog}/slurmdbd.log

SlurmUser=slurm

StorageLoc={db_name}

StorageType=accounting_storage/mysql
StorageHost={db_host}
StoragePort={db_port}
StorageUser={db_user}
StoragePass={db_pass}

################################################################################
# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ #
################################################################################
13 changes: 13 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /"
* [ml-gke](#ml-gkeyaml--) ![community-badge] ![experimental-badge]
* [storage-gke](#storage-gkeyaml--) ![community-badge] ![experimental-badge]
* [htc-slurm.yaml](#htc-slurmyaml--) ![community-badge] ![experimental-badge]
* [htc-slurm-v6.yaml](#htc-slurm-v6yaml--) ![community-badge] ![experimental-badge]
* [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge]
* [fsi-montecarlo-on-batch.yaml](#fsi-montecarlo-on-batchyaml-) ![community-badge] ![experimental-badge]
* [tutorial-starccm-slurm.yaml](#tutorial-starccm-slurmyaml--) ![community-badge] ![experimental-badge]
Expand Down Expand Up @@ -1213,6 +1214,18 @@ For more information see:

[htc-slurm.yaml]: ../community/examples/htc-slurm.yaml

### [htc-slurm-v6.yaml] ![community-badge] ![experimental-badge]

This blueprint provisions a cluster using the Slurm scheduler in a configuration
tuned for the execution of many short-duration, loosely-coupled (non-MPI) jobs.

For more information see:

* [Slurm on Google Cloud High Throughput documentation](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/htc.md)
* [General Slurm High Throughput documentation](https://slurm.schedmd.com/high_throughput.html)

[htc-slurm-v6.yaml]: ../community/examples/htc-slurm-v6.yaml

### [fsi-montecarlo-on-batch.yaml](../community/examples/fsi-montecarlo-on-batch.yaml) ![community-badge] ![experimental-badge]

## Monte Carlo Simulations for Value at Risk
Expand Down
65 changes: 65 additions & 0 deletions tools/cloud-build/daily-tests/builds/htc-slurm-v6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# One m.<module> tag per toolkit module used by the htc-slurm-v6 blueprint,
# plus slurm6.
# NOTE(review): presumably used to select or filter daily tests - confirm.
tags:
- m.filestore
- m.DDN-EXAScaler
- m.schedmd-slurm-gcp-v6-controller
- m.schedmd-slurm-gcp-v6-login
- m.schedmd-slurm-gcp-v6-nodeset
- m.schedmd-slurm-gcp-v6-partition
- m.vpc
- m.dashboard
- slurm6

# Upper bound for the whole build.
timeout: 14400s # 4hr
steps:
# Runs `make` in /workspace inside the golang image as a simple build test.
- id: build_ghpc
  name: 'golang:bullseye'
  waitFor:
  - '-'
  entrypoint: /bin/bash
  args:
  - -c
  - |
    cd /workspace
    make
# Starts the hpc-toolkit-builder image and only echoes a message.
# NOTE(review): presumably here so the image is already pulled when the test
# step needs it - confirm.
- id: fetch_builder
  name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
  waitFor:
  - '-'
  entrypoint: /bin/bash
  args:
  - -c
  - echo "done fetching builder"

# Test htc-slurm-v6 deployment.
- id: htc-slurm-v6
  name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/hpc-toolkit-builder
  # Starts only after both preparatory steps have finished.
  waitFor:
  - fetch_builder
  - build_ghpc
  entrypoint: /bin/bash
  env:
  - 'ANSIBLE_HOST_KEY_CHECKING=false'
  - 'ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg'
  args:
  - -c
  # Runs the Slurm integration-test playbook with the first six characters of
  # the build ID and the htc-slurm-v6 test variables.
  - |
    set -x -e
    BUILD_ID_FULL=$BUILD_ID
    BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}

    ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \
    --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
    --extra-vars="@tools/cloud-build/daily-tests/tests/htc-slurm-v6.yml"
Loading
Loading