Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Slurm on GKE - Guide #864

Merged
50 changes: 50 additions & 0 deletions modules/slurm-cluster/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


locals {
  # Directory that holds the manifest templates: the "manifest-templates"
  # folder shipped with this module unless var.templates_path is set, in
  # which case that path is used after pathexpand.
  wl_templates_path = (
    var.templates_path != null
    ? pathexpand(var.templates_path)
    : "${path.module}/manifest-templates"
  )

  # Full path of every file directly inside that directory whose name starts
  # with a digit and ends in "yml".
  wl_templates = [
    for tpl in fileset(local.wl_templates_path, "[0-9]*yml") :
    "${local.wl_templates_path}/${tpl}"
  ]
}

# Optionally creates the namespace named by var.namespace.
resource "kubernetes_namespace" "default" {
  # One instance when var.namespace_create is true, none otherwise.
  count = var.namespace_create ? 1 : 0

  metadata {
    name = var.namespace
  }
}

# Applies every rendered manifest template to the cluster.
resource "kubernetes_manifest" "default" {
  # One instance per template path listed in local.wl_templates.
  for_each = toset(local.wl_templates)

  # Render the template with the namespace and cluster configuration, then
  # decode the resulting YAML into the manifest object.
  manifest = yamldecode(templatefile(each.value, {
    namespace      = var.namespace
    cluster_config = var.cluster_config
  }))

  # The templates receive the namespace only by name (var.namespace), which
  # gives Terraform no implicit dependency on the namespace resource. Declare
  # it explicitly so that, when the namespace is created by this module, it
  # exists before any manifest is applied. With namespace_create = false the
  # resource has zero instances and this is a no-op.
  depends_on = [kubernetes_namespace.default]

  timeouts {
    create = "30m"
  }

  field_manager {
    # Override conflicts with other field managers instead of failing.
    force_conflicts = true
  }
}
123 changes: 123 additions & 0 deletions modules/slurm-cluster/manifest-templates/00-configmap-slurm-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# ConfigMap carrying the Slurm configuration files: slurm.conf, cloud.conf
# (included from slurm.conf), cloud_gres.conf, gres.conf and cgroup.conf.
# This file is rendered as a Terraform template before being decoded as YAML;
# the namespace value is quoted so that a name that looks like a number or a
# boolean is still decoded as a string.
apiVersion: v1
kind: ConfigMap
metadata:
  name: slurm-conf-configmap
  namespace: "${namespace}"
data:
  slurm.conf: |
    # slurm.conf
    #
    # See the slurm.conf man page for more information.
    #
    ClusterName=linux
    SlurmctldHost=slurmctld-0
    #
    SlurmUser=slurm
    SlurmctldPort=6820-6830
    SlurmdPort=6818
    AuthType=auth/munge
    StateSaveLocation=/var/spool/slurmctld
    SlurmdSpoolDir=/var/spool/slurmd
    SwitchType=switch/none
    MpiDefault=pmix
    SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
    SlurmdPidFile=/var/run/slurmd/slurmd.pid
    ProctrackType=proctrack/linuxproc
    ReturnToService=2
    #
    # TIMERS
    SlurmctldTimeout=300
    SlurmdTimeout=30
    InactiveLimit=0
    MinJobAge=300
    KillWait=30
    Waittime=0
    #
    # SCHEDULING
    SchedulerType=sched/backfill
    SelectType=select/cons_tres
    SelectTypeParameters=CR_CPU_Memory
    #
    # LOGGING
    SlurmctldDebug=3
    SlurmctldLogFile=/var/log/slurm/slurmctld.log
    SlurmdDebug=3
    SlurmdLogFile=/var/log/slurm/slurmd.log
    JobCompType=jobcomp/filetxt
    JobCompLoc=/var/log/slurm/jobcomp.log
    #
    # ACCOUNTING
    JobAcctGatherType=jobacct_gather/linux
    JobAcctGatherFrequency=30
    #
    AccountingStorageType=accounting_storage/slurmdbd
    AccountingStorageHost=slurmdbd
    AccountingStoragePort=6819
    #
    # SlurmctldParameters is set exactly once, here. It used to be set again
    # in cloud.conf, and a repeated key replaces the earlier value, which
    # silently dropped cloud_reg_addrs.
    SlurmctldParameters=cloud_reg_addrs,enable_configless

    # CLOUD CONFIGURATIONS
    MaxNodeCount=64000
    include cloud.conf
  cloud.conf: |
    PrivateData=cloud
    ## GRES
    GresTypes=gpu
    AccountingStorageTRES=gres/gpu
    DebugFlags=Gres
    TreeWidth=128

    # NODES
    NodeName=DEFAULT State=UNKNOWN RealMemory=15000 CPUs=4 CoresPerSocket=2 ThreadsPerCore=2 Gres=gpu:1
    NodeName=slurmd-[0-39] State=CLOUD Gres=gpu:1
    NodeSet=slurmdnodeset Nodes=slurmd-[0-39]

    NodeName=DEFAULT State=UNKNOWN RealMemory=30000 CPUs=8 CoresPerSocket=2 ThreadsPerCore=2 Gres=gpu:2
    NodeName=slurmd1-[0-39] State=CLOUD Gres=gpu:2
    NodeSet=slurmd1nodeset Nodes=slurmd1-[0-39]

    # PARTITIONS
    PartitionName=all Default=yes Nodes=ALL MaxTime=INFINITE State=UP

    PropagateResourceLimitsExcept=MEMLOCK

    PartitionName=1gpunodes Nodes=slurmdnodeset State=UP DefMemPerCPU=7007 SuspendTime=300 Oversubscribe=Exclusive PowerDownOnIdle=YES ResumeTimeout=300 SuspendTimeout=120
    PartitionName=2gpunodes Nodes=slurmd1nodeset State=UP DefMemPerCPU=7007 SuspendTime=300 Oversubscribe=Exclusive PowerDownOnIdle=YES ResumeTimeout=300 SuspendTimeout=120

  cloud_gres.conf: |
    NodeName=slurmd-[0-39] Name=gpu File=/dev/nvidia0
    NodeName=slurmd1-[0-39] Name=gpu File=/dev/nvidia[0-1]
  gres.conf: |
    NodeName=slurmd-[0-39] Name=gpu File=/dev/nvidia0
    NodeName=slurmd1-[0-39] Name=gpu File=/dev/nvidia[0-1]
  cgroup.conf: |
    ###
    #
    # Slurm cgroup support configuration file
    #
    # See man slurm.conf and man cgroup.conf for further
    # information on cgroup configuration parameters
    #--
    ConstrainCores=yes
    ConstrainDevices=yes
    ConstrainRAMSpace=yes
    ConstrainSwapSpace=yes
    IgnoreSystemd=yes

Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ConfigMap carrying slurmdbd.conf. The database host and user are filled in
# from the cluster_config.database template variables when the file is
# rendered. The namespace value is quoted so that a name that looks like a
# number or a boolean is still decoded as a string.
# NOTE(review): no StoragePass is set in this file; presumably the database
# password is supplied to slurmdbd some other way - confirm.
apiVersion: v1
kind: ConfigMap
metadata:
  name: slurmdbd-conf-configmap
  namespace: "${namespace}"
data:
  slurmdbd.conf: |
    #
    # Example slurmdbd.conf file.
    #
    # See the slurmdbd.conf man page for more information.
    #
    # Authentication info
    AuthType=auth/munge
    #
    # slurmDBD info
    DbdAddr=slurmdbd
    DbdHost=slurmdbd
    SlurmUser=slurm
    DebugLevel=4
    LogFile=/var/log/slurm/slurmdbd.log
    PidFile=/var/run/slurmdbd/slurmdbd.pid
    #
    # Database info
    StorageType=accounting_storage/mysql
    StorageHost=${cluster_config.database.host}
    StorageUser=${cluster_config.database.user}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Secret holding the database password.
# Values under `data` must be base64-encoded. This template inserts
# cluster_config.database.password verbatim and does not encode it, so the
# caller has to pass an already base64-encoded value.
# Templated scalars are quoted so that a value that happens to look like a
# number or a boolean is still decoded as a string.
apiVersion: v1
kind: Secret
metadata:
  name: database-auth-secret
  namespace: "${namespace}"
data:
  password: "${cluster_config.database.password}"
29 changes: 29 additions & 0 deletions modules/slurm-cluster/manifest-templates/00-secret-munge-key.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Secret holding the munge key.
# The key from cluster_config.munge.key is base64-encoded by the template
# itself, as required for values under `data`, so callers pass the raw key.
# Templated scalars are quoted so that an encoded value that happens to look
# like a number or a boolean is still decoded as a string.
apiVersion: v1
kind: Secret
metadata:
  name: munge-key-secret
  namespace: "${namespace}"
data:
  munge.key: "${base64encode(cluster_config.munge.key)}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# PersistentVolumeClaim for the storage named slurm-shared-storage.
# It requests a ReadWriteMany volume from the standard-rwx storage class,
# sized in GiB from cluster_config.storage.size_gb.
# Templated scalars are quoted so they are always decoded as strings.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: slurm-shared-storage
  namespace: "${namespace}"
spec:
  storageClassName: standard-rwx
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: "${cluster_config.storage.size_gb}Gi"
36 changes: 36 additions & 0 deletions modules/slurm-cluster/manifest-templates/01-pvc-var-lib-mysql.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# PersistentVolumeClaim named var-lib-mysql, labelled as the mysql component.
# It requests a ReadWriteOnce volume sized in GiB from
# cluster_config.database.storage_size_gb; no storage class is set here.
# Templated scalars are quoted so they are always decoded as strings.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    app.kubernetes.io/name: slurm
    app.kubernetes.io/component: mysql
  name: var-lib-mysql
  namespace: "${namespace}"
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "${cluster_config.database.storage_size_gb}Gi"
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# PersistentVolumeClaim named var-spool-slurmctld, labelled as the slurmctld
# component. It requests a fixed 100Mi ReadWriteOnce volume; no storage class
# is set here.
# The namespace value is quoted so that a name that looks like a number or a
# boolean is still decoded as a string.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    app.kubernetes.io/name: slurm
    app.kubernetes.io/component: slurmctld
  name: var-spool-slurmctld
  namespace: "${namespace}"
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi
Loading