Skip to content

Commit

Permalink
Add local ssd RAID0 startup script
Browse files Browse the repository at this point in the history
  • Loading branch information
alyssa-sm committed Jul 31, 2024
1 parent 82f4a01 commit da5fa89
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 3 deletions.
3 changes: 2 additions & 1 deletion modules/scripts/startup-script/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ limitations under the License.
| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 0.14.0 |
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3 |
| <a name="requirement_google"></a> [google](#requirement\_google) | >= 3.83 |
| <a name="requirement_local"></a> [local](#requirement\_local) | >= 2.0.0 |
| <a name="requirement_random"></a> [random](#requirement\_random) | ~> 3.0 |
Expand Down Expand Up @@ -298,6 +298,7 @@ No modules.
| <a name="input_install_docker"></a> [install\_docker](#input\_install\_docker) | Install Docker command line tool and daemon. | `bool` | `false` | no |
| <a name="input_install_stackdriver_agent"></a> [install\_stackdriver\_agent](#input\_install\_stackdriver\_agent) | Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance. | `bool` | `false` | no |
| <a name="input_labels"></a> [labels](#input\_labels) | Labels for the created GCS bucket. Key-value pairs. | `map(string)` | n/a | yes |
| <a name="input_local_ssd_filesystem"></a> [local\_ssd\_filesystem](#input\_local\_ssd\_filesystem) | Create and mount a single filesystem from all local SSD scratch disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path. | <pre>object({<br> fs_type = optional(string, "ext4")<br> mountpoint = optional(string, "")<br> })</pre> | <pre>{<br> "fs_type": "ext4",<br> "mountpoint": ""<br>}</pre> | no |
| <a name="input_prepend_ansible_installer"></a> [prepend\_ansible\_installer](#input\_prepend\_ansible\_installer) | DEPRECATED. Use `install_ansible=false` to prevent ansible installation. | `bool` | `null` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes |
| <a name="input_region"></a> [region](#input\_region) | The region to deploy to | `string` | n/a | yes |
Expand Down
76 changes: 76 additions & 0 deletions modules/scripts/startup-script/files/setup-raid.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# Assemble every local SSD found on this host into a single RAID0 array,
# create a filesystem on it, and mount it.
#
# Extra vars supplied by the startup-script runner (both optional here):
#   fs_type    - filesystem type to create (falls back to ext4)
#   mountpoint - absolute path to mount the array at (falls back to
#                /mnt/<raid_name> when undefined or empty)
- name: Configure local SSDs
  become: true
  hosts: localhost
  vars:
    raid_name: localssd
    array_dev: /dev/md/{{ raid_name }}
    # The runner passes the filesystem type as "fs_type"; map it onto the
    # name used by the tasks below so the setting is not silently ignored.
    fstype: '{{ fs_type | default("ext4", true) }}'
    interface: nvme
    mode: '0755'
    # The second argument to default() makes an empty "mountpoint" fall back
    # too; without it an empty string would be used as the mount path.
    mount_path: '{{ mountpoint | default("/mnt/" + raid_name, true) }}'
  tasks:
    - name: Get local SSD devices
      ansible.builtin.find:
        file_type: link
        path: /dev/disk/by-id
        patterns: google-local-{{ "nvme-" if interface == "nvme" else "" }}ssd-*
      register: local_ssd_devices

    - name: Exit if zero local ssd found
      ansible.builtin.meta: end_play
      when: local_ssd_devices.files | length == 0

    - name: Install mdadm
      ansible.builtin.package:
        name: mdadm
        state: present

    # Build the array from exactly the devices discovered above, so the
    # member list always agrees with the "interface" setting. mdadm refuses
    # to create a single-member array unless --force is given.
    - name: Create RAID array
      ansible.builtin.command:
        cmd: >-
          mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any
          --level=0 --raid-devices={{ ssd_paths | length }}
          {{ ssd_paths | join(" ") }}
          {{ "--force" if ssd_paths | length == 1 else "" }}
        creates: "{{ array_dev }}"
      vars:
        ssd_paths: '{{ local_ssd_devices.files | map(attribute="path") | sort }}'

    - name: Format filesystem
      community.general.filesystem:
        fstype: "{{ fstype }}"
        device: "{{ array_dev }}"
        opts: '{{ "-m 0" if fstype == "ext4" else "" }}'

    - name: Mount RAID array
      ansible.posix.mount:
        src: "{{ array_dev }}"
        path: "{{ mount_path }}"
        fstype: "{{ fstype }}"
        # the nofail option is critical as it enables the boot process to complete on machines
        # that were powered off and had local SSD contents discarded; without this option
        # VMs may fail to join the network
        opts: discard,defaults,nofail
        state: mounted

    - name: Set mount permissions
      ansible.builtin.file:
        path: "{{ mount_path }}"
        state: directory
        mode: "{{ mode }}"
15 changes: 14 additions & 1 deletion modules/scripts/startup-script/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,20 @@ locals {
},
]

# Ansible runner that builds, formats and mounts a RAID0 array from the local
# SSDs. It is only added when a mountpoint has been set: the variable is a
# typed, non-nullable object that always carries fs_type and mountpoint, so
# comparing it with {} can never detect the "disabled" state. The test below
# is the same one used to decide whether Ansible must be installed.
raid_setup = can(coalesce(var.local_ssd_filesystem.mountpoint)) ? [
  {
    type        = "ansible-local"
    destination = "setup-raid.yml"
    content     = file("${path.module}/files/setup-raid.yml")
    args = join(" ", [
      "-e mountpoint=${var.local_ssd_filesystem.mountpoint}",
      "-e fs_type=${var.local_ssd_filesystem.fs_type}",
    ])
  },
] : []

supplied_ansible_runners = anytrue([for r in var.runners : r.type == "ansible-local"])
has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.install_docker])
has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.install_docker, can(coalesce(var.local_ssd_filesystem.mountpoint))])
install_ansible = coalesce(var.install_ansible, local.has_ansible_runners)
ansible_installer = local.install_ansible ? [{
type = "shell"
Expand All @@ -122,6 +134,7 @@ locals {
local.ansible_installer,
local.configure_ssh_runners,
local.docker_runner,
local.raid_setup,
var.runners
)

Expand Down
25 changes: 25 additions & 0 deletions modules/scripts/startup-script/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,31 @@ variable "install_docker" {
nullable = false
}

# Settings for the optional local SSD filesystem.
#   fs_type    - filesystem type to create; must not be empty.
#   mountpoint - absolute path to mount at; the empty string leaves the
#                feature disabled.
variable "local_ssd_filesystem" {
  description = "Create and mount a single filesystem from all local SSD scratch disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path."
  nullable    = false

  type = object({
    fs_type    = optional(string, "ext4")
    mountpoint = optional(string, "")
  })

  default = {
    fs_type    = "ext4"
    mountpoint = ""
  }

  # Either the feature is off (empty mountpoint) or the path is absolute.
  validation {
    condition = anytrue([
      var.local_ssd_filesystem.mountpoint == "",
      startswith(var.local_ssd_filesystem.mountpoint, "/"),
    ])
    error_message = "To enable local SSD filesystems, var.local_ssd_filesystem.mountpoint must be set to an absolute path to a mountpoint."
  }

  # coalesce() fails on null or empty input, so this rejects a blank fs_type.
  validation {
    condition     = can(coalesce(var.local_ssd_filesystem.fs_type))
    error_message = "var.local_ssd_filesystem.fs_type must be set to a filesystem supported by the Linux distribution."
  }
}

variable "install_cloud_ops_agent" {
description = "Warning: Consider using `install_stackdriver_agent` for better performance. Run Google Ops Agent installation script if set to true."
type = bool
Expand Down
2 changes: 1 addition & 1 deletion modules/scripts/startup-script/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ terraform {
module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.36.0"
}

required_version = ">= 0.14.0"
required_version = ">= 1.3"
}

0 comments on commit da5fa89

Please sign in to comment.