Skip to content

Commit

Permalink
Add RL9 cuda build variant (#428)
Browse files Browse the repository at this point in the history
* determine cuda distro automatically

* fix typo in CUDA samples

* make facts available for cuda

* add RL9 cuda build variant

* fix typo in build definitions

* set packer build volume sizes depending on build variant

* fix volume size definition

* fix cuda version to work around issue with 12-6-0-1

* don't fail all builds if one fails

* bump CUDA builder disk size (build ran out of space)

* download cuda image to /mnt on gh runner

* download cuda image to /mnt on gh runner

* fix fatimage.yml mnt permissions

* Update main.yml

* switch to open nvidia drivers

* bump CI images

* make packer build volume-backed optional again

---------

Co-authored-by: bertiethorpe <bertie443@gmail.com>
Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
  • Loading branch information
3 people authored and MaxBed4d committed Oct 15, 2024
1 parent 663e6cb commit 9e53ce6
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 34 deletions.
16 changes: 11 additions & 5 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,20 @@ jobs:
name: openstack-imagebuild
runs-on: ubuntu-22.04
strategy:
matrix:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9+OFED, RL9+CUDA versions
os_version:
- RL8
- RL9
build:
- openstack.openhpc
- openstack.openhpc-ofed
- openstack.openhpc-cuda
exclude:
- os_version: RL8
build: openstack.openhpc-ofed
- os_version: RL8
build: openstack.openhpc-cuda
- os_version: RL9
build: openstack.openhpc
env:
Expand Down Expand Up @@ -81,7 +85,9 @@ jobs:
- name: Download image
run: |
. venv/bin/activate
openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
sudo mkdir /mnt/images
sudo chmod 777 /mnt/images
openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
Expand All @@ -95,13 +101,13 @@ jobs:
run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'

- name: mount qcow2 file
run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'

- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@0.17.0
with:
scan-type: fs
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
scan-ref: "${{ steps.manifest.outputs.image-name }}"
scanners: "vuln"
format: sarif
output: "${{ steps.manifest.outputs.image-name }}.sarif"
Expand All @@ -117,7 +123,7 @@ jobs:
uses: aquasecurity/trivy-action@0.16.1
with:
scan-type: fs
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
scan-ref: "${{ steps.manifest.outputs.image-name }}"
scanners: "vuln"
format: table
exit-code: '1'
Expand Down
2 changes: 1 addition & 1 deletion ansible/extras.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
- name: Setup CUDA
hosts: cuda
become: yes
gather_facts: no
gather_facts: yes
tags: cuda
tasks:
- import_role:
Expand Down
7 changes: 4 additions & 3 deletions ansible/roles/cuda/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
cuda_distro: rhel8
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
cuda_driver_stream: default
cuda_package_version: 'latest'
cuda_packages:
- cuda
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
- nvidia-gds
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
cuda_samples_programs:
Expand Down
13 changes: 2 additions & 11 deletions ansible/roles/cuda/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,13 @@
failed_when: false
register: _cuda_driver_module_enabled

- name: List nvidia driver dnf module stream versions
shell:
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
# Output of interest from command is something like (some whitespace removed):
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
changed_when: false
register: _cuda_driver_module_streams
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"

- name: Enable nvidia driver module
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
register: _cuda_driver_module_enable
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"

- name: Install nvidia drivers # TODO: make removal possible?
- name: Install nvidia drivers
ansible.builtin.command: dnf module install -y nvidia-driver
register: _cuda_driver_install
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
Expand Down
3 changes: 0 additions & 3 deletions environments/.stackhpc/ARCUS.pkrvars.hcl
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
flavor = "vm.ska.cpu.general.small"
use_blockstorage_volume = true
volume_size = 15 # GB
image_disk_format = "qcow2"
networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60)
ssh_keypair_name = "slurm-app-ci"
ssh_private_key_file = "~/.ssh/id_rsa"
Expand Down
3 changes: 0 additions & 3 deletions environments/.stackhpc/LEAFCLOUD.pkrvars.hcl
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
flavor = "ec1.large"
use_blockstorage_volume = true
volume_size = 15 # GB
volume_type = "unencrypted"
image_disk_format = "qcow2"
networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci
ssh_keypair_name = "slurm-app-ci"
ssh_private_key_file = "~/.ssh/id_rsa"
Expand Down
4 changes: 2 additions & 2 deletions environments/.stackhpc/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ variable "cluster_image" {
type = map(string)
default = {
# https://github.com/stackhpc/ansible-slurm-appliance/pull/413
RL8: "openhpc-RL8-240813-1317-1b370a36"
RL9: "openhpc-ofed-RL9-240813-1317-1b370a36"
RL8: "openhpc-RL8-240904-1509-1687368f"
RL9: "openhpc-ofed-RL9-240904-1509-1687368f"
}
}

Expand Down
23 changes: 17 additions & 6 deletions packer/openstack.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ variable "manifest_output_path" {

variable "use_blockstorage_volume" {
type = bool
default = false
default = true
}

variable "volume_type" {
Expand All @@ -129,13 +129,18 @@ variable "volume_type" {
}

variable "volume_size" {
type = number
default = null # When not specified use the size of the builder instance root disk
type = map(number)
default = {
# fat image builds, GB:
openhpc = 15
openhpc-ofed = 15
openhpc-cuda = 30
}
}

variable "image_disk_format" {
type = string
default = null # When not specified use the image default
default = "qcow2"
}

variable "metadata" {
Expand All @@ -150,6 +155,7 @@ variable "groups" {
# fat image builds:
openhpc = ["control", "compute", "login"]
openhpc-ofed = ["control", "compute", "login", "ofed"]
openhpc-cuda = ["control", "compute", "login", "ofed", "cuda"]
}
}

Expand All @@ -158,11 +164,11 @@ source "openstack" "openhpc" {
flavor = var.flavor
use_blockstorage_volume = var.use_blockstorage_volume
volume_type = var.volume_type
volume_size = var.volume_size[source.name]
metadata = var.metadata
networks = var.networks
floating_ip_network = var.floating_ip_network
security_groups = var.security_groups
volume_size = var.volume_size

# Input image:
source_image = "${var.source_image[var.os_version]}"
Expand All @@ -178,7 +184,7 @@ source "openstack" "openhpc" {
ssh_bastion_private_key_file = var.ssh_bastion_private_key_file

# Output image:
image_disk_format = var.image_disk_format
image_disk_format = "qcow2"
image_visibility = var.image_visibility
image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}"
}
Expand All @@ -195,6 +201,11 @@ build {
name = "openhpc-ofed"
}

# CUDA fat image:
source "source.openstack.openhpc" {
name = "openhpc-cuda"
}

# Extended site-specific image, built on fat image:
source "source.openstack.openhpc" {
name = "openhpc-extra"
Expand Down

0 comments on commit 9e53ce6

Please sign in to comment.