Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RL9 cuda build variant #428

Merged
merged 17 commits on
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,20 @@ jobs:
name: openstack-imagebuild
runs-on: ubuntu-22.04
strategy:
matrix:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9+OFED, RL9+CUDA versions
os_version:
- RL8
- RL9
build:
- openstack.openhpc
- openstack.openhpc-ofed
- openstack.openhpc-cuda
exclude:
- os_version: RL8
build: openstack.openhpc-ofed
- os_version: RL8
build: openstack.openhpc-cuda
- os_version: RL9
build: openstack.openhpc
env:
Expand Down Expand Up @@ -81,7 +85,9 @@ jobs:
- name: Download image
run: |
. venv/bin/activate
openstack image save --file ${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}
sudo mkdir /mnt/images
sudo chmod 777 /mnt/images
openstack image save --file /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 ${{ steps.manifest.outputs.image-name }}

- name: Set up QEMU
uses: docker/setup-qemu-action@v3
Expand All @@ -95,13 +101,13 @@ jobs:
run: sudo mkdir -p './${{ steps.manifest.outputs.image-name }}'

- name: mount qcow2 file
run: sudo guestmount -a ${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'
run: sudo guestmount -a /mnt/images/${{ steps.manifest.outputs.image-name }}.qcow2 -i --ro -o allow_other './${{ steps.manifest.outputs.image-name }}'

- name: Run Trivy vulnerability scanner
uses: aquasecurity/trivy-action@0.17.0
with:
scan-type: fs
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
scan-ref: "${{ steps.manifest.outputs.image-name }}"
scanners: "vuln"
format: sarif
output: "${{ steps.manifest.outputs.image-name }}.sarif"
Expand All @@ -117,7 +123,7 @@ jobs:
uses: aquasecurity/trivy-action@0.16.1
with:
scan-type: fs
scan-ref: "./${{ steps.manifest.outputs.image-name }}"
scan-ref: "${{ steps.manifest.outputs.image-name }}"
scanners: "vuln"
format: table
exit-code: '1'
Expand Down
2 changes: 1 addition & 1 deletion ansible/extras.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
- name: Setup CUDA
hosts: cuda
become: yes
gather_facts: no
gather_facts: yes
tags: cuda
tasks:
- import_role:
Expand Down
7 changes: 4 additions & 3 deletions ansible/roles/cuda/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
cuda_distro: rhel8
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
cuda_driver_stream: default
cuda_package_version: 'latest'
cuda_packages:
- cuda
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
- nvidia-gds
# _cuda_version_tuple: # discovered from installed package e.g. ('12', '1', '0')
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ cuda_version_tuple[1] }}"
cuda_version_short: "{{ _cuda_version_tuple[0] }}.{{ _cuda_version_tuple[1] }}"
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
cuda_samples_path: "/home/{{ ansible_user }}/cuda_samples"
cuda_samples_programs:
Expand Down
13 changes: 2 additions & 11 deletions ansible/roles/cuda/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,13 @@
failed_when: false
register: _cuda_driver_module_enabled

- name: List nvidia driver dnf module stream versions
shell:
cmd: dnf module list nvidia-driver | grep -oP "\d+-dkms" | sort -V
# Output of interest from command is something like (some whitespace removed):
# "nvidia-driver 418-dkms default [d], fm, ks Nvidia driver for 418-dkms branch "
changed_when: false
register: _cuda_driver_module_streams
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"

- name: Enable nvidia driver module
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ _cuda_driver_module_streams.stdout_lines | last }}"
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
sjpb marked this conversation as resolved.
Show resolved Hide resolved
register: _cuda_driver_module_enable
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"

- name: Install nvidia drivers # TODO: make removal possible?
- name: Install nvidia drivers
ansible.builtin.command: dnf module install -y nvidia-driver
register: _cuda_driver_install
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
Expand Down
3 changes: 0 additions & 3 deletions environments/.stackhpc/ARCUS.pkrvars.hcl
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
flavor = "vm.ska.cpu.general.small"
use_blockstorage_volume = true
volume_size = 15 # GB
image_disk_format = "qcow2"
networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60)
ssh_keypair_name = "slurm-app-ci"
ssh_private_key_file = "~/.ssh/id_rsa"
Expand Down
3 changes: 0 additions & 3 deletions environments/.stackhpc/LEAFCLOUD.pkrvars.hcl
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
flavor = "ec1.large"
use_blockstorage_volume = true
volume_size = 15 # GB
volume_type = "unencrypted"
image_disk_format = "qcow2"
networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci
ssh_keypair_name = "slurm-app-ci"
ssh_private_key_file = "~/.ssh/id_rsa"
Expand Down
4 changes: 2 additions & 2 deletions environments/.stackhpc/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ variable "cluster_image" {
type = map(string)
default = {
# https://github.com/stackhpc/ansible-slurm-appliance/pull/413
RL8: "openhpc-RL8-240813-1317-1b370a36"
RL9: "openhpc-ofed-RL9-240813-1317-1b370a36"
RL8: "openhpc-RL8-240904-1509-1687368f"
RL9: "openhpc-ofed-RL9-240904-1509-1687368f"
}
}

Expand Down
23 changes: 17 additions & 6 deletions packer/openstack.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ variable "manifest_output_path" {

variable "use_blockstorage_volume" {
type = bool
default = false
default = true
}

variable "volume_type" {
Expand All @@ -129,13 +129,18 @@ variable "volume_type" {
}

variable "volume_size" {
type = number
default = null # When not specified use the size of the builder instance root disk
type = map(number)
default = {
# fat image builds, GB:
openhpc = 15
openhpc-ofed = 15
openhpc-cuda = 30
}
}

variable "image_disk_format" {
type = string
default = null # When not specified use the image default
default = "qcow2"
}

variable "metadata" {
Expand All @@ -150,6 +155,7 @@ variable "groups" {
# fat image builds:
openhpc = ["control", "compute", "login"]
openhpc-ofed = ["control", "compute", "login", "ofed"]
openhpc-cuda = ["control", "compute", "login", "ofed", "cuda"]
}
}

Expand All @@ -158,11 +164,11 @@ source "openstack" "openhpc" {
flavor = var.flavor
use_blockstorage_volume = var.use_blockstorage_volume
volume_type = var.volume_type
volume_size = var.volume_size[source.name]
metadata = var.metadata
networks = var.networks
floating_ip_network = var.floating_ip_network
security_groups = var.security_groups
volume_size = var.volume_size

# Input image:
source_image = "${var.source_image[var.os_version]}"
Expand All @@ -178,7 +184,7 @@ source "openstack" "openhpc" {
ssh_bastion_private_key_file = var.ssh_bastion_private_key_file

# Output image:
image_disk_format = var.image_disk_format
image_disk_format = "qcow2"
image_visibility = var.image_visibility
image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}"
}
Expand All @@ -195,6 +201,11 @@ build {
name = "openhpc-ofed"
}

# CUDA fat image:
source "source.openstack.openhpc" {
name = "openhpc-cuda"
}

# Extended site-specific image, built on fat image:
source "source.openstack.openhpc" {
name = "openhpc-extra"
Expand Down
Loading