Skip to content

Commit

Permalink
Build RL8+OFED image in CI (#427)
Browse files Browse the repository at this point in the history
* Check major version for RL8 package installs

* Gather facts on ofed role

* Support kernel checks with mismatching version length

4.18.0-553.16.1.el8_9.x86_64
4.18.0-553.el8_9.x86_64
Comparing two kernel versions like these would fail with the error:

'<' not supported between instances of 'str' and 'int'.

as the community.general.version_sort was trying to compare the `el8_9` of the latter with the `16` of the former.

Strip the last two chunks (e.g. `el8_9` and `x86_64`) so we just compare numbers.

* Move to LTS version now RL9.4 is supported

* Fail when any inventory source cannot be parsed

* Always reboot after selinux and package updates

* Clear facts before OFED so install will match newest kernel

* Clear facts after reboot so OFED install will match newest kernel

* fail caas and stackhpc if any inventory can't be read

* make reboot conditional on package or SELinux changes again

* include OFED in both RL8 and RL9 builds

* always run CI tests on RL8 and RL9

* allow concurrent RL8/RL9 CI tests

* mark pending reboot check as not a change

* fix workflow matrix definitions

* bump CI images - now both OFED

* use reboot hint for checking reboot required

---------

Co-authored-by: Steve Brasier <steveb@stackhpc.com>
  • Loading branch information
2 people authored and MaxBed4d committed Oct 15, 2024
1 parent 9e53ce6 commit 80c4ceb
Show file tree
Hide file tree
Showing 9 changed files with 45 additions and 51 deletions.
15 changes: 5 additions & 10 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
@@ -1,31 +1,26 @@

name: Build fat image
'on':
on:
workflow_dispatch:
concurrency:
group: ${{ github.ref }}-{{ matrix.os_version }}-{{ matrix.build }} # to branch/PR + OS + build
cancel-in-progress: true
jobs:
openstack:
name: openstack-imagebuild
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9+OFED, RL9+CUDA versions
matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions
os_version:
- RL8
- RL9
build:
- openstack.openhpc
- openstack.openhpc-ofed
- openstack.openhpc-cuda
exclude:
- os_version: RL8
build: openstack.openhpc-ofed
- os_version: RL8
build: openstack.openhpc-cuda
- os_version: RL9
build: openstack.openhpc
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
Expand Down
33 changes: 8 additions & 25 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,29 @@
name: Test deployment and reimage on OpenStack
on:
workflow_dispatch:
inputs:
use_RL8:
required: true
description: Include RL8 tests
type: boolean
default: false
push:
branches:
- main
pull_request:
jobs:
openstack:
name: openstack-ci
concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix:
os_version: [RL8, RL9]
rl8_selected:
- ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch
rl8_branch:
- ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge
rl8_label:
- ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created
exclude:
- os_version: RL8
rl8_selected: false
rl8_branch: false
rl8_label: false
os_version:
- RL8
- RL9
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
CI_CLOUD: ${{ vars.CI_CLOUD }}
TF_VAR_os_version: ${{ matrix.os_version }}
steps:
- uses: actions/checkout@v2

Expand Down Expand Up @@ -89,8 +78,6 @@ jobs:
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
env:
TF_VAR_os_version: ${{ matrix.os_version }}
- name: Delete infrastructure if provisioning failed
run: |
Expand All @@ -99,8 +86,6 @@ jobs:
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
if: failure() && steps.provision_servers.outcome == 'failure'
env:
TF_VAR_os_version: ${{ matrix.os_version }}

- name: Configure cluster
run: |
Expand Down Expand Up @@ -199,8 +184,6 @@ jobs:
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
if: ${{ success() || cancelled() }}
env:
TF_VAR_os_version: ${{ matrix.os_version }}

# - name: Delete images
# run: |
Expand Down
19 changes: 10 additions & 9 deletions ansible/bootstrap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -227,24 +227,25 @@
- update
tasks:
- name: Check for pending reboot from package updates
stat:
path: /var/run/reboot-required
command:
cmd: dnf needs-restarting -r
register: update_reboot_required
- debug:
msg: "setstatus:{{ (sestatus.reboot_required | default(false)) }} packages: {{ (update_reboot_required.stat.exists | bool) }}"
- name: Reboot if required from SELinux state change or package upgrades
failed_when: "update_reboot_required.rc not in [0, 1]"
changed_when: false
- name: Reboot to cover SELinux state change or package upgrades
reboot:
post_reboot_delay: 30
when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.stat.exists | bool)
when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.rc == 1)
- name: Wait for hosts to be reachable
wait_for_connection:
sleep: 15
- name: update facts
- name: Clear facts
meta: clear_facts
- name: Update facts
setup:
when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false))

- hosts: ofed
gather_facts: no
gather_facts: yes
become: yes
tags: ofed
tasks:
Expand Down
3 changes: 2 additions & 1 deletion ansible/roles/ofed/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9.4
ofed_version: '23.10-3.2.2.0' # LTS
ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz
ofed_distro: rhel # NB: not expected to work on other distros due to installation differences
ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9'
ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. '8'
ofed_arch: "{{ ansible_architecture }}"
ofed_tmp_dir: /tmp
ofed_update_firmware: false
Expand Down
8 changes: 5 additions & 3 deletions ansible/roles/ofed/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@

- name: Check current kernel is newest installed
assert:
that: _ofed_loaded_kernel.stdout == _ofed_dnf_kernels_newest
that: _ofed_kernel_current == _ofed_dnf_kernels_newest
fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?"
vars:
_ofed_kernel_current: >-
{{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }}
_ofed_dnf_kernels_newest: >-
{{ _ofed_dnf_kernels.stdout_lines[1:] | map('regex_replace', '^\w+\.(\w+)\s+(\S+)\s+\S+\s*$', '\2.\1') | community.general.version_sort | last }}
{{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }}
# dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos "

- name: Enable epel
Expand All @@ -31,7 +33,7 @@

- name: Install build prerequisites
dnf:
name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_version == '8.9' else []) }}"
name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_major_version == '8' else []) }}"
when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout"
# don't want to install a load of prereqs unnecessarily

Expand Down
4 changes: 4 additions & 0 deletions environments/.caas/ansible.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins
[ssh_connection]
ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
pipelining = True

[inventory]
# Fail when any inventory source cannot be parsed.
any_unparsed_is_failed = True
4 changes: 4 additions & 0 deletions environments/.stackhpc/ansible.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,7 @@ filter_plugins = ../../ansible/filter_plugins
[ssh_connection]
ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
pipelining = True

[inventory]
# Fail when any inventory source cannot be parsed.
any_unparsed_is_failed = True
6 changes: 3 additions & 3 deletions environments/.stackhpc/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ variable "cluster_image" {
description = "single image for all cluster nodes, keyed by os_version - a convenience for CI"
type = map(string)
default = {
# https://github.com/stackhpc/ansible-slurm-appliance/pull/413
RL8: "openhpc-RL8-240904-1509-1687368f"
RL9: "openhpc-ofed-RL9-240904-1509-1687368f"
# https://github.com/stackhpc/ansible-slurm-appliance/pull/427
RL8: "openhpc-ofed-RL8-240906-1042-32568dbb"
RL9: "openhpc-ofed-RL9-240906-1041-32568dbb"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ filter_plugins = ../../ansible/filter_plugins
[ssh_connection]
ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
pipelining = True

[inventory]
# Fail when any inventory source cannot be parsed.
any_unparsed_is_failed = True

0 comments on commit 80c4ceb

Please sign in to comment.