Skip to content

Setup automated tests on a SLURM cluster #91

Setup automated tests on a SLURM cluster

Setup automated tests on a SLURM cluster #91

Workflow file for this run

# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Python application
on:
push:
branches:
- master
pull_request:
permissions:
contents: read
# https://stackoverflow.com/a/72408109/6388696
# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-concurrency-to-cancel-any-in-progress-job-or-run
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
linting:
name: Run linting/pre-commit checks
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: '3.12'
- run: pip install pre-commit
- run: pre-commit --version
- run: pre-commit install
- run: pre-commit run --all-files
unit_tests:
needs: [linting]
runs-on: ${{ matrix.platform }}
strategy:
max-parallel: 4
matrix:
platform: [ubuntu-latest]
python-version: ['3.12']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- run: pip install pdm
- name: Install dependencies
run: pdm install
- name: Test with pytest (very fast)
env:
JAX_PLATFORMS: cpu
run: pdm run pytest -v --shorter-than=1.0 --cov=project --cov-report=xml --cov-append
- name: Test with pytest (fast)
env:
JAX_PLATFORMS: cpu
run: pdm run pytest -v --cov=project --cov-report=xml --cov-append
- name: Store coverage report as an artifact
uses: actions/upload-artifact@v4
with:
name: coverage-reports-unit-tests-${{ matrix.platform }}-${{ matrix.python-version }}
path: ./coverage.xml
local_integration_tests:
needs: [unit_tests]
runs-on: self-hosted
strategy:
max-parallel: 1
matrix:
python-version: ['3.12']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- run: pip install pdm
- name: Install dependencies
run: pdm config install.cache true && pdm install
- name: Test with pytest
run: pdm run pytest -v --cov=project --cov-report=xml --cov-append
# TODO: this is taking too long to run, and is failing consistently. Need to debug this before making it part of the CI again.
# - name: Test with pytest (only slow tests)
# run: pdm run pytest -v -m slow --slow --cov=project --cov-report=xml --cov-append
- name: Store coverage report as an artifact
uses: actions/upload-artifact@v4
with:
name: coverage-reports-integration-tests-${{ matrix.python-version }}
path: ./coverage.xml
launch-slurm-actions-runner:
needs: [local_integration_tests]
runs-on: self-hosted
strategy:
max-parallel: 5
matrix:
cluster: ['mila'] #, 'narval', 'beluga']
outputs:
job_id: ${{ steps.sbatch.outputs.stdout }}
steps:
- name: Copy job script to the cluster
run: "scp .github/actions-runner-job.sh ${{ matrix.cluster }}:actions-runner-job.sh"
- name: Launch Slurm Actions Runner
id: sbatch
# TODO: for DRAC clusters, the account needs to be set somehow (and obviously not be hard-coded here).
# Output the job ID to a file so that the next step can use it.
# NOTE: Could also use the --wait flag to wait for the job to finish (and have this run at the same time as the other step).
run: |
job_id=`ssh ${{ matrix.cluster }} 'cd $SCRATCH && sbatch --parsable $HOME/actions-runner-job.sh'`
echo "job_id=$job_id" >> "$GITHUB_OUTPUT"
# This step runs in a self-hosted Github Actions runner inside a SLURM job on the compute node of the cluster.
slurm_integration_tests:
name: Run integration tests on the ${{ matrix.cluster }} cluster in job ${{ needs.launch-slurm-actions-runner.outputs.job_id}}
needs: [launch-slurm-actions-runner]
runs-on: ${{ matrix.cluster }}
strategy:
max-parallel: 5
matrix:
# TODO: this should be tied to the same setting in the `launch-slurm-actions-runner` job.
# cluster: ${{ needs.launch-slurm-actions-runner.strategy.matrix.cluster }}
cluster: ['mila']
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: 3.12
- run: pip install pdm
- name: Install dependencies
run: pdm install
- name: Test with pytest
run: pdm run pytest -v --cov=project --cov-report=xml --cov-append
- name: Test with pytest (only slow tests)
run: pdm run pytest -v -m slow --slow --cov=project --cov-report=xml --cov-append
- name: Store coverage report as an artifact
uses: actions/upload-artifact@v4
with:
name: coverage-reports-slurm-integration-tests-${{ matrix.cluster }}
path: ./coverage.xml
# https://about.codecov.io/blog/uploading-code-coverage-in-a-separate-job-on-github-actions/
upload-coverage-codecov:
needs: [local_integration_tests, slurm_integration_tests]
runs-on: ubuntu-latest
name: Upload coverage reports to Codecov
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v4
with:
pattern: coverage-reports-*
merge-multiple: false
# download all the artifacts in this directory (each .coverage.xml will be in a subdirectory)
# Next step if this doesn't work would be to give the coverage files a unique name and use merge-multiple: true
path: coverage_reports
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: coverage_reports
fail_ci_if_error: true