Setup automated tests on a SLURM cluster #103
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This workflow will install Python dependencies, run tests and lint with a single version of Python | |
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python | |
name: Python application | |
on: | |
push: | |
branches: | |
- master | |
pull_request: | |
permissions: | |
contents: read | |
# https://stackoverflow.com/a/72408109/6388696 | |
# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-concurrency-to-cancel-any-in-progress-job-or-run | |
concurrency: | |
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} | |
cancel-in-progress: true | |
jobs: | |
linting: | |
name: Run linting/pre-commit checks | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- uses: actions/setup-python@v4 | |
with: | |
python-version: '3.12' | |
- run: pip install pre-commit | |
- run: pre-commit --version | |
- run: pre-commit install | |
- run: pre-commit run --all-files | |
unit_tests: | |
needs: [linting] | |
runs-on: ${{ matrix.platform }} | |
strategy: | |
max-parallel: 4 | |
matrix: | |
platform: [ubuntu-latest] | |
python-version: ['3.12'] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- run: pip install pdm | |
- name: Install dependencies | |
run: pdm install | |
- name: Test with pytest (very fast) | |
env: | |
JAX_PLATFORMS: cpu | |
run: pdm run pytest -v --shorter-than=1.0 --cov=project --cov-report=xml --cov-append | |
- name: Test with pytest (fast) | |
env: | |
JAX_PLATFORMS: cpu | |
run: pdm run pytest -v --cov=project --cov-report=xml --cov-append | |
- name: Store coverage report as an artifact | |
uses: actions/upload-artifact@v4 | |
with: | |
name: coverage-reports-unit-tests-${{ matrix.platform }}-${{ matrix.python-version }} | |
path: ./coverage.xml | |
local_integration_tests: | |
needs: [unit_tests] | |
runs-on: self-hosted | |
strategy: | |
max-parallel: 1 | |
matrix: | |
python-version: ['3.12'] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python ${{ matrix.python-version }} | |
uses: actions/setup-python@v5 | |
with: | |
python-version: ${{ matrix.python-version }} | |
- run: pip install pdm | |
- name: Install dependencies | |
run: pdm config install.cache true && pdm install | |
- name: Test with pytest | |
run: pdm run pytest -v --cov=project --cov-report=xml --cov-append | |
# TODO: this is taking too long to run, and is failing consistently. Need to debug this before making it part of the CI again. | |
# - name: Test with pytest (only slow tests) | |
# run: pdm run pytest -v -m slow --slow --cov=project --cov-report=xml --cov-append | |
- name: Store coverage report as an artifact | |
uses: actions/upload-artifact@v4 | |
with: | |
name: coverage-reports-integration-tests-${{ matrix.python-version }} | |
path: ./coverage.xml | |
launch-slurm-actions-runner: | |
needs: [local_integration_tests] | |
runs-on: self-hosted | |
strategy: | |
max-parallel: 5 | |
matrix: | |
cluster: ['mila'] #, 'narval', 'beluga'] | |
outputs: | |
job_id: ${{ steps.sbatch.outputs.stdout }} | |
steps: | |
- name: Copy job script to the cluster | |
run: "scp actions-runner-job.sh ${{ matrix.cluster }}:actions-runner-job.sh" | |
- name: Launch Slurm Actions Runner | |
id: sbatch | |
# TODO: for DRAC clusters, the account needs to be set somehow (and obviously not be hard-coded here). | |
# Output the job ID to a file so that the next step can use it. | |
# NOTE: Could also use the --wait flag to wait for the job to finish (and have this run at the same time as the other step). | |
run: | | |
job_id=`ssh ${{ matrix.cluster }} 'cd $SCRATCH && sbatch --parsable $HOME/actions-runner-job.sh'` | |
echo "Submitted job $job_id on the ${{ matrix.cluster }} cluster!" | |
echo "job_id=$job_id" >> "$GITHUB_OUTPUT" | |
# This step runs in a self-hosted Github Actions runner inside a SLURM job on the compute node of the cluster. | |
slurm_integration_tests: | |
name: Run integration tests on the ${{ matrix.cluster }} cluster in job ${{ needs.launch-slurm-actions-runner.outputs.job_id}} | |
needs: [launch-slurm-actions-runner] | |
runs-on: ${{ matrix.cluster }} | |
strategy: | |
max-parallel: 5 | |
matrix: | |
# TODO: this should be tied to the same setting in the `launch-slurm-actions-runner` job. | |
# cluster: ${{ needs.launch-slurm-actions-runner.strategy.matrix.cluster }} | |
cluster: ['mila'] | |
steps: | |
- uses: actions/checkout@v4 | |
- name: Set up Python 3.12 | |
uses: actions/setup-python@v5 | |
with: | |
python-version: 3.12 | |
- run: pip install pdm | |
- name: Install dependencies | |
run: pdm install | |
- name: Test with pytest | |
run: pdm run pytest -v --cov=project --cov-report=xml --cov-append | |
- name: Test with pytest (only slow tests) | |
run: pdm run pytest -v -m slow --slow --cov=project --cov-report=xml --cov-append | |
- name: Store coverage report as an artifact | |
uses: actions/upload-artifact@v4 | |
with: | |
name: coverage-reports-slurm-integration-tests-${{ matrix.cluster }} | |
path: ./coverage.xml | |
# https://about.codecov.io/blog/uploading-code-coverage-in-a-separate-job-on-github-actions/ | |
upload-coverage-codecov: | |
needs: [local_integration_tests, slurm_integration_tests] | |
runs-on: ubuntu-latest | |
name: Upload coverage reports to Codecov | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
- name: Download artifacts | |
uses: actions/download-artifact@v4 | |
with: | |
pattern: coverage-reports-* | |
merge-multiple: false | |
# download all the artifacts in this directory (each .coverage.xml will be in a subdirectory) | |
# Next step if this doesn't work would be to give the coverage files a unique name and use merge-multiple: true | |
path: coverage_reports | |
- name: Upload coverage reports to Codecov | |
uses: codecov/codecov-action@v4 | |
with: | |
token: ${{ secrets.CODECOV_TOKEN }} | |
directory: coverage_reports | |
fail_ci_if_error: true |