From 054d487d9fec8f98a111bc30ec6d3ec1ce423356 Mon Sep 17 00:00:00 2001
From: Mike Henry <11765982+mikemhenry@users.noreply.github.com>
Date: Thu, 27 Apr 2023 00:58:37 -0700
Subject: [PATCH] Add Self-Hosted AWS GPU Runner (#100)

* added self hosted GPU runner CI file
* should switch to micromamba
* if this doesn't work we are using micromamba
* use micromamba
* needed to give env a name
* see if switching the cudatoolkit to 11.7 works
* should be able to use nvcc from the ami
* fix some version pins
* Remove pins from environment.yml
* set HOME
* forgot how to set envars
* Add some debugging
* getting some weird activation problems
* Remove debugging output
* see if now that things are working, I can override the pins
* see if this works without activating
* Fix the build that doesn't use cuda
* Accidentally kept a GPU package in base env
* keep the environment.yml in the root of the repo intact, move custom env to a folder
* missed a path
* Add caching to speed up env creation
* revert to keep PR as small as possible
* accidentally checked out wrong versions from stale fork
* forgot to use the cudatoolkit from the ami instead of Jimver/cuda-toolkit
* forgot to set home
* make sure we init the shell
* missed a reference to the build matrix
* set timeout to be 1 hr
* make it easier to update the versions
* had the env in the wrong spot
* don't run on a schedule and timeout after 25 minutes
---
 .github/workflows/self-hosted-gpu-test.yml | 139 +++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 .github/workflows/self-hosted-gpu-test.yml

diff --git a/.github/workflows/self-hosted-gpu-test.yml b/.github/workflows/self-hosted-gpu-test.yml
new file mode 100644
index 0000000..53e4c47
--- /dev/null
+++ b/.github/workflows/self-hosted-gpu-test.yml
@@ -0,0 +1,139 @@
+# Builds and tests the CUDA configuration on an on-demand AWS EC2 GPU
+# instance: start-runner launches it, do-the-job runs on it, and stop-runner
+# shuts it down again (even when an earlier job failed).
+name: self-hosted-gpu-test
+on:
+  push:
+    branches:
+      - master
+  workflow_dispatch:
+
+defaults:
+  run:
+    # Login shell (-l): shell start-up files are read, which the
+    # `conda activate` calls in the steps below depend on.
+    shell: bash -l {0}
+
+jobs:
+  start-runner:
+    name: Start self-hosted EC2 runner
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+      - name: Try to start EC2 runner
+        id: start-ec2-runner
+        # Pinned to a release tag instead of the moving `main` branch: this
+        # step receives the personal access token, so its code must not drift.
+        uses: machulav/ec2-github-runner@v2
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ami-04d16a12bbc76ff0b
+          ec2-instance-type: g4dn.xlarge
+          subnet-id: subnet-0dee8543e12afe0cd # us-east-1a
+          security-group-id: sg-0f9809618550edb98
+          # iam-role-name: self-hosted-runner # optional, requires additional permissions
+          aws-resource-tags: > # optional, requires additional permissions
+            [
+              {"Key": "Name", "Value": "ec2-github-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
+            ]
+
+  do-the-job:
+    name: Do the job on the runner
+    needs: start-runner # required to start the main job when the runner is ready
+    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    timeout-minutes: 25
+    steps:
+      - name: Check out
+        uses: actions/checkout@v3
+
+      - name: Install Miniconda
+        uses: conda-incubator/setup-miniconda@v2
+        env:
+          HOME: /home/ec2-user
+        with:
+          activate-environment: ""
+          auto-activate-base: true
+          miniforge-variant: Mambaforge
+
+      - name: Prepare dependencies (with CUDA)
+        # Version pins. The sed call below rewrites every line of
+        # environment.yml that mentions one of these package names.
+        # NOTE(review): the replacement text must use the same list
+        # indentation as environment.yml - confirm.
+        env:
+          cudatoolkit: "11.7.*"
+          gxx_linux-64: "10.3.*"
+          torchani: "2.2.*"
+          nvcc_linux-64: "11.7.*"
+          python: "3.10.*"
+          pytorch-gpu: "2.0.*"
+        run: |
+          sed -i -e "/cudatoolkit/c\  - cudatoolkit ${{ env.cudatoolkit }}" \
+                 -e "/gxx_linux-64/c\  - gxx_linux-64 ${{ env.gxx_linux-64 }}" \
+                 -e "/torchani/c\  - torchani ${{ env.torchani }}" \
+                 -e "/nvcc_linux-64/c\  - nvcc_linux-64 ${{ env.nvcc_linux-64 }}" \
+                 -e "/python/c\  - python ${{ env.python }}" \
+                 -e "/pytorch-gpu/c\  - pytorch-gpu ${{ env.pytorch-gpu }}" \
+                 environment.yml
+
+      - name: Show dependency file
+        run: cat environment.yml
+
+      - name: Install dependencies
+        run: |
+          mamba env create -n nnpops -f environment.yml
+          conda init
+
+      - name: List conda environment
+        run: |
+          conda activate nnpops
+          conda list
+
+      - name: Configure, compile, and install
+        run: |
+          conda activate nnpops
+          mkdir build && cd build
+          cmake .. \
+                -DENABLE_CUDA=true \
+                -DTorch_DIR=$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')/Torch \
+                -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
+          make install
+
+      - name: Test
+        run: |
+          conda activate nnpops
+          cd build
+          ctest --verbose
+
+  stop-runner:
+    name: Stop self-hosted EC2 runner
+    needs:
+      - start-runner # required to get output from the start-runner job
+      - do-the-job # required to wait when the main job is done
+    runs-on: ubuntu-latest
+    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+      - name: Stop EC2 runner
+        # Pinned to a release tag rather than `main`; keep it equal to the start step.
+        uses: machulav/ec2-github-runner@v2
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}