.github/workflows/cd.yml

# Useful GitHub Actions docs:
#
# - https://help.github.com/en/actions
# - https://help.github.com/en/actions/reference/workflow-syntax-for-github-actions
# - https://help.github.com/en/actions/configuring-and-managing-workflows/configuring-a-workflow
# - https://help.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions

name: Continuous Deployment

# only allow one deploy workflow to be running at a time
# serializes multiple outstanding deploys if PRs are merged before the last deploy finishes
# ref: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency
concurrency: deploy

# Only trigger the workflow when pushing to main or a label is applied to a
# Pull Request
on:
  push:
    branches:
      - main
      - test-this-pr/**
    paths-ignore:
      - terraform/**
      - docs/**
      - .pre-commit-config.yaml
      - .github/**
      - "!.github/workflows/cd.yml"
  pull_request:
    types: [labeled]

# Global environment variables
env:
  # gcloud versions at https://cloud.google.com/sdk/docs/release-notes, update
  # to latest when updating.
  #
  GCLOUD_SDK_VERION: "432.0.0"
  # kubectl versions at https://github.com/kubernetes/kubernetes/tags, update to
  # second latest minor for a broad compatibility range when updating.
  #
  KUBECTL_VERSION: "v1.25.10"
  # helm versions at https://github.com/helm/helm/tags, use latest when
  # updating.
  #
  HELM_VERSION: "v3.12.0"

  # These variables influence git directly
  # https://git-scm.com/book/en/v2/Git-Internals-Environment-Variables#_committing
  #
  GIT_AUTHOR_EMAIL: 105740858+jupyterhub-bot@users.noreply.github.com
  GIT_COMMITTER_EMAIL: 105740858+jupyterhub-bot@users.noreply.github.com
  GIT_AUTHOR_NAME: JupterHub Bot Account
  GIT_COMMITTER_NAME: JupterHub Bot Account

  # This is required for chartpress that run docker build, to ensure that we can
  # make use of --mount flags in RUN statements for example.
  #
  DOCKER_BUILDKIT: "1"

jobs:
  # In this dedicated job to deploy our staging environment we build and push
  # images that the jobs to deploy to the production environments depend on.
  staging-deploy:
    # Only run the job if the 'test-staging' label is present OR if the event
    # is a push to main
    if: |
      (github.event.label.name == 'test-staging') ||
      ((github.event_name == 'push') && (github.ref == 'refs/heads/main')) ||
      ((github.event_name == 'push') && contains(github.ref, 'test-this-pr'))
    runs-on: ubuntu-20.04
    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#example-preventing-a-specific-failing-matrix-job-from-failing-a-workflow-run
    continue-on-error: ${{ matrix.experimental }}
    strategy:
      fail-fast: false # Do not cancel all jobs if one fails
      matrix:
        include:
          # Now we have only one staging environment, but we have had two when
          # we transitioned from one k8s cluster to another.
          # - The domain name staging.mybinder.org currently refers to
          #   gke2.staging.mybinder.org through a CNAME but during the
          #   transition they were separate.
          # - When we have two separate staging environments it can be useful to
          #   pass --image-prefix to override the chartpress.yaml configuration
          #   for one of the environments.
          - federation_member: staging
            chartpress_args: ""
            helm_version: ""
            experimental: false

    steps:
      - name: "Stage 0: Update env vars based on job matrix arguments"
        run: |
          if [ -n "${{ matrix.helm_version }}" ]; then
              echo "HELM_VERSION=${{ matrix.helm_version }}" >> $GITHUB_ENV
          fi

      - name: "Stage 0: Checkout repo"
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: "Stage 0: Setup Python"
        uses: actions/setup-python@v5
        id: setup-python
        with:
          python-version-file: ".python-version"

      # Action Repo: https://github.com/actions/cache
      - name: "Stage 0: Cache pip dependencies"
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: python-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/*requirements.txt') }}

      - name: "Stage 1: Install dependencies"
        # --upgrade is important since we carry a cache and may risk grow very
        # outdated otherwise.
        run: |
          pip install --upgrade setuptools pip
          pip install --upgrade -r requirements.txt

      # Install pre-requisite for "gcloud container clusters get-credentials"
      # command with a modern k8s client.
      #
      # A manual install step has been needed as they opted to not provide it in
      # the github-runner image. See
      # https://github.com/actions/runner-images/issues/5925#issuecomment-1216417721.
      #
      # Snippet based on
      # https://github.com/actions/runner-images/blob/9753e7301e19e29b89b0622b811bbb9b3891d02e/images/linux/scripts/installers/google-cloud-sdk.sh#L9-L13.
      #
      - name: Install gke-gcloud-auth-plugin
        run: |
          echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
          curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

          sudo apt-get update -y
          sudo apt-get install -y google-cloud-sdk-gke-gcloud-auth-plugin

      # Action Repo: https://github.com/google-github-actions/setup-gcloud
      - name: "Stage 1: Install gcloud ${{ env.GCLOUD_SDK_VERION }}"
        uses: google-github-actions/setup-gcloud@v2
        with:
          version: ${{ env.GCLOUD_SDK_VERION }}

      # Action Repo: https://github.com/Azure/setup-kubectl
      - name: "Stage 1: Install kubectl ${{ env.KUBECTL_VERSION }}"
        uses: azure/setup-kubectl@v4
        with:
          version: ${{ env.KUBECTL_VERSION }}

      - name: "Stage 1: Install and setup helm ${{ env.HELM_VERSION }}"
        run: |
          curl -sf https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | DESIRED_VERSION=${HELM_VERSION} bash
          for d in ./mybinder*/; do
            helm dependency update "$d"
          done

      # Action Repo: https://github.com/sliteteam/github-action-git-crypt-unlock
      - name: "Stage 2: Unlock git-crypt secrets"
        uses: sliteteam/github-action-git-crypt-unlock@f99c0c6b60bb7ec30dcec033a8f0a3b3d48f21e1
        env:
          GIT_CRYPT_KEY: ${{ secrets.GIT_CRYPT_KEY }}

      # Action Repo: https://github.com/Azure/docker-login
      - name: "Stage 2: Login to Docker"
        uses: azure/docker-login@v2
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: "Stage 3: Run chartpress to build/push images and updates values.yaml"
        run: |
          chartpress --push ${{ matrix.chartpress_args }}

      - name: "Stage 4: Deploy to staging"
        run: |
          . cert-manager.env
          python ./deploy.py ${{ matrix.federation_member }}
        env:
          TERM: xterm

      # Action Repo: https://github.com/nick-invision/retry
      - name: "Stage 4: Verify staging works"
        uses: nick-invision/retry@7152eba30c6575329ac0576536151aca5a72780e
        with:
          timeout_minutes: 10
          max_attempts: 3
          command: pytest -vx --color=yes --numprocesses=2 --release=${{ matrix.federation_member }} tests/

      - name: "Stage 5: Post message to Grafana that deployment to production has started"
        run: |
          source secrets/grafana-api-key
          export PULL_REQUEST_ID=$(git log -1 --pretty=%B | head -n1 | sed 's/^.*#\([0-9]*\).*/\1/')
          export AUTHOR_NAME="$(git log  -1 --pretty=%aN)"
          export PULL_REQUEST_TITLE="$(git log --pretty=%B -1 | tail -n+3)"
          python post-grafana-annotation.py  \
            --grafana-url https://grafana.mybinder.org \
            --tag deployment-start \
            "$(echo -en ${PULL_REQUEST_TITLE}\\n\\n${AUTHOR_NAME}: https://github.com/${{ github.repository }}/pull/${PULL_REQUEST_ID})" \
          || echo "failed!"

  prod-deploy:
    # Previous job must have successfully completed for this job to execute
    # - https://help.github.com/en/actions/reference/workflow-syntax-for-github-actions#jobsjob_idneeds
    needs: staging-deploy

    # Only run job if the event is a push to main
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'

    runs-on: ubuntu-20.04

    # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#example-preventing-a-specific-failing-matrix-job-from-failing-a-workflow-run
    continue-on-error: ${{ matrix.experimental }}
    # We run this job multiple times with different parameters
    # - https://help.github.com/en/actions/reference/workflow-syntax-for-github-actions#jobsjob_idstrategy
    strategy:
      fail-fast: false # Do not cancel all jobs if one fails
      matrix:
        include:
          - federation_member: prod
            chartpress_args: ""
            helm_version: ""
            experimental: false

          - federation_member: ovh2
            helm_version: ""
            experimental: false

          - federation_member: curvenote
            helm_version: ""
            experimental: false
            aws_deployment_role: arn:aws:iam::166088433508:role/binderhub-github-oidc-mybinderorgdeploy
            aws_region: us-east-2

    # These permissions are needed to interact with GitHub's OIDC Token endpoint.
    permissions:
      id-token: write
      contents: read

    steps:
      - name: "Stage 0: Update env vars based on job matrix arguments"
        run: |
          if [ -n "${{ matrix.helm_version }}" ]; then
              echo "HELM_VERSION=${{ matrix.helm_version }}" >> $GITHUB_ENV
          fi

      - name: "Stage 0: Checkout repo"
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: "Stage 0: Setup Python"
        uses: actions/setup-python@v5
        id: setup-python
        with:
          python-version-file: ".python-version"

      # Action Repo: https://github.com/actions/cache
      - name: "Stage 0: Cache pip dependencies"
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: python-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/*requirements.txt') }}

      - name: "Stage 1: Install dependencies"
        # --upgrade is important since we carry a cache and may risk grow very
        # outdated otherwise.
        run: |
          pip install --upgrade setuptools pip
          pip install --upgrade -r requirements.txt

      # Install pre-requisite for "gcloud container clusters get-credentials"
      # command with a modern k8s client.
      #
      # A manual install step has been needed as they opted to not provide it in
      # the github-runner image. See
      # https://github.com/actions/runner-images/issues/5925#issuecomment-1216417721.
      #
      # Snippet based on
      # https://github.com/actions/runner-images/blob/9753e7301e19e29b89b0622b811bbb9b3891d02e/images/linux/scripts/installers/google-cloud-sdk.sh#L9-L13.
      #
      - name: Install gke-gcloud-auth-plugin
        if: matrix.federation_member == 'prod'
        run: |
          echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
          curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

          sudo apt-get update -y
          sudo apt-get install -y google-cloud-sdk-gke-gcloud-auth-plugin

      - name: "Stage 1: Install gcloud ${{ env.GCLOUD_SDK_VERION }}"
        if: matrix.federation_member == 'prod'
        uses: google-github-actions/setup-gcloud@v2
        with:
          version: ${{ env.GCLOUD_SDK_VERION }}

      - name: "Stage 1: Configure AWS credentials"
        if: matrix.federation_member == 'curvenote'
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ matrix.aws_deployment_role }}
          aws-region: ${{ matrix.aws_region }}
          role-session-name: mybinder-github-cd

      - name: "Stage 1: Install kubectl ${{ env.KUBECTL_VERSION }}"
        uses: azure/setup-kubectl@v4
        with:
          version: ${{ env.KUBECTL_VERSION }}

      - name: "Stage 1: Install and setup helm ${{ env.HELM_VERSION }}"
        run: |
          curl -sf https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | DESIRED_VERSION=${HELM_VERSION} bash
          for d in ./mybinder*/; do
            helm dependency update "$d"
          done

      - name: "Stage 2: Unlock git-crypt secrets"
        uses: sliteteam/github-action-git-crypt-unlock@f99c0c6b60bb7ec30dcec033a8f0a3b3d48f21e1
        env:
          GIT_CRYPT_KEY: ${{ secrets.GIT_CRYPT_KEY }}

      - name: "Stage 3: Run chartpress to update values.yaml"
        run: |
          chartpress ${{ matrix.chartpress_args || '--skip-build' }}

      - name: "Stage 4: Deploy to ${{ matrix.federation_member }}"
        run: |
          . cert-manager.env
          python ./deploy.py ${{ matrix.federation_member }} ${{ matrix.cluster_name || matrix.federation_member }} --name ${{ matrix.release_name || matrix.federation_member }}
        env:
          TERM: xterm

      - name: "Stage 4: Verify ${{ matrix.federation_member }} works"
        uses: nick-invision/retry@7152eba30c6575329ac0576536151aca5a72780e
        with:
          timeout_minutes: 10
          max_attempts: 3
          command: pytest -vx --color=yes --numprocesses=2 --release=${{ matrix.federation_member }} tests/

  report-status:
    # This job will wait for staging-deploy to finish and report its status
    #
    # This job will only run on push events made to test-this-pr/** branches
    if: github.event_name == 'push' && contains(github.ref, 'test-this-pr')
    environment: test-this-pr-env
    runs-on: ubuntu-latest
    steps:
      # Poll the GitHub REST API and wait for staging-deploy job to finish
      - name: Wait for staging-deploy completion
        id: staging-deploy-status
        run: |
          STATUS=""
          CONCLUSION=""

          while [ "$STATUS" != "completed" ]; do
            RESPONSE=$(curl --silent --request GET \
              --url 'https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs' \
              --header 'Authorization: token ${{ secrets.jupyterhub_bot_pat }}' \
              --header 'Accept: application/vnd.github.v3+json')

            STATUS=$(echo $RESPONSE | jq -r '.jobs[] | select(.name | startswith("staging-deploy")) | .status')
            echo "Status: $STATUS"

            if [[ "$STATUS" == "completed" ]]; then
              CONCLUSION=$(echo $RESPONSE | jq -r '.jobs[] | select(.name | startswith("staging-deploy")) | .conclusion')
              echo "Conclusion: $CONCLUSION"
            else
              sleep 30s
            fi
          done

          echo "status=$STATUS" >> $GITHUB_OUTPUT
          echo "conclusion=$CONCLUSION" >> $GITHUB_OUTPUT

      - name: Set PR number as variable
        id: get-pr-number
        run: |
          PR_NUM=$(echo ${{ github.ref }} | awk -F'/' '{print $NF}')
          echo "prnum=$PR_NUM" >> $GITHUB_OUTPUT

      - name: Comment on PR with Job Status and delete the test branch
        if: ${{ steps.staging-deploy-status.outputs.status }} == 'completed'
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.jupyterhub_bot_pat }}
          script: |
            var JOB_STATUS = process.env.JOB_STATUS;
            var PR_NUMBER = process.env.PR_NUMBER;

            github.rest.git.deleteRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: `heads/test-this-pr/${PR_NUMBER}`
            })

            github.rest.issues.createComment({
              issue_number: PR_NUMBER,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: `**Job status:** ${JOB_STATUS}\nBranch 'test-this-pr/${PR_NUMBER}' has been deleted`
            })
        env:
          JOB_STATUS: ${{ steps.staging-deploy-status.outputs.conclusion }}
          PR_NUMBER: ${{ steps.get-pr-number.outputs.prnum }}