chore(ci): better docker prune (#5889)
Docker prune without -a could still allow continuous accumulation of
space. Now that can only happen if you keep hitting the same spot
instance, and as long as that spot is up I don't want to prune (yet),
so as not to prune while other jobs are saving images.

Bundled:
- minor workflow streamlining/flexibility for impersonating
- ci.py support for impersonating
This was relevant to debugging our friend AztecBot's disk.
ludamad authored Apr 22, 2024
1 parent 1c24c8e commit b5a8e02
Showing 6 changed files with 90 additions and 80 deletions.
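The headline change is the -a flag on the runner's periodic prune. For context (a minimal sketch, not part of the diff): docker system prune -f removes only stopped containers, unused networks, dangling images, and dangling build cache, so tagged-but-unreferenced images can pile up forever on a long-lived runner; -a additionally removes any image not used by a container. The hypothetical helper below mirrors ci.py's subprocess style:

import subprocess

def prune_docker(all_images: bool = True) -> None:
    # -f skips the confirmation prompt. Without -a, only dangling images
    # are pruned, so tagged-but-unused images accumulate indefinitely;
    # with -a, any image not referenced by a container is removed too.
    flags = "-f -a" if all_images else "-f"
    # The workflow appends "|| true"; check=False is the equivalent here,
    # so a failed prune never fails the surrounding job.
    subprocess.run(f"docker system prune {flags}", shell=True, check=False)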
3 changes: 0 additions & 3 deletions .github/ci-setup-action/action.yml
@@ -9,9 +9,6 @@ inputs:
   concurrency_key:
     required: false
     description: 'Concurrency key for locking jobs'
-  concurrency_token:
-    required: false
-    description: 'TODO unused'
 runs:
   # define an action, runs in OS of caller
   using: composite
1 change: 0 additions & 1 deletion .github/workflows/ci-arm.yml
@@ -36,7 +36,6 @@ jobs:
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_token: "${{ secrets.AZTEC_GITHUB_TOKEN }}"
           # must be globally unique for build x runner
           concurrency_key: build-master-arm
       # prepare images locally, tagged by commit hash
52 changes: 20 additions & 32 deletions .github/workflows/ci.yml
@@ -5,13 +5,12 @@ on:
   pull_request: {}
   workflow_dispatch:
     inputs:
+      username:
+        description: 'Defaults to GitHub Actor'
+        required: false
       runner_action:
         description: "The action to take with the self-hosted runner (start, stop, restart)."
         required: false
-      just_start_spot:
-        description: "Should we just run spots?"
-        type: boolean
-        required: false
 concurrency:
   # force parallelism in master
   group: ci-${{ github.ref_name == 'master' && github.run_id || github.ref_name }}
@@ -20,27 +19,26 @@ jobs:
   setup:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-x86
+      runner_label: ${{ inputs.username || github.actor }}-x86
       ebs_cache_size_gb: 256
       runner_concurrency: 20
-      subaction: ${{ github.event.inputs.runner_action || 'start' }}
+      subaction: ${{ inputs.runner_action || 'start' }}
       ec2_instance_type: m6a.32xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 40 # refreshed by jobs
     secrets: inherit

   build:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
-    if: ${{ github.event.inputs.just_start_spot != 'true' }}
+    runs-on: ${{ inputs.username || github.actor }}-x86
     outputs:
       e2e_list: ${{ steps.e2e_list.outputs.list }}
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: build-${{ github.actor }}-x86
+          concurrency_key: build-${{ inputs.username || github.actor }}-x86
       # prepare images locally, tagged by commit hash
       - name: "Build E2E Image"
         timeout-minutes: 40
@@ -54,7 +52,7 @@ jobs:
   # all the end-to-end integration tests for aztec
   e2e:
     needs: build
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
     strategy:
       fail-fast: false
       matrix:
@@ -65,7 +63,7 @@ jobs:
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
           # must be globally unique for build x runner
-          concurrency_key: e2e-${{ github.actor }}-x86-${{ matrix.test }}
+          concurrency_key: e2e-${{ inputs.username || github.actor }}-x86-${{ matrix.test }}
       - name: Test
         working-directory: ./yarn-project/end-to-end/
         timeout-minutes: 25
@@ -78,7 +76,7 @@ jobs:
   # only ran on x86 for resource reasons (memory intensive)
   bb-native-tests:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
    strategy:
       fail-fast: false
     steps:
@@ -88,7 +86,7 @@ jobs:
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
           # must be globally unique for build x runner
-          concurrency_key: bb-native-tests-${{ github.actor }}-x86
+          concurrency_key: bb-native-tests-${{ inputs.username || github.actor }}-x86
       - name: "Native Prover Tests"
         working-directory: ./barretenberg/cpp/
         timeout-minutes: 25
@@ -98,15 +96,14 @@ jobs:
   # push benchmarking binaries to dockerhub registry
   bb-bench-binaries:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: bb-bench-binaries-${{ github.actor }}-x86
+          concurrency_key: bb-bench-binaries-${{ inputs.username || github.actor }}-x86
       - name: Build and Push Binaries
-        if: ${{ github.event.inputs.just_start_spot != 'true' }}
         timeout-minutes: 15
         working-directory: ./barretenberg/cpp/
         run: earthly-ci --push +bench-binaries
@@ -115,24 +112,24 @@ jobs:
     uses: ./.github/workflows/setup-runner.yml
     needs: bb-bench-binaries
     with:
-      runner_label: ${{ github.actor }}-bench-x86
+      runner_label: ${{ inputs.username || github.actor }}-bench-x86
       ebs_cache_size_gb: 64
       runner_concurrency: 1
-      subaction: ${{ github.event.inputs.runner_action || 'start' }}
+      subaction: ${{ inputs.runner_action || 'start' }}
       ec2_instance_type: m6a.4xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 15 # refreshed by jobs
     secrets: inherit

   bb-bench:
-    runs-on: ${{ github.actor }}-bench-x86
+    runs-on: ${{ inputs.username || github.actor }}-bench-x86
     needs: setup-bench
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: bb-bench-${{ github.actor }}-bench-x86
+          concurrency_key: bb-bench-${{ inputs.username || github.actor }}-bench-x86
       # Use bench_mode=cache to read the pushed build above
       - name: Client IVC Bench
         working-directory: ./barretenberg/cpp/
@@ -145,23 +142,14 @@ jobs:
         run: earthly-ci --no-output +bench-ultra-honk --bench_mode=cache

   merge-check:
-    runs-on: ubuntu-latest
+    runs-on: ${{ inputs.username || github.actor }}-x86
     needs: [e2e, bb-native-tests, bb-bench]
-    if: always() # Ensures this job runs regardless of the success or failure of dependencies.
     steps:
-      - run: |
-          echo "E2E Test Status: ${{ needs.e2e.result }}"
-          echo "Native Tests Status: ${{ needs.bb-native-tests.result }}"
-          echo "Bench Tests Status: ${{ needs.bb-bench.result }}"
-          if [[ "${{ needs.e2e.result }}" != 'success' || "${{ needs.bb-native-tests.result }}" != 'success' || "${{ needs.bb-bench.result }}" != 'success' ]]; then
-            echo "Pull request merging not allowed due to failures."
-            exit 1
-          fi
-          echo "Pull request merging now allowed."
+      - run: echo Pull request merging now allowed.

   notify:
-    runs-on: ubuntu-latest
     needs: [e2e, bb-native-tests, bb-bench]
+    runs-on: ubuntu-latest
     if: ${{ github.ref == 'refs/heads/master' && failure() }}
     steps:
       - name: Send notification to aztec3-ci channel if workflow failed on master
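The recurring ${{ inputs.username || github.actor }} expression is what makes impersonation work: GitHub's || returns its first truthy operand, so an empty or absent username input falls back to whoever triggered the workflow. A minimal Python sketch of the same fallback (hypothetical helper, illustration only):

def runner_label(inputs: dict, github_actor: str, suffix: str = "x86") -> str:
    # Mirrors ${{ inputs.username || github.actor }}-x86: "or" yields the
    # first truthy operand, so ""/None falls back to the triggering actor.
    return f"{inputs.get('username') or github_actor}-{suffix}"

assert runner_label({}, "ludamad") == "ludamad-x86"
assert runner_label({"username": "AztecBot"}, "ludamad") == "AztecBot-x86"

Note also why merge-check can drop if: always() together with the explicit result checks: by default a job with needs runs only when every dependency succeeded, so reaching the echo step already implies all upstream jobs passed.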
4 changes: 2 additions & 2 deletions .github/workflows/setup-runner.yml
@@ -58,7 +58,7 @@ jobs:
       group: start-builder-${{ inputs.runner_label }}
     steps:
       - name: Start EC2 runner
-        uses: AztecProtocol/ec2-action-builder@v0.15
+        uses: AztecProtocol/ec2-action-builder@v0.14e
         with:
           github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }}
           aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -117,7 +117,7 @@ jobs:
       - name: Run Docker Prune
         # helps to not overuse space
-        run: docker system prune -f || true
+        run: docker system prune -f -a || true

       - name: Run Earthly Bootstrap
         run: earthly bootstrap
24 changes: 16 additions & 8 deletions .github/workflows/start-spot.yml
@@ -1,27 +1,35 @@
 # Useful if the spot runners are in a bad state
-name: Start Personal Spot
+name: Start/Stop Personal Spot
 on:
-  workflow_dispatch: {}
+  workflow_dispatch:
+    inputs:
+      username:
+        description: 'Defaults to GitHub Actor'
+        required: false
+      action:
+        description: 'Can also be stop or restart, defaults to start'
+        required: false
+        default: 'start'
 jobs:
-  stop-build-x86:
+  start-build:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-x86
+      runner_label: ${{ inputs.username || github.actor }}-x86
       ebs_cache_size_gb: 256
       runner_concurrency: 20
-      subaction: start
+      subaction: ${{ inputs.action }}
       ec2_instance_type: m6a.32xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 40 # refreshed by jobs
     secrets: inherit

-  stop-bench:
+  start-bench:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-bench-x86
+      runner_label: ${{ inputs.username || github.actor }}-bench-x86
       ebs_cache_size_gb: 64
       runner_concurrency: 1
-      subaction: start
+      subaction: ${{ inputs.action }}
       ec2_instance_type: m6a.4xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 15 # refreshed by jobs
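With the new inputs, a wedged spot runner can be bounced for any user straight from the CLI; this is exactly what ci.py's call_spot_workflow (below) does. A usage sketch, where the branch and username values are examples:

import subprocess

# Restart another user's spot runners; "master" and "AztecBot" are examples.
subprocess.run(
    'gh workflow run start-spot.yml --ref master '
    '--field username="AztecBot" --field action="restart"',
    shell=True,
)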
86 changes: 52 additions & 34 deletions ci.py
@@ -1,27 +1,32 @@
 #!/usr/bin/env python3
 # ubuntu: apt install python3-blessed
 from blessed import Terminal
-import os, json, subprocess, sys
+import os, json, subprocess, sys, time

 term = Terminal()
+if 'GITHUB_ACTOR' not in os.environ:
+    print("Make sure you have GITHUB_ACTOR in your environment variables e.g. .zshrc")
+    sys.exit(1)
+GITHUB_ACTOR = os.environ['GITHUB_ACTOR']
+BRANCH = subprocess.run("git rev-parse --abbrev-ref HEAD", shell=True, text=True, capture_output=True).stdout.strip()

 def main():
     selection = -1
-    with term.fullscreen(), term.cbreak():
-        print(term.home + term.clear)
-        while selection not in ('1', '2', '3', '4', 'q'):
-            print(term.move_y(1) + "Please select an option:")
-            print("1. SSH into build machine")
-            print("2. SSH into bench machine")
-            print("3. Start/Stop spot machines")
-            print("4. Manage Running Jobs")
-            print("q. Quit")
-            with term.location(0, term.height - 1):
-                selection = term.inkey()
+    if len(sys.argv) >= 2:
+        selection = sys.argv[1]
+    else:
+        with term.fullscreen(), term.cbreak():
+            print(term.home + term.clear)
+            while selection not in ('1', '2', '3', '4', '5', 'q'):
+                print(term.move_y(1) + "Please select an option:")
+                print("1. SSH into build machine")
+                print("2. SSH into bench machine")
+                print("3. Start/Stop spot machines")
+                print("4. Manage Running Jobs")
+                print("5. Run ci.yml manually")
+                print("q. Quit")
+                with term.location(0, term.height - 1):
+                    selection = term.inkey()

     if selection == '1':
         ssh_into_machine('x86')
@@ -31,42 +36,55 @@ def main():
         manage_spot_instances()
     elif selection == '4':
         manage_ci_workflows()
+    elif selection == '5':
+        call_ci_workflow()

 def ssh_into_machine(suffix):
-    GITHUB_ACTOR = os.getenv('GITHUB_ACTOR', 'default_actor')
     ssh_key_path = os.path.expanduser('~/.ssh/build_instance_key')
     if not os.path.exists(ssh_key_path):
         print("SSH key does not exist.")
         return

-    # Command to get the instance information
-    cmd = f'aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=aztec-packages-{GITHUB_ACTOR}-{suffix}" --output json --region us-east-2'
-    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-    if result.returncode != 0:
-        print("Failed to get AWS instances:", result.stderr)
-        return
-
-    # Parse the output to find the public IP address
-    try:
-        instances_data = json.loads(result.stdout)
-        instance = instances_data['Reservations'][0]['Instances'][0]
-        instance_ip = instance['PublicIpAddress']
-    except (KeyError, IndexError, json.JSONDecodeError) as e:
-        print("Error parsing AWS CLI output:", e)
-        return
+    for i in range(10):
+        # Command to get the instance information
+        cmd = f'aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=aztec-packages-{GITHUB_ACTOR}-{suffix}" --output json --region us-east-2'
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+        if result.returncode != 0:
+            print("Failed to get AWS instances:", result.stderr)
+            return
+        try:
+            instances_data = json.loads(result.stdout)
+            instance = instances_data['Reservations'][0]['Instances'][0]
+            instance_ip = instance['PublicIpAddress']
+            break
+        except (KeyError, IndexError, json.JSONDecodeError) as e:
+            print("Error parsing AWS CLI output, trying again:", e)
+        if i == 0:
+            print("Couldn't find spot, starting spot, and looping until we can find it")
+            call_spot_workflow('start')
+        elif i == 9:
+            print("Couldn't find spot even after creating it!")
+            sys.exit(1)
+        time.sleep(10)

     # SSH command using the public IP
     ssh_cmd = f"ssh -o StrictHostKeychecking=no -i {ssh_key_path} ubuntu@{instance_ip}"
-    print(f"Connecting to {instance_ip}. Consider delaying the impeding shutdown.")
+    print(f"Connecting to {instance_ip}. Consider delaying the impending shutdown and running a process called Runner.Worker to fool the reaper (automation TODO).")
     ssh_process = subprocess.Popen(ssh_cmd, shell=True)
     ssh_process.wait()  # Wait for the SSH session to complete

+def call_spot_workflow(action):
+    subprocess.run(f'gh workflow run start-spot.yml --ref {BRANCH} --field username="{GITHUB_ACTOR}" --field action="{action}"', shell=True)
+
+def call_ci_workflow():
+    print(
+        "NOTE: This is mostly useful if impersonating a GITHUB_ACTOR. Usually you rather do Manage Running Jobs and retry."
+    )
+    subprocess.run(f'gh workflow run ci.yml --ref {BRANCH} --field username="{GITHUB_ACTOR}"', shell=True)

 def manage_spot_instances():
-    action = input("Enter 'start' to run or 'stop' to stop spot instances: ")
-    if action == 'start':
-        subprocess.run('gh workflow run start-spot.yml', shell=True)
-    elif action == 'stop':
-        subprocess.run('gh workflow run stop-spot.yml', shell=True)
+    call_spot_workflow(input("Enter one of 'start', 'stop', 'restart':"))

 def manage_ci_workflows():
     # Retrieve the most recent workflow run
@@ -86,7 +104,7 @@ def manage_ci_workflows():
         subprocess.run(f"gh run cancel {run_id}", shell=True)
     if action.lower() == 'rerun':
         # needed so the spot runners still work
-        subprocess.run('gh workflow run start-spot.yml', shell=True)
+        call_spot_workflow('start')
         subprocess.run(f"gh run rerun {run_id} --failed", shell=True)
     elif action.lower() == 'rerun-all':
         subprocess.run(f"gh run rerun {run_id}", shell=True)

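Taken together, a sketch of the impersonation flow this commit enables (the actor name is an example; ci.py reads GITHUB_ACTOR at startup, and menu option 5 dispatches ci.yml with the username field set):

import os, subprocess

# Debug another user's runner (e.g. AztecBot's full disk) by impersonating
# them: GITHUB_ACTOR feeds the username field, so runner labels like
# "<actor>-x86" resolve to that user's spot machines. Assumes ci.py is
# executable and gh is authenticated.
os.environ["GITHUB_ACTOR"] = "AztecBot"  # example actor
subprocess.run("./ci.py 5", shell=True)  # option 5 = "Run ci.yml manually"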