chore(ci): better docker prune (#5889)
Docker prune without -a could still allow continuous accumulation of
space. Now that can only happen if you keep hitting the same spot
instance, and as long as that spot is up I don't want to prune (yet),
so as not to prune while other jobs are saving images.

Bundled:
- minor workflow streamlining/flexibility for impersonating
- ci.py support for impersonating
This was relevant to debugging our friend AztecBot's disk.
ludamad authored Apr 22, 2024
1 parent 1c24c8e commit b5a8e02
Showing 6 changed files with 90 additions and 80 deletions.
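The headline change is the -a flag on the runner's periodic prune. For context (a minimal sketch, not part of the diff): docker system prune -f removes only stopped containers, unused networks, dangling images, and dangling build cache, so tagged-but-unreferenced images can pile up forever on a long-lived runner; -a additionally removes any image not used by a container. The hypothetical helper below mirrors ci.py's subprocess style:

import subprocess

def prune_docker(all_images: bool = True) -> None:
    # -f skips the confirmation prompt. Without -a, only dangling images
    # are pruned, so tagged-but-unused images accumulate indefinitely;
    # with -a, any image not referenced by a container is removed too.
    flags = "-f -a" if all_images else "-f"
    # The workflow appends "|| true"; check=False is the equivalent here,
    # so a failed prune never fails the surrounding job.
    subprocess.run(f"docker system prune {flags}", shell=True, check=False)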
3 changes: 0 additions & 3 deletions .github/ci-setup-action/action.yml
@@ -9,9 +9,6 @@ inputs:
   concurrency_key:
     required: false
     description: 'Concurrency key for locking jobs'
-  concurrency_token:
-    required: false
-    description: 'TODO unused'
 runs:
   # define an action, runs in OS of caller
   using: composite
1 change: 0 additions & 1 deletion .github/workflows/ci-arm.yml
@@ -36,7 +36,6 @@ jobs:
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_token: "${{ secrets.AZTEC_GITHUB_TOKEN }}"
           # must be globally unique for build x runner
           concurrency_key: build-master-arm
       # prepare images locally, tagged by commit hash
52 changes: 20 additions & 32 deletions .github/workflows/ci.yml
@@ -5,13 +5,12 @@ on:
   pull_request: {}
   workflow_dispatch:
     inputs:
+      username:
+        description: 'Defaults to GitHub Actor'
+        required: false
       runner_action:
         description: "The action to take with the self-hosted runner (start, stop, restart)."
         required: false
-      just_start_spot:
-        description: "Should we just run spots?"
-        type: boolean
-        required: false
 concurrency:
   # force parallelism in master
   group: ci-${{ github.ref_name == 'master' && github.run_id || github.ref_name }}
@@ -20,27 +19,26 @@ jobs:
   setup:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-x86
+      runner_label: ${{ inputs.username || github.actor }}-x86
       ebs_cache_size_gb: 256
       runner_concurrency: 20
-      subaction: ${{ github.event.inputs.runner_action || 'start' }}
+      subaction: ${{ inputs.runner_action || 'start' }}
       ec2_instance_type: m6a.32xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 40 # refreshed by jobs
     secrets: inherit

   build:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
-    if: ${{ github.event.inputs.just_start_spot != 'true' }}
+    runs-on: ${{ inputs.username || github.actor }}-x86
     outputs:
       e2e_list: ${{ steps.e2e_list.outputs.list }}
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: build-${{ github.actor }}-x86
+          concurrency_key: build-${{ inputs.username || github.actor }}-x86
       # prepare images locally, tagged by commit hash
       - name: "Build E2E Image"
         timeout-minutes: 40
@@ -54,7 +52,7 @@ jobs:
   # all the end-to-end integration tests for aztec
   e2e:
     needs: build
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
     strategy:
       fail-fast: false
       matrix:
@@ -65,7 +63,7 @@ jobs:
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
           # must be globally unique for build x runner
-          concurrency_key: e2e-${{ github.actor }}-x86-${{ matrix.test }}
+          concurrency_key: e2e-${{ inputs.username || github.actor }}-x86-${{ matrix.test }}
       - name: Test
         working-directory: ./yarn-project/end-to-end/
         timeout-minutes: 25
@@ -78,7 +76,7 @@ jobs:
   # only ran on x86 for resource reasons (memory intensive)
   bb-native-tests:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
    strategy:
       fail-fast: false
     steps:
@@ -88,7 +86,7 @@ jobs:
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
           # must be globally unique for build x runner
-          concurrency_key: bb-native-tests-${{ github.actor }}-x86
+          concurrency_key: bb-native-tests-${{ inputs.username || github.actor }}-x86
       - name: "Native Prover Tests"
         working-directory: ./barretenberg/cpp/
         timeout-minutes: 25
@@ -98,15 +96,14 @@ jobs:
   # push benchmarking binaries to dockerhub registry
   bb-bench-binaries:
     needs: setup
-    runs-on: ${{ github.actor }}-x86
+    runs-on: ${{ inputs.username || github.actor }}-x86
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: bb-bench-binaries-${{ github.actor }}-x86
+          concurrency_key: bb-bench-binaries-${{ inputs.username || github.actor }}-x86
       - name: Build and Push Binaries
-        if: ${{ github.event.inputs.just_start_spot != 'true' }}
         timeout-minutes: 15
         working-directory: ./barretenberg/cpp/
         run: earthly-ci --push +bench-binaries
@@ -115,24 +112,24 @@ jobs:
     uses: ./.github/workflows/setup-runner.yml
     needs: bb-bench-binaries
     with:
-      runner_label: ${{ github.actor }}-bench-x86
+      runner_label: ${{ inputs.username || github.actor }}-bench-x86
       ebs_cache_size_gb: 64
       runner_concurrency: 1
-      subaction: ${{ github.event.inputs.runner_action || 'start' }}
+      subaction: ${{ inputs.runner_action || 'start' }}
       ec2_instance_type: m6a.4xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 15 # refreshed by jobs
     secrets: inherit

   bb-bench:
-    runs-on: ${{ github.actor }}-bench-x86
+    runs-on: ${{ inputs.username || github.actor }}-bench-x86
     needs: setup-bench
     steps:
       - {uses: actions/checkout@v4, with: { ref: "${{ github.event.pull_request.head.sha }}"}}
       - uses: ./.github/ci-setup-action
         with:
           dockerhub_password: "${{ secrets.DOCKERHUB_PASSWORD }}"
-          concurrency_key: bb-bench-${{ github.actor }}-bench-x86
+          concurrency_key: bb-bench-${{ inputs.username || github.actor }}-bench-x86
       # Use bench_mode=cache to read the pushed build above
       - name: Client IVC Bench
         working-directory: ./barretenberg/cpp/
@@ -145,23 +142,14 @@ jobs:
         run: earthly-ci --no-output +bench-ultra-honk --bench_mode=cache

   merge-check:
-    runs-on: ubuntu-latest
+    runs-on: ${{ inputs.username || github.actor }}-x86
     needs: [e2e, bb-native-tests, bb-bench]
-    if: always() # Ensures this job runs regardless of the success or failure of dependencies.
     steps:
-      - run: |
-          echo "E2E Test Status: ${{ needs.e2e.result }}"
-          echo "Native Tests Status: ${{ needs.bb-native-tests.result }}"
-          echo "Bench Tests Status: ${{ needs.bb-bench.result }}"
-          if [[ "${{ needs.e2e.result }}" != 'success' || "${{ needs.bb-native-tests.result }}" != 'success' || "${{ needs.bb-bench.result }}" != 'success' ]]; then
-            echo "Pull request merging not allowed due to failures."
-            exit 1
-          fi
-          echo "Pull request merging now allowed."
+      - run: echo Pull request merging now allowed.

   notify:
-    runs-on: ubuntu-latest
     needs: [e2e, bb-native-tests, bb-bench]
+    runs-on: ubuntu-latest
     if: ${{ github.ref == 'refs/heads/master' && failure() }}
     steps:
       - name: Send notification to aztec3-ci channel if workflow failed on master
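The recurring ${{ inputs.username || github.actor }} expression is what makes impersonation work: GitHub's || returns its first truthy operand, so an empty or absent username input falls back to whoever triggered the workflow. A minimal Python sketch of the same fallback (hypothetical helper, illustration only):

def runner_label(inputs: dict, github_actor: str, suffix: str = "x86") -> str:
    # Mirrors ${{ inputs.username || github.actor }}-x86: "or" yields the
    # first truthy operand, so ""/None falls back to the triggering actor.
    return f"{inputs.get('username') or github_actor}-{suffix}"

assert runner_label({}, "ludamad") == "ludamad-x86"
assert runner_label({"username": "AztecBot"}, "ludamad") == "AztecBot-x86"

Note also why merge-check can drop if: always() together with the explicit result checks: by default a job with needs runs only when every dependency succeeded, so reaching the echo step already implies all upstream jobs passed.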
4 changes: 2 additions & 2 deletions .github/workflows/setup-runner.yml
@@ -58,7 +58,7 @@ jobs:
       group: start-builder-${{ inputs.runner_label }}
     steps:
       - name: Start EC2 runner
-        uses: AztecProtocol/ec2-action-builder@v0.15
+        uses: AztecProtocol/ec2-action-builder@v0.14e
         with:
           github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }}
           aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -117,7 +117,7 @@ jobs:
       - name: Run Docker Prune
         # helps to not overuse space
-        run: docker system prune -f || true
+        run: docker system prune -f -a || true

       - name: Run Earthly Bootstrap
         run: earthly bootstrap
24 changes: 16 additions & 8 deletions .github/workflows/start-spot.yml
@@ -1,27 +1,35 @@
 # Useful if the spot runners are in a bad state
-name: Start Personal Spot
+name: Start/Stop Personal Spot
 on:
-  workflow_dispatch: {}
+  workflow_dispatch:
+    inputs:
+      username:
+        description: 'Defaults to GitHub Actor'
+        required: false
+      action:
+        description: 'Can also be stop or restart, defaults to start'
+        required: false
+        default: 'start'
 jobs:
-  stop-build-x86:
+  start-build:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-x86
+      runner_label: ${{ inputs.username || github.actor }}-x86
       ebs_cache_size_gb: 256
       runner_concurrency: 20
-      subaction: start
+      subaction: ${{ inputs.action }}
       ec2_instance_type: m6a.32xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 40 # refreshed by jobs
     secrets: inherit

-  stop-bench:
+  start-bench:
     uses: ./.github/workflows/setup-runner.yml
     with:
-      runner_label: ${{ github.actor }}-bench-x86
+      runner_label: ${{ inputs.username || github.actor }}-bench-x86
       ebs_cache_size_gb: 64
       runner_concurrency: 1
-      subaction: start
+      subaction: ${{ inputs.action }}
       ec2_instance_type: m6a.4xlarge
       ec2_ami_id: ami-04d8422a9ba4de80f
       ec2_instance_ttl: 15 # refreshed by jobs
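With the new inputs, a wedged spot runner can be bounced for any user straight from the CLI; this is exactly what ci.py's call_spot_workflow (below) does. A usage sketch, where the branch and username values are examples:

import subprocess

# Restart another user's spot runners; "master" and "AztecBot" are examples.
subprocess.run(
    'gh workflow run start-spot.yml --ref master '
    '--field username="AztecBot" --field action="restart"',
    shell=True,
)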
86 changes: 52 additions & 34 deletions ci.py
@@ -1,27 +1,32 @@
 #!/usr/bin/env python3
 # ubuntu: apt install python3-blessed
 from blessed import Terminal
-import os, json, subprocess, sys
+import os, json, subprocess, sys, time

 term = Terminal()
+if 'GITHUB_ACTOR' not in os.environ:
+    print("Make sure you have GITHUB_ACTOR in your environment variables e.g. .zshrc")
+    sys.exit(1)
+GITHUB_ACTOR = os.environ['GITHUB_ACTOR']
+BRANCH = subprocess.run("git rev-parse --abbrev-ref HEAD", shell=True, text=True, capture_output=True).stdout.strip()

 def main():
     selection = -1
-    with term.fullscreen(), term.cbreak():
-        print(term.home + term.clear)
-        while selection not in ('1', '2', '3', '4', 'q'):
-            print(term.move_y(1) + "Please select an option:")
-            print("1. SSH into build machine")
-            print("2. SSH into bench machine")
-            print("3. Start/Stop spot machines")
-            print("4. Manage Running Jobs")
-            print("q. Quit")
-            with term.location(0, term.height - 1):
-                selection = term.inkey()
+    if len(sys.argv) >= 2:
+        selection = sys.argv[1]
+    else:
+        with term.fullscreen(), term.cbreak():
+            print(term.home + term.clear)
+            while selection not in ('1', '2', '3', '4', '5', 'q'):
+                print(term.move_y(1) + "Please select an option:")
+                print("1. SSH into build machine")
+                print("2. SSH into bench machine")
+                print("3. Start/Stop spot machines")
+                print("4. Manage Running Jobs")
+                print("5. Run ci.yml manually")
+                print("q. Quit")
+                with term.location(0, term.height - 1):
+                    selection = term.inkey()

     if selection == '1':
         ssh_into_machine('x86')
@@ -31,42 +36,55 @@ def main():
         manage_spot_instances()
     elif selection == '4':
         manage_ci_workflows()
+    elif selection == '5':
+        call_ci_workflow()

 def ssh_into_machine(suffix):
-    GITHUB_ACTOR = os.getenv('GITHUB_ACTOR', 'default_actor')
     ssh_key_path = os.path.expanduser('~/.ssh/build_instance_key')
     if not os.path.exists(ssh_key_path):
         print("SSH key does not exist.")
         return

-    # Command to get the instance information
-    cmd = f'aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=aztec-packages-{GITHUB_ACTOR}-{suffix}" --output json --region us-east-2'
-    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-    if result.returncode != 0:
-        print("Failed to get AWS instances:", result.stderr)
-        return
-
-    # Parse the output to find the public IP address
-    try:
-        instances_data = json.loads(result.stdout)
-        instance = instances_data['Reservations'][0]['Instances'][0]
-        instance_ip = instance['PublicIpAddress']
-    except (KeyError, IndexError, json.JSONDecodeError) as e:
-        print("Error parsing AWS CLI output:", e)
-        return
+    for i in range(10):
+        # Command to get the instance information
+        cmd = f'aws ec2 describe-instances --filters "Name=instance-state-name,Values=running" "Name=tag:Name,Values=aztec-packages-{GITHUB_ACTOR}-{suffix}" --output json --region us-east-2'
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+        if result.returncode != 0:
+            print("Failed to get AWS instances:", result.stderr)
+            return
+        try:
+            instances_data = json.loads(result.stdout)
+            instance = instances_data['Reservations'][0]['Instances'][0]
+            instance_ip = instance['PublicIpAddress']
+            break
+        except (KeyError, IndexError, json.JSONDecodeError) as e:
+            print("Error parsing AWS CLI output, trying again:", e)
+        if i == 0:
+            print("Couldn't find spot, starting spot, and looping until we can find it")
+            call_spot_workflow('start')
+        elif i == 9:
+            print("Couldn't find spot even after creating it!")
+            sys.exit(1)
+        time.sleep(10)

     # SSH command using the public IP
     ssh_cmd = f"ssh -o StrictHostKeychecking=no -i {ssh_key_path} ubuntu@{instance_ip}"
-    print(f"Connecting to {instance_ip}. Consider delaying the impeding shutdown.")
+    print(f"Connecting to {instance_ip}. Consider delaying the impending shutdown and running a process called Runner.Worker to fool the reaper (automation TODO).")
     ssh_process = subprocess.Popen(ssh_cmd, shell=True)
     ssh_process.wait()  # Wait for the SSH session to complete

+def call_spot_workflow(action):
+    subprocess.run(f'gh workflow run start-spot.yml --ref {BRANCH} --field username="{GITHUB_ACTOR}" --field action="{action}"', shell=True)
+
+def call_ci_workflow():
+    print(
+        "NOTE: This is mostly useful if impersonating a GITHUB_ACTOR. Usually you rather do Manage Running Jobs and retry."
+    )
+    subprocess.run(f'gh workflow run ci.yml --ref {BRANCH} --field username="{GITHUB_ACTOR}"', shell=True)

 def manage_spot_instances():
-    action = input("Enter 'start' to run or 'stop' to stop spot instances: ")
-    if action == 'start':
-        subprocess.run('gh workflow run start-spot.yml', shell=True)
-    elif action == 'stop':
-        subprocess.run('gh workflow run stop-spot.yml', shell=True)
+    call_spot_workflow(input("Enter one of 'start', 'stop', 'restart':"))

 def manage_ci_workflows():
     # Retrieve the most recent workflow run
@@ -86,7 +104,7 @@ def manage_ci_workflows():
         subprocess.run(f"gh run cancel {run_id}", shell=True)
     if action.lower() == 'rerun':
         # needed so the spot runners still work
-        subprocess.run('gh workflow run start-spot.yml', shell=True)
+        call_spot_workflow('start')
         subprocess.run(f"gh run rerun {run_id} --failed", shell=True)
     elif action.lower() == 'rerun-all':
         subprocess.run(f"gh run rerun {run_id}", shell=True)

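Taken together, a sketch of the impersonation flow this commit enables (the actor name is an example; ci.py reads GITHUB_ACTOR at startup, and menu option 5 dispatches ci.yml with the username field set):

import os, subprocess

# Debug another user's runner (e.g. AztecBot's full disk) by impersonating
# them: GITHUB_ACTOR feeds the username field, so runner labels like
# "<actor>-x86" resolve to that user's spot machines. Assumes ci.py is
# executable and gh is authenticated.
os.environ["GITHUB_ACTOR"] = "AztecBot"  # example actor
subprocess.run("./ci.py 5", shell=True)  # option 5 = "Run ci.yml manually"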