diff --git a/.github/ensure-builder/action.yml b/.github/ensure-builder/action.yml index d4a4bff719f..6cd8a433c31 100644 --- a/.github/ensure-builder/action.yml +++ b/.github/ensure-builder/action.yml @@ -25,7 +25,7 @@ runs: echo "runner_label=$USERNAME-$runner_type" >> $GITHUB_OUTPUT if [[ $TYPE == builder-x86 ]]; then # 128-core x86 instance types with least evictions - echo "instance_type=r6in.32xlarge r6a.32xlarge i4i.32xlarge r7iz.32xlarge" >> $GITHUB_OUTPUT + echo "instance_type=i4i.32xlarge m6a.32xlarge m6i.32xlarge m6id.32xlarge m6idn.32xlarge m6in.32xlarge m7a.32xlarge r6a.32xlarge r6i.32xlarge r6id.32xlarge r6in.32xlarge r7iz.32xlarge" >> $GITHUB_OUTPUT echo "ami_id=ami-04d8422a9ba4de80f" >> $GITHUB_OUTPUT echo "ebs_cache_size=256" >> $GITHUB_OUTPUT echo "runner_concurrency=20" >> $GITHUB_OUTPUT diff --git a/.github/ensure-tester-with-images/action.yml b/.github/ensure-tester-with-images/action.yml index 010acf9119b..52bb975d223 100644 --- a/.github/ensure-tester-with-images/action.yml +++ b/.github/ensure-tester-with-images/action.yml @@ -60,14 +60,14 @@ runs: export BUILDER_SPOT_IP=${{ env.BUILDER_SPOT_IP }} export BUILDER_SPOT_KEY=~/.ssh/build_instance_key scripts/run_on_builder " - sudo mkdir -p /var/lib/docker/tmp + sudo mkdir -p /var/lib/docker/tmp-images - sudo flock /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.lock bash -c ' - if ! [ -f /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli ] ; then - docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp - mv /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli + sudo flock /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.lock bash -c ' + if ! [ -f /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli ] ; then + docker save aztecprotocol/aztec:${{ env.IMAGE_KEY }} aztecprotocol/end-to-end:${{ env.IMAGE_KEY }} | brotli -2 > /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli.tmp + mv /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli.tmp /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli fi' - sudo cat /var/lib/docker/tmp/${{ env.IMAGE_KEY }}.brotli + sudo cat /var/lib/docker/tmp-images/${{ env.IMAGE_KEY }}.brotli " | brotli --decompress | docker load - name: Test diff --git a/.github/ensure-tester/action.yml b/.github/ensure-tester/action.yml index 4ad920921a5..24d61cbbb65 100644 --- a/.github/ensure-tester/action.yml +++ b/.github/ensure-tester/action.yml @@ -38,7 +38,7 @@ runs: elif [[ $TYPE == 128core-* ]]; then SIZE=32xlarge fi - echo "instance_type=m6a.$SIZE r6in.$SIZE r6a.$SIZE i4i.$SIZE r7iz.$SIZE" >> $GITHUB_OUTPUT + echo "instance_type=i4i.$SIZE m6a.$SIZE m6i.$SIZE m6id.$SIZE m6idn.$SIZE m6in.$SIZE m7a.$SIZE r6a.$SIZE r6i.$SIZE r6id.$SIZE r6in.$SIZE r7iz.$SIZE" >> $GITHUB_OUTPUT - name: Start Tester uses: ./.github/spot-runner-action diff --git a/.github/spot-runner-action/dist/index.js b/.github/spot-runner-action/dist/index.js index 666658bd34c..7ecbd8d91de 100644 --- a/.github/spot-runner-action/dist/index.js +++ b/.github/spot-runner-action/dist/index.js @@ -260,13 +260,6 @@ class Ec2Instance { LaunchTemplateData: { ImageId: this.config.ec2AmiId, InstanceInitiatedShutdownBehavior: "terminate", - InstanceRequirements: { - // We do not know what the instance types correspond to - // just let the user send a list of allowed instance types - VCpuCount: { Min: 0 }, - MemoryMiB: { Min: 0 }, - AllowedInstanceTypes: this.config.ec2InstanceType, - }, SecurityGroupIds: [this.config.ec2SecurityGroupId], KeyName: this.config.ec2KeyName, UserData: userDataScript, @@ -326,6 +319,9 @@ class Ec2Instance { Type: "instant", LaunchTemplateConfigs: [fleetLaunchConfig], ClientToken: this.config.clientToken || undefined, + SpotOptions: { + AllocationStrategy: "price-capacity-optimized", + }, TargetCapacitySpecification: { TotalTargetCapacity: 1, OnDemandTargetCapacity: useOnDemand ? 1 : 0, @@ -336,13 +332,13 @@ class Ec2Instance { const client = yield this.getEc2Client(); const fleet = yield client.createFleet(createFleetRequest).promise(); if (fleet.Errors && fleet.Errors.length > 0) { + core.warning(JSON.stringify(fleet.Errors, null, 2)); for (const error of fleet.Errors) { if (error.ErrorCode === "RequestLimitExceeded" || error.ErrorCode === "InsufficientInstanceCapacity") { return error.ErrorCode; } } - core.error(JSON.stringify(fleet.Errors, null, 2)); } const instances = ((fleet === null || fleet === void 0 ? void 0 : fleet.Instances) || [])[0] || {}; return (instances.InstanceIds || [])[0] || ""; @@ -728,11 +724,10 @@ function requestAndWaitForSpot(config) { } let instanceId = ""; for (const ec2Strategy of ec2SpotStrategies) { - let backoff = 1; + let backoff = 0; core.info(`Starting instance with ${ec2Strategy} strategy`); - // 6 * 10000ms = 1 minute per strategy, unless we hit RequestLimitExceeded, then we do exponential backoff - // TODO make longer lived spot request? - for (let i = 0; i < 6; i++) { + const MAX_ATTEMPTS = 3; // uses exponential backoff + for (let i = 0; i < MAX_ATTEMPTS; i++) { // Start instance const instanceIdOrError = yield ec2Client.requestMachine( // we fallback to on-demand @@ -742,15 +737,15 @@ function requestAndWaitForSpot(config) { instanceIdOrError === "InsufficientInstanceCapacity") { core.info("Failed to create instance due to " + instanceIdOrError + - " , waiting 10 seconds and trying again."); - backoff += 1; + ", waiting " + 5 * Math.pow(2, backoff) + " seconds and trying again."); } else { instanceId = instanceIdOrError; break; } // wait 10 seconds - yield new Promise((r) => setTimeout(r, 10000 * Math.pow(2, backoff))); + yield new Promise((r) => setTimeout(r, 5000 * Math.pow(2, backoff))); + backoff += 1; } if (instanceId) { core.info("Successfully requested instance with ID " + instanceId); diff --git a/.github/spot-runner-action/src/ec2.ts b/.github/spot-runner-action/src/ec2.ts index 41c91bcdd4d..42d4f922349 100644 --- a/.github/spot-runner-action/src/ec2.ts +++ b/.github/spot-runner-action/src/ec2.ts @@ -179,13 +179,6 @@ export class Ec2Instance { LaunchTemplateData: { ImageId: this.config.ec2AmiId, InstanceInitiatedShutdownBehavior: "terminate", - InstanceRequirements: { - // We do not know what the instance types correspond to - // just let the user send a list of allowed instance types - VCpuCount: { Min: 0 }, - MemoryMiB: { Min: 0 }, - AllowedInstanceTypes: this.config.ec2InstanceType, - }, SecurityGroupIds: [this.config.ec2SecurityGroupId], KeyName: this.config.ec2KeyName, UserData: userDataScript, @@ -245,6 +238,9 @@ export class Ec2Instance { Type: "instant", LaunchTemplateConfigs: [fleetLaunchConfig], ClientToken: this.config.clientToken || undefined, + SpotOptions: { + AllocationStrategy: "price-capacity-optimized", + }, TargetCapacitySpecification: { TotalTargetCapacity: 1, OnDemandTargetCapacity: useOnDemand ? 1 : 0, @@ -255,6 +251,7 @@ export class Ec2Instance { const client = await this.getEc2Client(); const fleet = await client.createFleet(createFleetRequest).promise(); if (fleet.Errors && fleet.Errors.length > 0) { + core.warning(JSON.stringify(fleet.Errors, null, 2)); for (const error of fleet.Errors) { if ( error.ErrorCode === "RequestLimitExceeded" || @@ -263,7 +260,6 @@ export class Ec2Instance { return error.ErrorCode; } } - core.error(JSON.stringify(fleet.Errors, null, 2)); } const instances: CreateFleetInstance = (fleet?.Instances || [])[0] || {}; return (instances.InstanceIds || [])[0] || ""; diff --git a/.github/spot-runner-action/src/main.ts b/.github/spot-runner-action/src/main.ts index 01397bcfd50..52e4f9e6b3b 100644 --- a/.github/spot-runner-action/src/main.ts +++ b/.github/spot-runner-action/src/main.ts @@ -63,7 +63,8 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise { for (const ec2Strategy of ec2SpotStrategies) { let backoff = 0; core.info(`Starting instance with ${ec2Strategy} strategy`); - for (let i = 0; i < 6; i++) { + const MAX_ATTEMPTS = 3; // uses exponential backoff + for (let i = 0; i < MAX_ATTEMPTS; i++) { // Start instance const instanceIdOrError = await ec2Client.requestMachine( @@ -75,18 +76,18 @@ async function requestAndWaitForSpot(config: ActionConfig): Promise { instanceIdOrError === "RequestLimitExceeded" || instanceIdOrError === "InsufficientInstanceCapacity" ) { - backoff += 1; core.info( "Failed to create instance due to " + instanceIdOrError + - " , waiting " + 10000 * 2 ** backoff + " seconds and trying again." + ", waiting " + 5 * 2 ** backoff + " seconds and trying again." ); } else { instanceId = instanceIdOrError; break; } // wait 10 seconds - await new Promise((r) => setTimeout(r, 10000 * 2 ** backoff)); + await new Promise((r) => setTimeout(r, 5000 * 2 ** backoff)); + backoff += 1; } if (instanceId) { core.info("Successfully requested instance with ID " + instanceId); diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 70315ae2792..966c1536f7c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -346,7 +346,7 @@ jobs: concurrency_key: docs-preview-${{ inputs.username || github.actor }}-x86 - name: "Docs Preview" timeout-minutes: 30 - run: earthly --no-output ./docs/+deploy-preview --PR=${{ github.event.number }} --AZTEC_BOT_COMMENTER_GITHUB_TOKEN=${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} --NETLIFY_AUTH_TOKEN=${{ secrets.NETLIFY_AUTH_TOKEN }} --NETLIFY_SITE_ID=${{ secrets.NETLIFY_SITE_ID }} + run: earthly-ci --no-output ./docs/+deploy-preview --PR=${{ github.event.number }} --AZTEC_BOT_COMMENTER_GITHUB_TOKEN=${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} --NETLIFY_AUTH_TOKEN=${{ secrets.NETLIFY_AUTH_TOKEN }} --NETLIFY_SITE_ID=${{ secrets.NETLIFY_SITE_ID }} bb-bench: runs-on: ubuntu-latest diff --git a/scripts/ci/attach_ebs_cache.sh b/scripts/ci/attach_ebs_cache.sh index 640341814d9..2ca88f3c602 100755 --- a/scripts/ci/attach_ebs_cache.sh +++ b/scripts/ci/attach_ebs_cache.sh @@ -152,9 +152,9 @@ fi # Create a mount point and mount the volume mkdir -p /var/lib/docker mount $BLKDEVICE /var/lib/docker -service docker restart # clear our images temp folder -rm -rf /var/lib/docker/tmp +rm -rf /var/lib/docker/tmp-images +systemctl restart docker # important: everything (except earthly ls) should go through earthly-ci scripts/earthly-ci bootstrap touch /home/ubuntu/.setup-complete \ No newline at end of file diff --git a/scripts/earthly-ci b/scripts/earthly-ci index 05ef1a8d9e5..fad75193c82 100755 --- a/scripts/earthly-ci +++ b/scripts/earthly-ci @@ -11,7 +11,7 @@ OUTPUT_FILE=$(mktemp) INCONSISTENT_GRAPH_STATE_COUNT=0 # Counter for 'inconsistent graph state' errors # Maximum attempts -MAX_ATTEMPTS=8 +MAX_ATTEMPTS=5 ATTEMPT_COUNT=0 # earthly settings @@ -45,9 +45,15 @@ while [ $ATTEMPT_COUNT -lt $MAX_ATTEMPTS ]; do echo "Got 'inconsistent graph state' or 'failed to get state for index'. Sleeping for 30 seconds and retrying." sleep 30 elif grep 'Error: pull ping error: pull ping response' $OUTPUT_FILE >/dev/null; then - echo "Got 'Error: pull ping error: pull ping response', intermittent failure when writing out images to docker" + echo "Got 'Error: pull ping error: pull ping response', intermittent failure when writing out images to docker. If this persists, try 'systemctl restart docker' on the spot instance." elif grep '================================= System Info ==================================' $OUTPUT_FILE >/dev/null; then echo "Detected an Earthly daemon restart, possibly due to it (mis)detecting a cache setting change, trying again..." + elif grep 'dial unix /run/buildkit/buildkitd.sock' $OUTPUT_FILE >/dev/null; then + echo "Detected earthly unable to find buildkit, waiting and trying again..." + sleep 20 + elif grep 'The container name "/earthly-buildkitd" is already in use by container' $OUTPUT_FILE >/dev/null; then + echo "Detected earthly bootstrap happening in parallel and failing, waiting and trying again." + sleep 20 else # If other errors, exit the script exit 1