diff --git a/build-system/scripts/create_ecr_manifest b/build-system/scripts/create_ecr_manifest index 44d887a190f..2e9fad6ae68 100755 --- a/build-system/scripts/create_ecr_manifest +++ b/build-system/scripts/create_ecr_manifest @@ -27,9 +27,9 @@ for A in $ARCH_LIST do ARCH_IMAGE=$IMAGE_URI-$A echo "Adding image $ARCH_IMAGE to manifest list." - docker manifest create $IMAGE_URI --amend $ARCH_IMAGE + retry docker manifest create $IMAGE_URI --amend $ARCH_IMAGE done IFS=$OLD_IFS unset OLD_IFS -docker manifest push --purge $IMAGE_URI +retry docker manifest push --purge $IMAGE_URI diff --git a/build-system/scripts/remote_run_script b/build-system/scripts/remote_run_script index 8aa4e0ea64f..e9ef84d723d 100755 --- a/build-system/scripts/remote_run_script +++ b/build-system/scripts/remote_run_script @@ -15,6 +15,7 @@ shift SSH_CONFIG_PATH=${SSH_CONFIG_PATH:-$BUILD_SYSTEM_PATH/remote/ssh_config} # Copy the runner script to spot instance. This is what we actually run. +echo "Copying ./remote_runner to $IP..." scp -rF $SSH_CONFIG_PATH $BUILD_SYSTEM_PATH/scripts/remote_runner $IP:. # Run script on remote instance, passing environment variables. diff --git a/build-system/scripts/request_spot b/build-system/scripts/request_spot index 40d7d9a585a..9e35b9b769a 100755 --- a/build-system/scripts/request_spot +++ b/build-system/scripts/request_spot @@ -24,7 +24,7 @@ INSTANCE_TYPE_SUFFIX=${cpu_map[$CPUS]} # Check if INSTANCE_TYPE_SUFFIX is set, if not, the CPU count is not recognized. if [ -z "$INSTANCE_TYPE_SUFFIX" ]; then - echo "Unrecognized CPU count: $CPUS" + >&2 echo "Unrecognized CPU count: $CPUS" exit 1 fi @@ -110,6 +110,11 @@ done # Wait till ssh port is open. >&2 echo "Waiting for SSH at $IP..." -while ! nc -z $IP 22; do sleep 1; done; - -echo $IP +for I in {1..60}; do + if nc -z $IP 22; then + echo $IP + exit 0 + fi + sleep 1 +done +exit 1 diff --git a/build-system/scripts/retry b/build-system/scripts/retry index 88cbeb6789e..0489aa226fd 100755 --- a/build-system/scripts/retry +++ b/build-system/scripts/retry @@ -3,5 +3,5 @@ ATTEMPTS=3 for i in $(seq 1 $ATTEMPTS); do "$@" && exit || sleep 10 done -echo "$@ failed after $ATTEMPTS attempts" +>&2 echo "$@ failed after $ATTEMPTS attempts" exit 1 diff --git a/build-system/scripts/spot_run_script b/build-system/scripts/spot_run_script index d8cd288070a..69707de660f 100755 --- a/build-system/scripts/spot_run_script +++ b/build-system/scripts/spot_run_script @@ -12,26 +12,31 @@ CONTENT_HASH=$1 CPUS=$2 shift 2 -# On any sort of exit (error or not), kill spot request so it doesn't count against quota. +# On any sort of exit (error or not). function on_exit { + set +e + + if [ -n "$IP" ]; then + echo "Terminating spot instance..." + ssh -F $SSH_CONFIG_PATH $IP sudo halt -p > /dev/null 2>&1 + fi + + # Kill spot request so it doesn't count against quota. if [ -f "sir-$CONTENT_HASH:$JOB_NAME.txt" ]; then SIR=$(cat sir-$CONTENT_HASH:$JOB_NAME.txt) - echo "Cancelling spot instance request $SIR (silently)" + echo "Cancelling spot instance request $SIR..." aws ec2 cancel-spot-instance-requests --spot-instance-request-ids $SIR >/dev/null 2>&1 || true fi } trap on_exit EXIT # Get spot instance. -IP=$(request_spot $CONTENT_HASH:$JOB_NAME $CPUS) +IP=$(retry request_spot $CONTENT_HASH:$JOB_NAME $CPUS) -# Run script remotely on spot instance, capturing success or failure. -set +e -remote_run_script $IP $@ -CODE=$? - -# Shutdown spot. -echo "Terminating spot instance..." -ssh -F $SSH_CONFIG_PATH $IP sudo halt -p > /dev/null 2>&1 +if [ -z "$IP" ]; then + echo "Failed to get spot instance." + exit 1 +fi -exit $CODE +# Run script remotely on spot instance, capturing success or failure. +remote_run_script $IP $@ \ No newline at end of file