diff --git a/docker/ci-runner/guest/entrypoint.01-ci-storage-load.sh b/docker/ci-runner/guest/entrypoint.01-ci-storage-load.sh index b56a229..62face2 100644 --- a/docker/ci-runner/guest/entrypoint.01-ci-storage-load.sh +++ b/docker/ci-runner/guest/entrypoint.01-ci-storage-load.sh @@ -17,7 +17,7 @@ cd "$local_dir" 2>/dev/null || true EOT if [[ "$CI_STORAGE_HOST" != "" && -f ~/.ssh/id_rsa ]]; then - say "Running the initial \"ci-storage load\" for $local_dir..." + say "Running the initial \"ci-storage load\" for $local_dir in background..." ci-storage load \ --storage-host="$CI_STORAGE_HOST" \ --storage-dir="$WORK_DIR/$GH_REPOSITORY/$(realpath "$local_dir" | tr / _)" \ diff --git a/docker/ci-runner/guest/entrypoint.05-config.sh b/docker/ci-runner/guest/entrypoint.05-config.sh index 31c84ff..b829798 100644 --- a/docker/ci-runner/guest/entrypoint.05-config.sh +++ b/docker/ci-runner/guest/entrypoint.05-config.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Configures self-hosted runner and sets up graceful shutdown handling. +# Configures self-hosted runner. # # Here we make an opinionated decision to NOT use ephemeral or jit acton # runners. Reasons: @@ -14,8 +14,8 @@ # - One downside happens when a runner container dies unexpectedly (rare). In # this case, regular "offline" long-living runners are auto-removed by GitHub # itself once in 2 weeks, whilst ephemeral (or jit) "offline" runners are -# auto-removed in 1 day. But we anyways need to implement some manual removal -# cycle exernally, since even 1 day is way too much for garbage accumulation. +# auto-removed in 1 day. But we anyways implement the manual removal cycle in +# ci-scaler, since even 1 day is way too much for garbage accumulation. # set -u -e @@ -42,35 +42,3 @@ token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/registr --disableupdate \ --replace -cleanup() { - say "Received graceful shutdown signal $1..." 
- - # A debug facility to test, how much time does the orchestrator give the - # container to gracefully shutdown before killing it. - if [[ "$DEBUG_SHUTDOWN_DELAY_SEC" != "" ]]; then - say "Artificially delaying shutdown for $DEBUG_SHUTDOWN_DELAY_SEC second(s)..." - count=0 - while [[ $count -lt "$DEBUG_SHUTDOWN_DELAY_SEC" ]]; do - sleep 1 - count=$((count + 1)) - say " ...$count seconds elapsed" - done - fi - - # Retry deleting the runner until it succeeds. - # - Busy runner fails in deletion, so we can retry safely until it becomes - # idle and is successfully deleted. - # - In case we can't delete the runner for a long time still, the extrnal - # orchestrator will eventually kill the container after a large timeout - # (say, 15 minutes or so) needed for a running job to finish. - say "Removing the runner..." - while :; do - token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/remove-token") - cd ~/actions-runner && ./config.sh remove --token "$token" && break - sleep 5 - say "Retrying till the runner becomes idle and the removal succeeds..." - done -} - -trap "cleanup SIGINT; exit 130" INT -trap "cleanup SIGHUP; exit 143" TERM diff --git a/docker/ci-runner/guest/entrypoint.06-terminate-on-signal.sh b/docker/ci-runner/guest/entrypoint.06-terminate-on-signal.sh new file mode 100644 index 0000000..1b3110d --- /dev/null +++ b/docker/ci-runner/guest/entrypoint.06-terminate-on-signal.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Sets up graceful shutdown handling for the self-hosted runner. +# +# Here we make an opinionated decision to NOT use ephemeral or jit action +# runners. Reasons: +# - We WANT to reuse the work directory across job runs, that's the whole point +# of ci-storage architecture and its speedup benefits. So once the runner +# finishes some job, we do NOT want it to terminate (as it does in ephemeral +# or jit mode), we want it to CONTINUE listening for more jobs to run. 
+# - GitHub doesn't allow to remove busy runners via API, which is very good for
+#   us: in case the container shuts down externally due to downscaling, we just
+#   enter the graceful retry loop to delete the corresponding runner via API.
+# - One downside happens when a runner container dies unexpectedly (rare). In
+#   this case, regular "offline" long-living runners are auto-removed by GitHub
+#   itself once in 2 weeks, whilst ephemeral (or jit) "offline" runners are
+#   auto-removed in 1 day. But we anyways implement the manual removal cycle in
+#   ci-scaler, since even 1 day is way too much for garbage accumulation.
+#
+set -u -e
+
+# Handles shutdown signal $1: after an optional debug delay, unregisters the
+# runner from GitHub, retrying until the removal succeeds.
+terminate_on_signal() {
+  local delay_sec="${DEBUG_SHUTDOWN_DELAY_SEC:-}" count token
+  say "Received graceful shutdown signal $1..."
+
+  # Debug facility: test how long the orchestrator waits before the kill.
+  if [[ "$delay_sec" != "" ]]; then
+    say "Artificially delaying shutdown for $delay_sec second(s)..."
+    for ((count = 1; count <= delay_sec; count++)); do
+      sleep 1
+      say " ...$count seconds elapsed"
+    done
+  fi
+
+  # Retry deleting the runner until it succeeds.
+  # - Busy runner fails in deletion, so we can retry safely until it becomes
+  #   idle and is successfully deleted. The "&&" chain keeps any failed step
+  #   (including the token request) from tripping "set -e" instead of retrying.
+  # - If removal keeps failing, the external orchestrator eventually kills the
+  #   container after a large timeout (say, 15 minutes or so) for a running job.
+  say "Removing the runner..."
+  while :; do
+    token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/remove-token") &&
+      cd ~/actions-runner && ./config.sh remove --token "$token" && break
+    sleep 5
+    say "Retrying till the runner becomes idle and the removal succeeds..."
+  done
+}
+
+trap "terminate_on_signal SIGINT; exit 130" INT
+trap "terminate_on_signal SIGTERM; exit 143" TERM
diff --git a/docker/ci-runner/guest/entrypoint.20-check-runner-health.sh b/docker/ci-runner/guest/entrypoint.20-check-runner-health.sh
new file mode 100644
index 0000000..288a4b7
--- /dev/null
+++ b/docker/ci-runner/guest/entrypoint.20-check-runner-health.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# GitHub Runners have some bugs. They sometimes don't die, and instead get stuck
+# in a desperate restart loop. Here we work it around.
+#
+# Related GitHub issue: https://github.com/actions/runner/issues/2507
+#
+set -u -e
+
+rm -f ~/actions-runner/_diag/*.log
+
+# Polls the most recently modified runner diagnostic log every 5 seconds. Once
+# the log contains the "deadly" message, prints the surrounding lines and sends
+# SIGINT to the process with PID $1, then returns.
+check_runner_health_loop() {
+  local pid="$1"
+  local unhealthy_re="Registration was not found or is not medium trust"
+  local log error
+
+  while :; do
+    # "find" lists files in no particular order, so pick the newest log
+    # explicitly by its modification time.
+    log=$(
+      find ~/actions-runner/_diag -name "*.log" -printf "%T@ %p\n" \
+        | sort -n | tail -n1 | cut -d" " -f2-
+    )
+    if [[ "$log" != "" ]]; then
+      # Find the very last line matching the regexp (with 15 lines of context).
+      error=$(tail -n 500 "$log" | tac | grep -m1 -E -B15 -A15 "$unhealthy_re" | tac)
+      if [[ "$error" != "" ]]; then
+        say "Deadly message found in $log, terminating self. Last log lines:"
+        say "---------------------"
+        printf '%s\n' "$error"
+        say "---------------------"
+        kill -SIGINT "$pid"
+        return
+      fi
+    fi
+    sleep 5
+  done
+}
+
+check_runner_health_loop "$$" &
diff --git a/docker/ci-runner/guest/entrypoint.99-run.sh b/docker/ci-runner/guest/entrypoint.99-run.sh
index bf69a89..580746a 100644
--- a/docker/ci-runner/guest/entrypoint.99-run.sh
+++ b/docker/ci-runner/guest/entrypoint.99-run.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 #
 # In the very end, runs the self-hosted runner and waits for its termination. In
-# case a SIGINT or SIGHUP are received, they will be processed by the cleanup()
-# function defined in the config script above.
+# case a SIGINT or SIGTERM is received, it will be processed by the
+# terminate_on_signal() function from entrypoint.06-terminate-on-signal.sh.
 #
 set -u -e
 
@@ -16,4 +16,7 @@ while :; do
 done
 
 say "Starting the self-hosted runner..."
+ +# Use "& wait $!" and not "exec": bash runs the terminate_on_signal() trap only +# after a foreground command exits, whereas "wait" returns on a signal at once. cd ~/actions-runner && ./run.sh & wait $!