Skip to content

Commit

Permalink
Work-around stuck runner restart loop on "Registration was not found …
Browse files Browse the repository at this point in the history
…or is not medium trust" error
  • Loading branch information
dimikot committed Oct 12, 2024
1 parent de71529 commit aaae214
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 38 deletions.
2 changes: 1 addition & 1 deletion docker/ci-runner/guest/entrypoint.01-ci-storage-load.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ cd "$local_dir" 2>/dev/null || true
EOT

if [[ "$CI_STORAGE_HOST" != "" && -f ~/.ssh/id_rsa ]]; then
say "Running the initial \"ci-storage load\" for $local_dir..."
say "Running the initial \"ci-storage load\" for $local_dir in background..."
ci-storage load \
--storage-host="$CI_STORAGE_HOST" \
--storage-dir="$WORK_DIR/$GH_REPOSITORY/$(realpath "$local_dir" | tr / _)" \
Expand Down
38 changes: 3 additions & 35 deletions docker/ci-runner/guest/entrypoint.05-config.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
#
# Configures self-hosted runner and sets up graceful shutdown handling.
# Configures self-hosted runner.
#
# Here we make an opinionated decision to NOT use ephemeral or jit action
# runners. Reasons:
Expand All @@ -14,8 +14,8 @@
# - One downside happens when a runner container dies unexpectedly (rare). In
# this case, regular "offline" long-living runners are auto-removed by GitHub
# itself once in 2 weeks, whilst ephemeral (or jit) "offline" runners are
# auto-removed in 1 day. But we anyways need to implement some manual removal
# cycle externally, since even 1 day is way too much for garbage accumulation.
# auto-removed in 1 day. But we anyways implement the manual removal cycle in
# ci-scaler, since even 1 day is way too much for garbage accumulation.
#
set -u -e

Expand All @@ -42,35 +42,3 @@ token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/registr
--disableupdate \
--replace

#######################################
# Gracefully unregisters this runner from GitHub after a shutdown signal.
# Globals:
#   DEBUG_SHUTDOWN_DELAY_SEC (read, optional) - artificial delay, in seconds,
#     applied before the removal starts; unset or empty means no delay.
#   GH_REPOSITORY (read) - repository path segment used in the GitHub API URL.
# Arguments:
#   $1 - name of the received signal (used in the log message only).
# Returns:
#   0 once the runner has been removed; keeps retrying until then.
#######################################
cleanup() {
  local signal_name="${1:-}"
  # Tolerate an unset variable: this runs under "set -u", and the handler
  # must not abort before the runner is removed.
  local delay_sec="${DEBUG_SHUTDOWN_DELAY_SEC:-}"
  local count token

  say "Received graceful shutdown signal $signal_name..."

  # A debug facility to test how much time the orchestrator gives the
  # container to gracefully shut down before killing it.
  if [[ "$delay_sec" != "" ]]; then
    say "Artificially delaying shutdown for $delay_sec second(s)..."
    count=0
    while [[ $count -lt "$delay_sec" ]]; do
      sleep 1
      count=$((count + 1))
      say " ...$count seconds elapsed"
    done
  fi

  # Retry deleting the runner until it succeeds.
  # - Busy runner fails in deletion, so we can retry safely until it becomes
  #   idle and is successfully deleted.
  # - In case we can't delete the runner for a long time still, the external
  #   orchestrator will eventually kill the container after a large timeout
  #   (say, 15 minutes or so) needed for a running job to finish.
  say "Removing the runner..."
  while :; do
    # Both steps live in the "if" condition so that a transient failure of
    # either one (e.g. the token request) is retried instead of tripping
    # "set -e" and aborting the handler.
    if token=$(gh api -X POST --jq .token \
      "repos/$GH_REPOSITORY/actions/runners/remove-token") \
      && (cd ~/actions-runner && ./config.sh remove --token "$token"); then
      break
    fi
    sleep 5
    say "Retrying till the runner becomes idle and the removal succeeds..."
  done
}

# Run the graceful runner removal on INT/TERM, then exit with the conventional
# 128+signal status (130 for SIGINT, 143 for SIGTERM). The label passed to the
# handler matches the trapped signal so that the log message is accurate.
trap 'cleanup SIGINT; exit 130' INT
trap 'cleanup SIGTERM; exit 143' TERM
53 changes: 53 additions & 0 deletions docker/ci-runner/guest/entrypoint.06-terminate-on-signal.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
#
# Sets up graceful shutdown handling for the self-hosted runner.
#
# Here we make an opinionated decision to NOT use ephemeral or jit action
# runners. Reasons:
# - We WANT to reuse the work directory across job runs, that's the whole point
# of ci-storage architecture and its speedup benefits. So once the runner
# finishes some job, we do NOT want it to terminate (as it does in ephemeral
# or jit mode), we want it to CONTINUE listening for more jobs to run.
# - GitHub doesn't allow to remove busy runners via API, which is very good for
# us: in case the container shuts down externally due to downscaling, we just
# enter the graceful retry loop to delete the corresponding runner via API.
# - One downside happens when a runner container dies unexpectedly (rare). In
# this case, regular "offline" long-living runners are auto-removed by GitHub
# itself once in 2 weeks, whilst ephemeral (or jit) "offline" runners are
# auto-removed in 1 day. But we anyways need to implement some manual removal
# cycle externally, since even 1 day is way too much for garbage accumulation.
#
set -u -e

#######################################
# Gracefully unregisters this runner from GitHub after a shutdown signal.
# Globals:
#   DEBUG_SHUTDOWN_DELAY_SEC (read, optional) - artificial delay, in seconds,
#     applied before the removal starts; unset or empty means no delay.
#   GH_REPOSITORY (read) - repository path segment used in the GitHub API URL.
# Arguments:
#   $1 - name of the received signal (used in the log message only).
# Returns:
#   0 once the runner has been removed; keeps retrying until then.
#######################################
terminate_on_signal() {
  local signal_name="${1:-}"
  # Tolerate an unset variable: this runs under "set -u", and the handler
  # must not abort before the runner is removed.
  local delay_sec="${DEBUG_SHUTDOWN_DELAY_SEC:-}"
  local count token

  say "Received graceful shutdown signal $signal_name..."

  # A debug facility to test how much time the orchestrator gives the
  # container to gracefully shut down before killing it.
  if [[ "$delay_sec" != "" ]]; then
    say "Artificially delaying shutdown for $delay_sec second(s)..."
    count=0
    while [[ $count -lt "$delay_sec" ]]; do
      sleep 1
      count=$((count + 1))
      say " ...$count seconds elapsed"
    done
  fi

  # Retry deleting the runner until it succeeds.
  # - Busy runner fails in deletion, so we can retry safely until it becomes
  #   idle and is successfully deleted.
  # - In case we can't delete the runner for a long time still, the external
  #   orchestrator will eventually kill the container after a large timeout
  #   (say, 15 minutes or so) needed for a running job to finish.
  say "Removing the runner..."
  while :; do
    # Both steps live in the "if" condition so that a transient failure of
    # either one (e.g. the token request) is retried instead of tripping
    # "set -e" and aborting the handler.
    if token=$(gh api -X POST --jq .token \
      "repos/$GH_REPOSITORY/actions/runners/remove-token") \
      && (cd ~/actions-runner && ./config.sh remove --token "$token"); then
      break
    fi
    sleep 5
    say "Retrying till the runner becomes idle and the removal succeeds..."
  done
}

# Run the graceful runner removal on INT/TERM, then exit with the conventional
# 128+signal status (130 for SIGINT, 143 for SIGTERM). The label passed to the
# handler matches the trapped signal so that the log message is accurate.
trap 'terminate_on_signal SIGINT; exit 130' INT
trap 'terminate_on_signal SIGTERM; exit 143' TERM
34 changes: 34 additions & 0 deletions docker/ci-runner/guest/entrypoint.20-check-runner-health.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
#
# GitHub Runners have some bugs. They sometimes don't die, and instead get stuck
# in a desperate restart loop. Here we work around it.
#
# Related GitHub issue: https://github.com/actions/runner/issues/2507
#
set -u -e

# Delete the top-level runner diagnostic logs before anything else runs
# (presumably so that messages left by a previous run are not picked up again
# -- TODO confirm).
rm -f -- ~/actions-runner/_diag/*.log

#######################################
# Polls the runner diagnostic logs every 5 seconds and sends SIGINT to the
# given process once the "unhealthy" message shows up in the newest log.
# Arguments:
#   $1 - PID of the process to interrupt.
# Outputs:
#   An excerpt of the log around the matching line (via say and stdout).
# Returns:
#   0 right after the signal has been sent; loops forever otherwise.
#######################################
check_runner_health_loop() {
  local pid="$1"
  local unhealthy_re="Registration was not found or is not medium trust"
  local log error

  while :; do
    # find(1) lists entries in an unspecified order, so sort by modification
    # time to reliably pick the most recently written log. Errors are silenced
    # since the directory may not exist yet; "|| true" keeps the watchdog
    # alive under "set -e" (and pipefail, if enabled by the caller).
    log=$(
      find ~/actions-runner/_diag -name "*.log" -printf '%T@ %p\n' 2>/dev/null \
        | LC_ALL=C sort -n \
        | tail -n1 \
        | cut -d' ' -f2-
    ) || true
    if [[ "$log" != "" ]]; then
      # Find the very last line matching the regexp: reverse the tail of the
      # log, take the first match with its context, then reverse back.
      error=$(
        tail -n 500 "$log" \
          | tac \
          | grep -m1 -E -B15 -A15 "$unhealthy_re" \
          | tac
      ) || true
      if [[ "$error" != "" ]]; then
        say "Deadly message found in $log, terminating self. Last log lines:"
        say "---------------------"
        printf '%s\n' "$error"
        say "---------------------"
        kill -SIGINT "$pid"
        return
      fi
    fi
    sleep 5
  done
}

# Launch the health watchdog in the background, handing it this shell's PID as
# the process to signal.
check_runner_health_loop "$$" &
7 changes: 5 additions & 2 deletions docker/ci-runner/guest/entrypoint.99-run.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/bin/bash
#
# In the very end, runs the self-hosted runner and waits for its termination. In
# case a SIGINT or SIGHUP are received, they will be processed by the cleanup()
# function defined in the config script above.
# case SIGINT or SIGTERM is received, it will be processed by the
# terminate_on_signal() function defined in entrypoint.06-terminate-on-signal.sh.
#
set -u -e

Expand All @@ -16,4 +16,7 @@ while :; do
done

say "Starting the self-hosted runner..."

# Use "& wait $!" to let terminate_on_signal() properly handle signals for
# graceful termination (we can't use "exec" here).
cd ~/actions-runner && ./run.sh & wait $!

0 comments on commit aaae214

Please sign in to comment.