Work-around stuck runner restart loop on "Registration was not found …

…or is not medium trust" error
dimikot · Oct 12, 2024 · aaae214 · aaae214
1 parent de71529
commit aaae214
Show file tree

Hide file tree

Showing 5 changed files with 96 additions and 38 deletions.
diff --git a/docker/ci-runner/guest/entrypoint.01-ci-storage-load.sh b/docker/ci-runner/guest/entrypoint.01-ci-storage-load.sh
@@ -17,7 +17,7 @@ cd "$local_dir" 2>/dev/null || true
 EOT
 
 if [[ "$CI_STORAGE_HOST" != "" && -f ~/.ssh/id_rsa ]]; then
-  say "Running the initial \"ci-storage load\" for $local_dir..."
+  say "Running the initial \"ci-storage load\" for $local_dir in background..."
   ci-storage load \
     --storage-host="$CI_STORAGE_HOST" \
     --storage-dir="$WORK_DIR/$GH_REPOSITORY/$(realpath "$local_dir" | tr / _)" \

diff --git a/docker/ci-runner/guest/entrypoint.05-config.sh b/docker/ci-runner/guest/entrypoint.05-config.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Configures self-hosted runner and sets up graceful shutdown handling.
+# Configures self-hosted runner.
 #
 # Here we make an opinionated decision to NOT use ephemeral or jit acton
 # runners. Reasons:
@@ -14,8 +14,8 @@
 # - One downside happens when a runner container dies unexpectedly (rare). In
 #   this case, regular "offline" long-living runners are auto-removed by GitHub
 #   itself once in 2 weeks, whilst ephemeral (or jit) "offline" runners are
-#   auto-removed in 1 day. But we anyways need to implement some manual removal
-#   cycle exernally, since even 1 day is way too much for garbage accumulation.
+#   auto-removed in 1 day. But we anyways implement the manual removal cycle in
+#   ci-scaler, since even 1 day is way too much for garbage accumulation.
 #
 set -u -e
 
@@ -42,35 +42,3 @@ token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/registr
   --disableupdate \
   --replace
 
-cleanup() {
-  say "Received graceful shutdown signal $1..."
-
-  # A debug facility to test, how much time does the orchestrator give the
-  # container to gracefully shutdown before killing it.
-  if [[ "$DEBUG_SHUTDOWN_DELAY_SEC" != "" ]]; then
-    say "Artificially delaying shutdown for $DEBUG_SHUTDOWN_DELAY_SEC second(s)..."
-    count=0
-    while [[ $count -lt "$DEBUG_SHUTDOWN_DELAY_SEC" ]]; do
-      sleep 1
-      count=$((count + 1))
-      say "  ...$count seconds elapsed"
-    done
-  fi
-
-  # Retry deleting the runner until it succeeds.
-  # - Busy runner fails in deletion, so we can retry safely until it becomes
-  #   idle and is successfully deleted.
-  # - In case we can't delete the runner for a long time still, the extrnal
-  #   orchestrator will eventually kill the container after a large timeout
-  #   (say, 15 minutes or so) needed for a running job to finish.
-  say "Removing the runner..."
-  while :; do
-    token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/remove-token")
-    cd ~/actions-runner && ./config.sh remove --token "$token" && break
-    sleep 5
-    say "Retrying till the runner becomes idle and the removal succeeds..."
-  done
-}
-
-trap "cleanup SIGINT; exit 130" INT
-trap "cleanup SIGHUP; exit 143" TERM
diff --git a/docker/ci-runner/guest/entrypoint.06-terminate-on-signal.sh b/docker/ci-runner/guest/entrypoint.06-terminate-on-signal.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Sets up graceful shutdown handling for the self-hosted runner.
+#
+# Here we make an opinionated decision to NOT use ephemeral or jit action
+# runners. Reasons:
+# - We WANT to reuse the work directory across job runs, that's the whole point
+#   of ci-storage architecture and its speedup benefits. So once the runner
+#   finishes some job, we do NOT want it to terminate (as it does in ephemeral
+#   or jit mode), we want it to CONTINUE listening for more jobs to run.
+# - GitHub doesn't allow to remove busy runners via API, which is very good for
+#   us: in case the container shuts down externally due to downscaling, we just
+#   enter the graceful retry loop to delete the corresponding runner via API.
+# - One downside happens when a runner container dies unexpectedly (rare). In
+#   this case, regular "offline" long-living runners are auto-removed by GitHub
+#   itself once in 2 weeks, whilst ephemeral (or jit) "offline" runners are
+#   auto-removed in 1 day. But the manual removal cycle is implemented in
+#   ci-scaler anyway, since even 1 day is way too much for garbage accumulation.
+#
+set -u -e
+
+# Signal handler: logs the signal name passed as $1, then blocks until the
+# runner is removed from GitHub (see the retry loop notes below).
+terminate_on_signal() {
+  local delay="${DEBUG_SHUTDOWN_DELAY_SEC:-}" count token
+  say "Received graceful shutdown signal $1..."
+
+  # Debug facility to test how much time the orchestrator gives the container
+  # to shut down gracefully; the variable is optional (hence ":-" under set -u).
+  if [[ "$delay" != "" ]]; then
+    say "Artificially delaying shutdown for $delay second(s)..."
+    for ((count = 1; count <= delay; count++)); do
+      sleep 1
+      say "  ...$count seconds elapsed"
+    done
+  fi
+
+  # Retry deleting the runner until it succeeds: a busy runner fails in
+  # deletion, so we retry until it becomes idle. If that takes too long, the
+  # external orchestrator kills the container after a large timeout (say, 15
+  # minutes or so). The "&&" chain makes a failed "gh api" call just another
+  # retry instead of letting "set -e" abort the handler.
+  say "Removing the runner..."
+  while :; do
+    token=$(gh api -X POST --jq .token "repos/$GH_REPOSITORY/actions/runners/remove-token") \
+      && cd ~/actions-runner && ./config.sh remove --token "$token" && break
+    sleep 5
+    say "Retrying till the runner becomes idle and the removal succeeds..."
+  done
+}
+
+trap "terminate_on_signal SIGINT; exit 130" INT
+trap "terminate_on_signal SIGTERM; exit 143" TERM
diff --git a/docker/ci-runner/guest/entrypoint.20-check-runner-health.sh b/docker/ci-runner/guest/entrypoint.20-check-runner-health.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# GitHub Runners have some bugs. They sometimes don't die, and instead get stuck
+# in a desperate restart loop. Here we work it around.
+#
+# Related GitHub issue: https://github.com/actions/runner/issues/2507
+#
+set -u -e
+
+rm -f ~/actions-runner/_diag/*.log
+
+# Every 5s, greps the tail of the newest runner log (by mtime, since "find"
+# output is unordered) for the unhealthy message; on a match, SIGINTs PID $1.
+check_runner_health_loop() {
+  local pid="$1" log error unhealthy_re="Registration was not found or is not medium trust"
+  while :; do
+    log=$(find ~/actions-runner/_diag -name "*.log" -printf "%T@ %p\n" | sort -n | tail -n1 | cut -d" " -f2-)
+    if [[ "$log" != "" ]]; then
+      # Find the very last line matching the regexp.
+      error=$(tail -n 500 "$log" | tac | grep -m1 -E -B15 -A15 "$unhealthy_re" | tac)
+      if [[ "$error" != "" ]]; then
+        say "Deadly message found in $log, terminating self. Last log lines:"
+        say "---------------------"
+        printf '%s\n' "$error"
+        say "---------------------"
+        kill -SIGINT "$pid"
+        return
+      fi
+    fi
+    sleep 5
+  done
+}
+
+check_runner_health_loop $$ &
diff --git a/docker/ci-runner/guest/entrypoint.99-run.sh b/docker/ci-runner/guest/entrypoint.99-run.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 #
 # In the very end, runs the self-hosted runner and waits for its termination. In
-# case a SIGINT or SIGHUP are received, they will be processed by the cleanup()
-# function defined in the config script above.
+# case a SIGINT or SIGTERM is received, it will be processed by the
+# terminate_on_signal() function defined in the terminate-on-signal script.
 #
 set -u -e
 
@@ -16,4 +16,7 @@ while :; do
 done
 
 say "Starting the self-hosted runner..."
+
+# Use "& wait $!" to let terminate_on_signal() properly handle signals for
+# graceful termination (we can't use "exec" here).
 cd ~/actions-runner && ./run.sh & wait $!