Skip to content

Commit

Permalink
Make boot latency show up in instance Name tag, in CloudWatch and on …
Browse files Browse the repository at this point in the history
…Monitoring tab
  • Loading branch information
dimikot committed Jun 27, 2024
1 parent 33667d0 commit 97319c0
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 36 deletions.
2 changes: 1 addition & 1 deletion docker/ci-runner/guest/entrypoint.05-config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ set -u -e
cd ~/actions-runner

name_prefix="ci-storage"
instance_id=$(aws_metadata_curl latest/meta-data/instance-id)
instance_id=$(aws_instance_id)
if [[ "$instance_id" != "" ]]; then
hash="${instance_id##i-}"
name="$name_prefix-$hash-$(date '+%m%d-%H%M')"
Expand Down
65 changes: 59 additions & 6 deletions docker/ci-runner/guest/entrypoint.98-log-uptime.sh
Original file line number Diff line number Diff line change
@@ -1,27 +1,80 @@
#!/bin/bash
#
# Logs uptime of the runner (since instance boot timestamp if passed) in the
# beginning and then time to time.
# Logs uptime of the runner (since instance boot timestamp, if it's passed) at
# the beginning and then from time to time. Also, tries to amend the instance
# Name tag by adding a boot latency suffix to it:
# "docker_boot_sec+runner_boot_sec".
#
set -u -e

# Names of the instance tags which log_uptime_loop reads and writes.
# The Name tag gets the boot latency suffix appended to its current value.
# shellcheck disable=SC2034
TAG_NAME="Name"
# Holds the BTIME value for which boot latency has already been recorded; it is
# compared with the current BTIME to tell an instance (re)boot from a plain
# container restart.
TAG_BTIME="ci-storage:BTIME"
# Holds the DockerBootSec value recorded on the last instance (re)boot; it is
# read back when only the container has restarted.
TAG_DOCKER_BOOT_SEC="ci-storage:DockerBootSec"
# Holds the RunnerBootSec value recorded on the last instance (re)boot.
TAG_RUNNER_BOOT_SEC="ci-storage:RunnerBootSec"

# Reports boot latency and uptime metrics of the runner, forever, sleeping 60
# seconds between iterations. Never returns.
#
# On the first iteration only, computes the boot latency:
# - RunnerBootSec: integer part of the first field of /proc/uptime;
# - DockerBootSec: seconds elapsed since BTIME minus RunnerBootSec.
# If the instance ID is available and the Name tag is readable, it appends the
# "(DockerBootSec+RunnerBootSec sec)" suffix to the Name tag and stores BTIME
# and both values in the ci-storage:* tags. All tag operations are best-effort
# (their failures are ignored).
#
# On every iteration, publishes RunnerBootSec, DockerBootSec, InstanceUptimeSec
# and RunnerUptimeSec to CloudWatch (best-effort, only when the instance ID is
# available) and prints them to stdout.
#
# Globals read: GH_LABELS, GH_REPOSITORY, BTIME, TAG_NAME, TAG_BTIME,
# TAG_DOCKER_BOOT_SEC, TAG_RUNNER_BOOT_SEC.
log_uptime_loop() {
  local label dimensions instance_id i metric namespace suffix
  local boot_suffix message name prev_btime
  local -a out
  # The metric variables are read indirectly via ${!metric} below.
  local RunnerBootSec DockerBootSec InstanceUptimeSec RunnerUptimeSec

  # Only the last of the comma-separated labels is used as a dimension.
  label=${GH_LABELS##*,}
  dimensions="GH_REPOSITORY=$GH_REPOSITORY,GH_LABEL=$label"
  instance_id=$(aws_instance_id)

  i=0
  while :; do
    if [[ "$i" == 0 ]]; then
      RunnerBootSec=$(awk '{print int($1)}' /proc/uptime)
      DockerBootSec=$(($(date '+%s') - BTIME - RunnerBootSec))
      boot_suffix="$DockerBootSec+$RunnerBootSec sec"
      message="$(nice_date): Appending boot latency ($boot_suffix) suffix to the instance Name tag..."
      if [[ "$instance_id" != "" ]]; then
        echo "$message"
        # Only append boot latency if we actually booted or rebooted (i.e.
        # $BTIME injected from the outside has changed). Don't do it if the
        # container has just been restarted (i.e. if $BTIME didn't change).
        name=$(aws_read_tag "$TAG_NAME" || true)
        if [[ "$name" != "" ]]; then
          prev_btime=$(aws_read_tag "$TAG_BTIME" || true)
          if [[ "$prev_btime" != "$BTIME" ]]; then
            aws_write_tag "$TAG_BTIME" "$BTIME" || true
            aws_write_tag "$TAG_RUNNER_BOOT_SEC" "$RunnerBootSec" || true
            aws_write_tag "$TAG_DOCKER_BOOT_SEC" "$DockerBootSec" || true
            aws_write_tag "$TAG_NAME" "$name ($boot_suffix)" || true
          else
            echo "It is the container who restarted, not the instance start/reboot, so skipping."
            # Reuse the value recorded at the real boot; if it cannot be read,
            # the metric becomes empty and is skipped below.
            DockerBootSec=$(aws_read_tag "$TAG_DOCKER_BOOT_SEC" || true)
          fi
        else
          echo -e "Could not read Name tag of instance \"$instance_id\".\n"
        fi
      else
        echo "$message (AWS metadata service is not available, so skipping)"
      fi
    fi

    # shellcheck disable=SC2034
    InstanceUptimeSec=$(($(date '+%s') - BTIME))
    # shellcheck disable=SC2034
    RunnerUptimeSec=$(awk '{print int($1)}' /proc/uptime)

    out=()
    for metric in RunnerBootSec DockerBootSec InstanceUptimeSec RunnerUptimeSec; do
      if [[ "${!metric}" == "" ]]; then
        continue
      fi
      if [[ "$instance_id" != "" ]]; then
        suffix="publishing to CloudWatch"
        # An empty namespace makes the helper fall back to its default one.
        namespace=""
        if [[ "$metric" == *Boot* ]]; then
          # By default, metrics in CWAgent namespace are shown right on the
          # instance's Monitoring tab.
          namespace="CWAgent"
        fi
        aws_cloudwatch_put_metric_data "$metric" "${!metric}" "$dimensions" "$namespace" || true
      else
        suffix="AWS metadata service is not available, so not publishing"
      fi
      out+=("$metric=${!metric}")
    done

    echo "$(nice_date): $GH_REPOSITORY: ${out[*]} ($suffix)"
    i=$((i + 1))
    sleep 60
  done
}
Expand Down
71 changes: 57 additions & 14 deletions docker/ci-runner/root/entrypoint.00-helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ export WORK_DIR="/mnt"

# We don't use ec2metadata CLI tool, because it does not allow configuring a
# timeout. In case the container is run outside of AWS infra (e.g. during local
# development), the absence of timeout causes problems.
# development), the absence of timeout causes problems. Prints an empty value
# and always succeeds if not in AWS infra.
aws_metadata_curl() {
local timeout_sec token path
timeout_sec=5
Expand All @@ -26,28 +27,66 @@ aws_metadata_curl() {
fi
}

# Prints the current AWS region name or nothing if not in AWS infra. Always
# succeeds.
#
# The region is the availability zone reported by the metadata service with
# its trailing lowercase letter removed. The result is remembered in the global
# REGION variable; note that when this function is called via $(...), the
# assignment happens in a subshell, so the value is only reused if REGION was
# already set in the calling shell.
aws_region() {
  if [[ "${REGION-unset}" == "unset" ]]; then
    REGION=$(aws_metadata_curl latest/meta-data/placement/availability-zone | sed "s/[a-z]$//")
  fi
  echo "$REGION"
}

# Prints the current AWS instance ID or nothing if not in AWS infra. Always
# succeeds.
#
# The result is remembered in the global INSTANCE_ID variable (with the same
# subshell caveat as for REGION in aws_region).
aws_instance_id() {
  if [[ "${INSTANCE_ID-unset}" == "unset" ]]; then
    INSTANCE_ID=$(aws_metadata_curl latest/meta-data/instance-id)
  fi
  echo "$INSTANCE_ID"
}

# Publishes a metric to CloudWatch. Fails on error (returns the exit code of
# the aws CLI).
#
# Arguments:
#   $1 - metric name
#   $2 - metric value
#   $3 - comma-separated "Name=Value" dimensions; Region and InstanceId
#        dimensions are added around them automatically
#   $4 - (optional) namespace; defaults to "ci-storage/metrics" when omitted
#        or empty
aws_cloudwatch_put_metric_data() {
  local metric="$1"
  local value="$2"
  local dimensions="$3"
  local namespace="${4:-ci-storage/metrics}"
  # Resolve the region once: it is needed both as a CLI option and as a
  # dimension.
  local region instance_id
  region=$(aws_region)
  instance_id=$(aws_instance_id)
  aws cloudwatch put-metric-data \
    --region="$region" \
    --metric-name="$metric" \
    --namespace="$namespace" \
    --value="$value" \
    --storage-resolution="1" \
    --unit="None" \
    --dimensions="Region=$region,$dimensions,InstanceId=$instance_id"
}

# Prints the value of the current instance's tag with the provided name. If
# there is no such tag, prints nothing and succeeds (default AWS behavior).
# Fails on error.
aws_read_tag() {
local key="$1"
local res; res=$(
aws ec2 describe-tags \
--region "$(aws_region)" \
--query "Tags[0].Value" \
--output text \
--filters "Name=resource-id,Values=$(aws_instance_id)" "Name=key,Values=$key"
)
if [[ "$res" != "None" ]]; then
echo "$res"
fi
}

# Creates a tag on the current instance, replacing its value if the tag already
# exists. The exit code is the one of the aws CLI, so the function fails on
# error.
#
# Arguments:
#   $1 - tag key
#   $2 - tag value
aws_write_tag() {
  local tag_key="$1" tag_value="$2"
  local -a cli_args=(
    ec2 create-tags
    --region "$(aws_region)"
    --resources "$(aws_instance_id)"
    --tags "Key=$tag_key,Value=$tag_value"
  )
  aws "${cli_args[@]}"
}

# Prints the current date in the same format as the GitHub Actions runner does.
Expand All @@ -56,7 +95,11 @@ nice_date() {
}

# Export all helper functions, so that they are also available in bash
# processes spawned from this shell.
export -f \
  aws_metadata_curl \
  aws_region \
  aws_instance_id \
  aws_cloudwatch_put_metric_data \
  aws_read_tag \
  aws_write_tag \
  nice_date

# Print the current date right away.
nice_date
16 changes: 9 additions & 7 deletions docker/ci-runner/root/entrypoint.70-prewarm.sh
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
#!/bin/bash
#
# Prints usage statistics and also, if work directory is not on tmpfs, keeps it
# in cache to lower the chances of the directory entries to be evicted.
# Prints usage statistics and also, if the target directory is not on tmpfs,
# keeps it in cache to lower the chances of the directory entries to be evicted.
#
# Does it only when TZ is set. This prevents it from printing in debug dev
# environment of the client for instance.
#
set -u -e

# Once per minute (after an initial 30 second delay), prints usage statistics
# of the directory along with the system uptime line. Never returns.
#
# First, df is asked for the filesystem type, mount target and used space. If
# the result does not mention tmpfs, the statistics is replaced with the output
# of "du -sh" on the directory. The time taken by the last of these commands is
# printed too.
#
# Arguments:
#   $1 - directory to inspect
prewarm_loop() {
  local dir=$1
  # The "time" keyword reports to stderr of the command group; it is captured
  # in this file.
  local time_took=/tmp/time_took
  local info uptime
  sleep 30
  export TIMEFORMAT="%R sec"
  while :; do
    info=$({ time df -h --output=fstype,target,used "$dir" | tail -n1 | sed -E 's/[[:space:]]+/ /g'; } 2>"$time_took")
    if [[ "$info" != *tmpfs* ]]; then
      info=$({ time du -sh "$dir" | sed -E 's/\s+/ /g'; } 2>"$time_took")
    fi
    # Strip the leading current time and collapse whitespace runs.
    uptime=$(uptime | sed -E -e 's/^\s*[0-9:]+\s+//' -e 's/\s+/ /g')
    echo "$(nice_date): Prewarm (took $(cat "$time_took")): $info: $uptime"
    sleep 60
  done
}

# Print "prewarm" stats only when TZ is set. This prevents it from printing in
# debug dev environment of the client for instance.
# Start the prewarm loop in background for the work directory. ${TZ:-} keeps
# this check from aborting the script under "set -u" when TZ is not set at all.
if [[ "${TZ:-}" != "" ]]; then
  prewarm_loop "$WORK_DIR" &
fi
4 changes: 3 additions & 1 deletion docker/ci-scaler/guest/scaler/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ def __exit__(
doing_ellipsis = f"{doing}..." if doing else None
success_msg = f"{doing_ellipsis} ok" if doing_ellipsis else None
failure_msg = " ".join(s for s in (doing_ellipsis, failure) if s)
if exc_type and exc_value:
if isinstance(exc_value, KeyboardInterrupt):
return None
elif exc_type and exc_value:
log(
f"{failure_msg + ': ' if failure_msg else ''}{exc_type.__name__}: {exc_value}\n"
+ "".join(traceback.format_tb(tb)),
Expand Down
14 changes: 8 additions & 6 deletions docker/ci-storage/root/entrypoint.70-prewarm.sh
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
#!/bin/bash
#
# Prints usage statistics and also, if the storage directory is not on tmpfs,
# Prints usage statistics and also, if the target directory is not on tmpfs,
# keeps it in cache to lower the chances of the directory entries to be evicted.
#
# Does it only when TZ is set. This prevents it from printing in debug dev
# environment of the client for instance.
#
set -u -e

# Once per minute (after an initial 30 second delay), prints usage statistics
# of the directory along with the system uptime line. Never returns.
#
# First, df is asked for the filesystem type, mount target and used space. If
# the result does not mention tmpfs, the statistics is replaced with the output
# of "du -sh" on the directory. The time taken by the last of these commands is
# printed too.
#
# Arguments:
#   $1 - directory to inspect
prewarm_loop() {
  local dir=$1
  # The "time" keyword reports to stderr of the command group; it is captured
  # in this file.
  local time_took=/tmp/time_took
  local info uptime
  sleep 30
  export TIMEFORMAT="%R sec"
  while :; do
    info=$({ time df -h --output=fstype,target,used "$dir" | tail -n1 | sed -E 's/[[:space:]]+/ /g'; } 2>"$time_took")
    if [[ "$info" != *tmpfs* ]]; then
      info=$({ time du -sh "$dir" | sed -E 's/\s+/ /g'; } 2>"$time_took")
    fi
    # Strip the leading current time and collapse whitespace runs.
    uptime=$(uptime | sed -E -e 's/^\s*[0-9:]+\s+//' -e 's/\s+/ /g')
    echo "$(nice_date): Prewarm (took $(cat "$time_took")): $info: $uptime"
    sleep 60
  done
}

# Print "prewarm" stats only when TZ is set. This prevents it from printing in
# debug dev environment of the client for instance.
# Start the prewarm loop in background for the storage directory. ${TZ:-} keeps
# this check from aborting the script under "set -u" when TZ is not set at all.
if [[ "${TZ:-}" != "" ]]; then
  prewarm_loop "$STORAGE_DIR" &
fi
9 changes: 8 additions & 1 deletion docker/compose-up-dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@ set -e

echo "Building & booting containers on the local laptop for debugging purposes..."

# Boot timestamp of the local machine, passed to the containers as BTIME. A
# fixed fallback value is used on OSes other than macOS and Linux.
btime=1719410000
if [[ "$OSTYPE" == darwin* ]]; then
  # Take the 4th whitespace-separated field of kern.boottime, without commas.
  btime=$(sysctl -n kern.boottime | awk '{print $4}' | tr -d ',')
elif [[ "$OSTYPE" == linux* ]]; then
  btime=$(grep btime /proc/stat | awk '{print $2}')
fi

# All settings are passed as environment variables of "docker compose" only;
# extra command line arguments are forwarded to it as is.
GH_TOKEN=$(gh auth token) \
  GH_REPOSITORY=$(gh repo view --json owner,name -q '.owner.login + "/" + .name') \
  GH_LABELS=ci-storage-dev \
  FORWARD_HOST=host.docker.internal \
  TZ=America/Los_Angeles \
  BTIME="$btime" \
  ASGS=$(gh repo view --json owner,name -q '.owner.login + "/" + .name'):ci-storage-dev:myasg \
  DOMAIN=${DOMAIN:-example.com} \
  docker compose up --pull=always --build "$@"

0 comments on commit 97319c0

Please sign in to comment.