From 8fd9cc6a8b261df02931358007fecac38eb3b682 Mon Sep 17 00:00:00 2001
From: Theo Ilie
Date: Tue, 12 Dec 2023 18:17:00 -0800
Subject: [PATCH 1/3] Notify Slack when stage nodes aren't updating

---
Review notes (v2 of this series):
 * Purpose: adds a scheduled CircleCI job that compares the version each
   staging node reports on /health_check with the version published in
   .version.json on main, and posts the stale or failing nodes to Slack.
 * Fixed: the version guard now also rejects the literal string "null",
   which is what `jq -r '.version'` prints when the key is missing. With
   the old guard every reachable node was then reported as behind.
 * Not changed: `set -e` has no effect inside `( ... ) || handle_error`
   (bash ignores -e for every command of a || list except the last one).
   Making it effective would abort the run on the first unreachable node
   instead of listing that node as `error status=...`, so it is left as is.
 * Line breaks and indentation of this series were reconstructed from the
   hunk line counts and the usual CircleCI layout; check them against the
   target file before applying. The commit ids on the From lines and the
   blob ids on the index lines are those of v1 and must be regenerated
   with `git format-patch` after the fixes are committed.

 .circleci/config.yml | 73 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index d6d88c67482..9e62e13440c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -335,6 +335,70 @@ jobs:
             git checkout -b release-client-v${VERSION}
             git push -u origin release-client-v${VERSION}
 
+  notify-stuck-stage-nodes-job:
+    resource_class: small
+    docker:
+      - image: cimg/base:2023.01
+    steps:
+      - run:
+          name: Alert Slack of stuck stage nodes
+          command: |
+            handle_error() {
+              # Construct failure Slack message
+              failure_content="{ \"blocks\": ["
+              failure_content+="{ \"type\": \"section\", \"text\": { \"type\": \"plain_text\", \"text\": \"Encountered error while checking for stuck staging nodes\n\" } }"
+              failure_content+="]}"
+              echo "Sending error message to Slack: $failure_content"
+
+              # Send Slack failure message
+              curl -f -X POST -H 'Content-type: application/json' \
+                --data "$failure_content" \
+                $SLACK_DAILY_DEPLOY_WEBHOOK
+            }
+
+            (
+              set -e
+
+              # Fetch the latest version from the GitHub repository (assume Content and Discovery have the same latest versions)
+              versionUrl="https://raw.githubusercontent.com/AudiusProject/audius-protocol/main/packages/discovery-provider/.version.json"
+              VERSION=$(curl -s "$versionUrl" | jq -r '.version')
+
+              if [ -z "$VERSION" ] || [ "$VERSION" = "null" ]; then
+                echo "Failed to fetch version data"
+                exit 1
+              fi
+
+              contentEndpoints=("https://creatornode5.staging.audius.co" "https://creatornode6.staging.audius.co" "https://creatornode7.staging.audius.co" "https://creatornode8.staging.audius.co" "https://creatornode9.staging.audius.co" "https://creatornode10.staging.audius.co" "https://creatornode11.staging.audius.co" "https://creatornode12.staging.audius.co")
+              discoveryEndpoints=("https://discoveryprovider.staging.audius.co" "https://discoveryprovider2.staging.audius.co" "https://discoveryprovider3.staging.audius.co" "https://discoveryprovider4.staging.audius.co" "https://discoveryprovider5.staging.audius.co")
+
+              slack_message=""
+
+              compareVersions() {
+                for endpoint in "$@"; do
+                  response=$(curl -s -o /dev/null -w "%{http_code}" "$endpoint/health_check")
+                  if [ "$response" -eq 200 ]; then
+                    endpointVersion=$(curl -s "$endpoint/health_check" | jq -r '.data.version')
+                    if [ "$endpointVersion" != "$VERSION" ]; then
+                      slack_message+="\n$endpoint (behind at v$endpointVersion)"
+                    fi
+                  else
+                    slack_message+="\n$endpoint (error status=$response)"
+                  fi
+                done
+              }
+
+              compareVersions "${contentEndpoints[@]}"
+              compareVersions "${discoveryEndpoints[@]}"
+
+              # Send Slack message if any node is behind
+              if [ ! -z "$slack_message" ]; then
+                json_content="{ \"blocks\": [ { \"type\": \"section\", \"text\": { \"type\": \"mrkdwn\", \"text\": \"Please set these nodes back on auto-upgrade if they're not in use:$slack_message\" } } ] }"
+                curl -f -X POST -H 'Content-type: application/json' \
+                  --data "$json_content" \
+                  $SLACK_DAILY_DEPLOY_WEBHOOK
+              fi
+            ) || handle_error
+
 workflows:
   setup:
     when:
@@ -516,3 +580,12 @@ workflows:
         - equal: ['release-client-create-branch', << pipeline.schedule.name >>]
     jobs:
       - generate-client-release
+
+  notify-stuck-stage-nodes:
+    when:
+      and:
+        - equal: [scheduled_pipeline, << pipeline.trigger_source >>]
+        - equal: ['notify-stuck-stage-nodes', << pipeline.schedule.name >>]
+    jobs:
+      - notify-stuck-stage-nodes-job:
+          context: [slack-secrets]

From d93d390651ca31f2033c2aa093163a4fe6f80984 Mon Sep 17 00:00:00 2001
From: Theo Ilie
Date: Fri, 15 Dec 2023 15:05:30 -0800
Subject: [PATCH 2/3] Dynamically fetch node list

---
Review notes (v2 of this series):
 * Purpose: the node lists are now read from api.staging.audius.co; the
   previous hardcoded lists become the fallback, and a "FETCH_ERROR"
   marker in the list tells the caller that the fallback was used.
 * Fixed: the call sites expanded "${contentFallback[@]}" and
   "${discoveryFallback[@]}", but the arrays are named
   contentFallbackEndpoints / discoveryFallbackEndpoints, and
   fetchEndpoints kept only "$2". No fallback node was ever checked even
   though the Slack text says a hardcoded list was used. fetchEndpoints
   now keeps every argument after the URL ("${*:2}", space-joined, split
   again by the caller's array assignment) and the call sites pass the
   right arrays.
 * Not changed: the extra slack_message="" added in this patch is
   redundant; patch 3/3 removes it again.

 .circleci/config.yml | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9e62e13440c..432f42785e2 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -356,6 +356,19 @@ jobs:
                 $SLACK_DAILY_DEPLOY_WEBHOOK
             }
 
+            fetchEndpoints() {
+              url=$1
+              fallback="${*:2}"
+              fetchedEndpoints=$(curl -s "$url" | jq -r '.data[]' 2>/dev/null)
+
+              if [ -z "$fetchedEndpoints" ]; then
+                echo "FETCH_ERROR"
+                echo "$fallback"
+              else
+                echo $fetchedEndpoints
+              fi
+            }
+
             (
               set -e
 
@@ -368,13 +381,21 @@ jobs:
                 exit 1
               fi
 
-              contentEndpoints=("https://creatornode5.staging.audius.co" "https://creatornode6.staging.audius.co" "https://creatornode7.staging.audius.co" "https://creatornode8.staging.audius.co" "https://creatornode9.staging.audius.co" "https://creatornode10.staging.audius.co" "https://creatornode11.staging.audius.co" "https://creatornode12.staging.audius.co")
-              discoveryEndpoints=("https://discoveryprovider.staging.audius.co" "https://discoveryprovider2.staging.audius.co" "https://discoveryprovider3.staging.audius.co" "https://discoveryprovider4.staging.audius.co" "https://discoveryprovider5.staging.audius.co")
+              contentFallbackEndpoints=("https://creatornode5.staging.audius.co" "https://creatornode6.staging.audius.co" "https://creatornode7.staging.audius.co" "https://creatornode8.staging.audius.co" "https://creatornode9.staging.audius.co" "https://creatornode10.staging.audius.co" "https://creatornode11.staging.audius.co" "https://creatornode12.staging.audius.co")
+              discoveryFallbackEndpoints=("https://discoveryprovider.staging.audius.co" "https://discoveryprovider2.staging.audius.co" "https://discoveryprovider3.staging.audius.co" "https://discoveryprovider4.staging.audius.co" "https://discoveryprovider5.staging.audius.co")
+
+              contentEndpoints=($(fetchEndpoints "https://api.staging.audius.co/content" "${contentFallbackEndpoints[@]}"))
+              discoveryEndpoints=($(fetchEndpoints "https://api.staging.audius.co/discovery" "${discoveryFallbackEndpoints[@]}"))
+
+              slack_message=""
 
               slack_message=""
 
               compareVersions() {
                 for endpoint in "$@"; do
+                  if [ "$endpoint" == "FETCH_ERROR" ]; then
+                    continue
+                  fi
                   response=$(curl -s -o /dev/null -w "%{http_code}" "$endpoint/health_check")
                   if [ "$response" -eq 200 ]; then
                     endpointVersion=$(curl -s "$endpoint/health_check" | jq -r '.data.version')
@@ -397,6 +418,14 @@ jobs:
                   --data "$json_content" \
                   $SLACK_DAILY_DEPLOY_WEBHOOK
               fi
+
+              # Also send a message if the API Gateway is down
+              if [[ " ${contentEndpoints[@]} " =~ " FETCH_ERROR " ]] || [[ " ${discoveryEndpoints[@]} " =~ " FETCH_ERROR " ]]; then
+                json_content="{ \"blocks\": [ { \"type\": \"section\", \"text\": { \"type\": \"mrkdwn\", \"text\": \"Note: api.staging.audius.co is offline, so a hardcoded list was used to check for offline/out-of-date nodes. \" } } ] }"
+                curl -f -X POST -H 'Content-type: application/json' \
+                  --data "$json_content" \
+                  $SLACK_DAILY_DEPLOY_WEBHOOK
+              fi
             ) || handle_error
 
 workflows:

From 2975105efaa0cd8fb8a71f2c07408cdd1c9faafc Mon Sep 17 00:00:00 2001
From: Theo Ilie
Date: Fri, 15 Dec 2023 15:23:19 -0800
Subject: [PATCH 3/3] Remove fallback completely

---
Review notes (v2 of this series):
 * Purpose: drops the hardcoded fallback lists; when the node list cannot
   be fetched the job only reports that api.staging.audius.co is
   unreachable.
 * The removed lines mirror the v2 text of patch 2/3. The file produced by
   the whole series differs from v1 only in the version guard changed in
   patch 1/3.
 * Open question, not changed: because the gateway notice is now an
   `elif`, a failed fetch of one list goes unreported whenever the other
   list produced a stale-node message.

 .circleci/config.yml | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 432f42785e2..0748ac4463e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -358,12 +358,10 @@ jobs:
 
             fetchEndpoints() {
               url=$1
-              fallback="${*:2}"
               fetchedEndpoints=$(curl -s "$url" | jq -r '.data[]' 2>/dev/null)
 
               if [ -z "$fetchedEndpoints" ]; then
                 echo "FETCH_ERROR"
-                echo "$fallback"
               else
                 echo $fetchedEndpoints
               fi
@@ -381,14 +379,8 @@ jobs:
                 exit 1
               fi
 
-              contentFallbackEndpoints=("https://creatornode5.staging.audius.co" "https://creatornode6.staging.audius.co" "https://creatornode7.staging.audius.co" "https://creatornode8.staging.audius.co" "https://creatornode9.staging.audius.co" "https://creatornode10.staging.audius.co" "https://creatornode11.staging.audius.co" "https://creatornode12.staging.audius.co")
-              discoveryFallbackEndpoints=("https://discoveryprovider.staging.audius.co" "https://discoveryprovider2.staging.audius.co" "https://discoveryprovider3.staging.audius.co" "https://discoveryprovider4.staging.audius.co" "https://discoveryprovider5.staging.audius.co")
-
-              contentEndpoints=($(fetchEndpoints "https://api.staging.audius.co/content" "${contentFallbackEndpoints[@]}"))
-              discoveryEndpoints=($(fetchEndpoints "https://api.staging.audius.co/discovery" "${discoveryFallbackEndpoints[@]}"))
-
-              slack_message=""
-
+              contentEndpoints=($(fetchEndpoints "https://api.staging.audius.co/content"))
+              discoveryEndpoints=($(fetchEndpoints "https://api.staging.audius.co/discovery"))
               slack_message=""
 
               compareVersions() {
@@ -411,17 +403,14 @@ jobs:
               compareVersions "${contentEndpoints[@]}"
               compareVersions "${discoveryEndpoints[@]}"
 
-              # Send Slack message if any node is behind
+              # Send Slack message if any node is behind or if the API Gateway is down
               if [ ! -z "$slack_message" ]; then
                 json_content="{ \"blocks\": [ { \"type\": \"section\", \"text\": { \"type\": \"mrkdwn\", \"text\": \"Please set these nodes back on auto-upgrade if they're not in use:$slack_message\" } } ] }"
                 curl -f -X POST -H 'Content-type: application/json' \
                   --data "$json_content" \
                   $SLACK_DAILY_DEPLOY_WEBHOOK
-              fi
-
-              # Also send a message if the API Gateway is down
-              if [[ " ${contentEndpoints[@]} " =~ " FETCH_ERROR " ]] || [[ " ${discoveryEndpoints[@]} " =~ " FETCH_ERROR " ]]; then
-                json_content="{ \"blocks\": [ { \"type\": \"section\", \"text\": { \"type\": \"mrkdwn\", \"text\": \"Note: api.staging.audius.co is offline, so a hardcoded list was used to check for offline/out-of-date nodes. \" } } ] }"
+              elif [[ " ${contentEndpoints[@]} " =~ " FETCH_ERROR " ]] || [[ " ${discoveryEndpoints[@]} " =~ " FETCH_ERROR " ]]; then
+                json_content="{ \"blocks\": [ { \"type\": \"section\", \"text\": { \"type\": \"mrkdwn\", \"text\": \"Unable to check for offline / out-of-date stage nodes due to api.staging.audius.co being unreachable... \" } } ] }"
                 curl -f -X POST -H 'Content-type: application/json' \
                   --data "$json_content" \
                   $SLACK_DAILY_DEPLOY_WEBHOOK