diff --git a/monitoring/README.md b/monitoring/README.md index 63beff146b1..cf14fda25a5 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -5,6 +5,10 @@ ```bash A run monitoring up +# on first start, wait a few minutes, then upload the dashboards again +# this will fix a recent regression with missing Library panels +# ./grafana/bin/upload-dashboards.sh + # A run monitoring down # optionally remove all grafana and prometheus data diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 337fb3bdff9..7abedc859c7 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -24,7 +24,6 @@ services: build: context: grafana args: - - GRAFANA_VERSION=latest - GF_INSTALL_IMAGE_RENDERER_PLUGIN=true user: 0:0 env_file: diff --git a/monitoring/grafana/Dockerfile b/monitoring/grafana/Dockerfile index 71d436658ff..02a22a08bfe 100644 --- a/monitoring/grafana/Dockerfile +++ b/monitoring/grafana/Dockerfile @@ -1,7 +1,7 @@ # Docs: https://grafana.com/docs/grafana/latest/installation/docker/#build-and-run-a-docker-image-with-pre-installed-plugins # Original: https://github.com/grafana/grafana/blob/a51c2774b8e77cafc0100a3882d77039c189e301/packaging/docker/custom/ubuntu.Dockerfile -ARG GRAFANA_VERSION="latest" +ARG GRAFANA_VERSION="9.1.0" FROM grafana/grafana:${GRAFANA_VERSION} diff --git a/monitoring/grafana/alerts/alert.template.json b/monitoring/grafana/alerts/alert.template.json index 514003eed31..7ab80573b8e 100644 --- a/monitoring/grafana/alerts/alert.template.json +++ b/monitoring/grafana/alerts/alert.template.json @@ -2,14 +2,14 @@ "id": {alert_id}, "uid": "{alert_uid}", "orgID": 1, - "folderUID": "ARwYXYzVk", + "folderUID": "zpejR_iVz", "ruleGroup": "group", "title": "{title}", "condition": "{condition_ref}", "data": {data}, "noDataState": "OK", "execErrState": "Alerting", - "for": "0", + "for": "0s", "annotations": {{ "__alertId__": "{alert_uid}", "__dashboardUid__": "{dashboard_uid}", diff --git a/monitoring/grafana/alerts/audius-governance_new-proposals.json b/monitoring/grafana/alerts/audius-governance_new-proposals.json index 4897fd49c47..67bf7571d97 100644 --- a/monitoring/grafana/alerts/audius-governance_new-proposals.json +++ b/monitoring/grafana/alerts/audius-governance_new-proposals.json @@ -1,7 +1,7 @@ [ { "annotations": { - "__alertId__": "YCgu57m4z_015_Notification", + "__alertId__": "YCgu57m4z_015_High", "__dashboardUid__": "YCgu57m4z", "__panelId__": "15", "description": "\n\nAny more than 2 should trigger multiple alarms (including this one) since an attacker may be self-signing multiple proposals.", @@ -37,7 +37,7 @@ { "evaluator": { "params": [ - null, + 2, 0 ], "type": "gt" @@ -77,16 +77,16 @@ } ], "execErrState": "Alerting", - "folderUID": "ARwYXYzVk", - "for": "0", - "id": 220154, + "folderUID": "zpejR_iVz", + "for": "0s", + "id": 180151, "labels": { - "channel": "notification" + "channel": "high-alert" }, "noDataState": "OK", "orgID": 1, "ruleGroup": "group", - "title": "New Proposals (Notification)", - "uid": "YCgu57m4z_015_Notification" + "title": "New Proposals (High)", + "uid": "YCgu57m4z_015_High" } ] \ No newline at end of file diff --git a/monitoring/grafana/alerts/audius-governance_open-proposals.json b/monitoring/grafana/alerts/audius-governance_open-proposals.json index a0978a714ed..ca333e79a32 100644 --- a/monitoring/grafana/alerts/audius-governance_open-proposals.json +++ b/monitoring/grafana/alerts/audius-governance_open-proposals.json @@ -1,7 +1,7 @@ [ { "annotations": { - "__alertId__": "YCgu57m4z_020_Notification", + "__alertId__": "YCgu57m4z_020_Low", "__dashboardUid__": "YCgu57m4z", "__panelId__": "20", "description": "\n\nThe number open proposals should typically be 1, which will send a Slack notification.\n\n2 opened PRs at the same time will aggressively escalate to on-call.", @@ -37,7 +37,7 @@ { "evaluator": { "params": [ - null, + 1, 0 ], "type": "gt" @@ -77,16 +77,106 @@ } ], "execErrState": "Alerting", - "folderUID": "ARwYXYzVk", - "for": "0", - "id": 220204, + "folderUID": "zpejR_iVz", + "for": "0s", + "id": 180203, "labels": { - "channel": "notification" + "channel": "low-alert" }, "noDataState": "OK", "orgID": 1, "ruleGroup": "group", - "title": "Open Proposals (Notification)", - "uid": "YCgu57m4z_020_Notification" + "title": "Open Proposals (Low)", + "uid": "YCgu57m4z_020_Low" + }, + { + "annotations": { + "__alertId__": "YCgu57m4z_020_High", + "__dashboardUid__": "YCgu57m4z", + "__panelId__": "20", + "description": "\n\nThe number open proposals should typically be 1, which will send a Slack notification.\n\n2 opened PRs at the same time will aggressively escalate to on-call.", + "runbook_url": "http://grafana.audius.co/d/YCgu57m4z?viewPanel=20", + "summary": "The number of open proposals at the moment.\n\n" + }, + "condition": "B", + "data": [ + { + "datasourceUid": "r2_nnDL7z", + "model": { + "datasource": { + "type": "prometheus", + "uid": "r2_nnDL7z" + }, + "expr": "audius_exporters_sdk_proposals{outcome=\"InProgress\"}", + "hide": false, + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "A" + }, + "queryType": "", + "refId": "A", + "relativeTimeRange": { + "from": 600, + "to": 0 + } + }, + { + "datasourceUid": "-100", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 2, + 0 + ], + "type": "gt" + }, + "operator": { + "type": "or" + }, + "query": { + "params": [ + "A" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "name": "Expression", + "type": "__expr__", + "uid": "__expr__" + }, + "hide": false, + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "B", + "type": "classic_conditions" + }, + "queryType": "", + "refId": "B", + "relativeTimeRange": { + "from": 0, + "to": 0 + } + } + ], + "execErrState": "Alerting", + "folderUID": "zpejR_iVz", + "for": "0s", + "id": 180201, + "labels": { + "channel": "high-alert" + }, + "noDataState": "OK", + "orgID": 1, + "ruleGroup": "group", + "title": "Open Proposals (High)", + "uid": "YCgu57m4z_020_High" } ] \ No newline at end of file diff --git a/monitoring/grafana/alerts/audius-governance_proposals-by-unknown-proposers.json b/monitoring/grafana/alerts/audius-governance_proposals-by-unknown-proposers.json index d8622039141..4f18340d281 100644 --- a/monitoring/grafana/alerts/audius-governance_proposals-by-unknown-proposers.json +++ b/monitoring/grafana/alerts/audius-governance_proposals-by-unknown-proposers.json @@ -1,7 +1,7 @@ [ { "annotations": { - "__alertId__": "YCgu57m4z_017_Notification", + "__alertId__": "YCgu57m4z_017_High", "__dashboardUid__": "YCgu57m4z", "__panelId__": "17", "description": "\n\nWe ignore any previous one-off proposers to keep this metric at 0. Anything greater than 0 should be considered to be created by an attacker.", @@ -37,7 +37,7 @@ { "evaluator": { "params": [ - null, + 1, 0 ], "type": "gt" @@ -77,16 +77,16 @@ } ], "execErrState": "Alerting", - "folderUID": "ARwYXYzVk", - "for": "0", - "id": 220174, + "folderUID": "zpejR_iVz", + "for": "0s", + "id": 180171, "labels": { - "channel": "notification" + "channel": "high-alert" }, "noDataState": "OK", "orgID": 1, "ruleGroup": "group", - "title": "Proposals by Unknown Proposers (Notification)", - "uid": "YCgu57m4z_017_Notification" + "title": "Proposals by Unknown Proposers (High)", + "uid": "YCgu57m4z_017_High" } ] \ No newline at end of file diff --git a/monitoring/grafana/alerts/audius-token_contract-balances-staking.json b/monitoring/grafana/alerts/audius-token_contract-balances-staking.json index c930ebb0b63..4f7346d8188 100644 --- a/monitoring/grafana/alerts/audius-token_contract-balances-staking.json +++ b/monitoring/grafana/alerts/audius-token_contract-balances-staking.json @@ -1,7 +1,7 @@ [ { "annotations": { - "__alertId__": "jPaB_Yk4k_017_Notification", + "__alertId__": "jPaB_Yk4k_017_Low", "__dashboardUid__": "jPaB_Yk4k", "__panelId__": "17", "description": "\n\nIf this number dips, we may have lost a service provider.", @@ -37,10 +37,10 @@ { "evaluator": { "params": [ - null, + 355000000, 0 ], - "type": "gt" + "type": "lt" }, "operator": { "type": "or" @@ -77,16 +77,16 @@ } ], "execErrState": "Alerting", - "folderUID": "ARwYXYzVk", - "for": "0", - "id": 200174, + "folderUID": "zpejR_iVz", + "for": "0s", + "id": 110173, "labels": { - "channel": "notification" + "channel": "low-alert" }, "noDataState": "OK", "orgID": 1, "ruleGroup": "group", - "title": "Contract Balances (Staking) (Notification)", - "uid": "jPaB_Yk4k_017_Notification" + "title": "Contract Balances (Staking) (Low)", + "uid": "jPaB_Yk4k_017_Low" } ] \ No newline at end of file diff --git a/monitoring/grafana/alerts/audius-token_contract-balances.json b/monitoring/grafana/alerts/audius-token_contract-balances.json deleted file mode 100644 index fd599f1cfa8..00000000000 --- a/monitoring/grafana/alerts/audius-token_contract-balances.json +++ /dev/null @@ -1,176 +0,0 @@ -[ - { - "annotations": { - "__alertId__": "jPaB_Yk4k_004_Notification", - "__dashboardUid__": "jPaB_Yk4k", - "__panelId__": "4", - "description": "\n\nIf this number dips, we may have lost a service provider.", - "runbook_url": "http://grafana.audius.co/d/jPaB_Yk4k?viewPanel=4", - "summary": "The number of tokens held by the staking contract.\n\n" - }, - "condition": "D", - "data": [ - { - "datasourceUid": "r2_nnDL7z", - "model": { - "datasource": { - "type": "prometheus", - "uid": "r2_nnDL7z" - }, - "expr": "audius_exporters_alchemy_balance{address_name=\"NotStaking\"}", - "hide": false, - "intervalMs": 1000, - "maxDataPoints": 43200, - "refId": "A" - }, - "queryType": "", - "refId": "A", - "relativeTimeRange": { - "from": 600, - "to": 0 - } - }, - { - "datasourceUid": "r2_nnDL7z", - "model": { - "datasource": { - "type": "prometheus", - "uid": "r2_nnDL7z" - }, - "expr": "audius_exporters_alchemy_balance{address_name=\"Staking\"}", - "hide": false, - "intervalMs": 1000, - "maxDataPoints": 43200, - "refId": "B" - }, - "queryType": "", - "refId": "B", - "relativeTimeRange": { - "from": 600, - "to": 0 - } - }, - { - "datasourceUid": "r2_nnDL7z", - "model": { - "datasource": { - "type": "prometheus", - "uid": "r2_nnDL7z" - }, - "expr": "audius_exporters_alchemy_balance{address_name!=\"Staking\"}", - "hide": false, - "intervalMs": 1000, - "maxDataPoints": 43200, - "refId": "C" - }, - "queryType": "", - "refId": "C", - "relativeTimeRange": { - "from": 600, - "to": 0 - } - }, - { - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [ - null, - 0 - ], - "type": "gt" - }, - "operator": { - "type": "or" - }, - "query": { - "params": [ - "A" - ] - }, - "reducer": { - "params": [], - "type": "last" - }, - "type": "query" - }, - { - "evaluator": { - "params": [ - null, - 0 - ], - "type": "gt" - }, - "operator": { - "type": "or" - }, - "query": { - "params": [ - "B" - ] - }, - "reducer": { - "params": [], - "type": "last" - }, - "type": "query" - }, - { - "evaluator": { - "params": [ - null, - 0 - ], - "type": "gt" - }, - "operator": { - "type": "or" - }, - "query": { - "params": [ - "C" - ] - }, - "reducer": { - "params": [], - "type": "last" - }, - "type": "query" - } - ], - "datasource": { - "name": "Expression", - "type": "__expr__", - "uid": "__expr__" - }, - "hide": false, - "intervalMs": 1000, - "maxDataPoints": 43200, - "refId": "D", - "type": "classic_conditions" - }, - "queryType": "", - "refId": "D", - "relativeTimeRange": { - "from": 0, - "to": 0 - } - } - ], - "execErrState": "Alerting", - "folderUID": "ARwYXYzVk", - "for": "0", - "id": 200044, - "labels": { - "channel": "notification" - }, - "noDataState": "OK", - "orgID": 1, - "ruleGroup": "group", - "title": "Contract Balances (Notification)", - "uid": "jPaB_Yk4k_004_Notification" - } -] \ No newline at end of file diff --git a/monitoring/grafana/alerts/audius-token_contract-movements.json b/monitoring/grafana/alerts/audius-token_contract-movements.json index 477ffccdcc4..f2495c729d9 100644 --- a/monitoring/grafana/alerts/audius-token_contract-movements.json +++ b/monitoring/grafana/alerts/audius-token_contract-movements.json @@ -1,7 +1,7 @@ [ { "annotations": { - "__alertId__": "jPaB_Yk4k_005_Notification", + "__alertId__": "jPaB_Yk4k_005_High", "__dashboardUid__": "jPaB_Yk4k", "__panelId__": "5", "description": "", @@ -37,7 +37,7 @@ { "evaluator": { "params": [ - null, + 10, 0 ], "type": "gt" @@ -77,16 +77,16 @@ } ], "execErrState": "Alerting", - "folderUID": "ARwYXYzVk", - "for": "0", - "id": 200054, + "folderUID": "zpejR_iVz", + "for": "0s", + "id": 110051, "labels": { - "channel": "notification" + "channel": "high-alert" }, "noDataState": "OK", "orgID": 1, "ruleGroup": "group", - "title": "Contract Movements (Notification)", - "uid": "jPaB_Yk4k_005_Notification" + "title": "Contract Movements (High)", + "uid": "jPaB_Yk4k_005_High" } ] \ No newline at end of file diff --git a/monitoring/grafana/alerts/audius-token_token-transfers-amount-increase-over-5m.json b/monitoring/grafana/alerts/audius-token_token-transfers-amount-increase-over-5m.json deleted file mode 100644 index 2f857be73ff..00000000000 --- a/monitoring/grafana/alerts/audius-token_token-transfers-amount-increase-over-5m.json +++ /dev/null @@ -1,92 +0,0 @@ -[ - { - "annotations": { - "__alertId__": "jPaB_Yk4k_015_Notification", - "__dashboardUid__": "jPaB_Yk4k", - "__panelId__": "15", - "description": "\n\nOccasional huge transfers are routine. Many large transfers may indicate a mass exodus.\n\nThis panel does not differentiate between a whale and many individuals moving the same amount of tokens.", - "runbook_url": "http://grafana.audius.co/d/jPaB_Yk4k?viewPanel=15", - "summary": "The number of tokens transferred within a rolling 5 minute window.\n\n" - }, - "condition": "B", - "data": [ - { - "datasourceUid": "r2_nnDL7z", - "model": { - "datasource": { - "type": "prometheus", - "uid": "r2_nnDL7z" - }, - "expr": "increase(audius_exporters_alchemy_token_transfer_sum[5m])", - "hide": false, - "intervalMs": 1000, - "maxDataPoints": 43200, - "refId": "A" - }, - "queryType": "", - "refId": "A", - "relativeTimeRange": { - "from": 600, - "to": 0 - } - }, - { - "datasourceUid": "-100", - "model": { - "conditions": [ - { - "evaluator": { - "params": [ - null, - 0 - ], - "type": "gt" - }, - "operator": { - "type": "or" - }, - "query": { - "params": [ - "A" - ] - }, - "reducer": { - "params": [], - "type": "last" - }, - "type": "query" - } - ], - "datasource": { - "name": "Expression", - "type": "__expr__", - "uid": "__expr__" - }, - "hide": false, - "intervalMs": 1000, - "maxDataPoints": 43200, - "refId": "B", - "type": "classic_conditions" - }, - "queryType": "", - "refId": "B", - "relativeTimeRange": { - "from": 0, - "to": 0 - } - } - ], - "execErrState": "Alerting", - "folderUID": "ARwYXYzVk", - "for": "0", - "id": 200154, - "labels": { - "channel": "notification" - }, - "noDataState": "OK", - "orgID": 1, - "ruleGroup": "group", - "title": "Token Transfers Amount (increase over 5m) (Notification)", - "uid": "jPaB_Yk4k_015_Notification" - } -] \ No newline at end of file diff --git a/monitoring/grafana/bin/extract-alerts.py b/monitoring/grafana/bin/extract-alerts.py index c1ceeb8d7f6..297142eadc0 100644 --- a/monitoring/grafana/bin/extract-alerts.py +++ b/monitoring/grafana/bin/extract-alerts.py @@ -70,7 +70,7 @@ def main(filename): panel_alerts = [] for step in panel["fieldConfig"]["defaults"]["thresholds"]["steps"]: # skip the base case when value is not set, nor visible - if "value" not in step or step["value"]: + if "value" not in step or not step["value"]: continue # ensure thresholds are visible @@ -92,7 +92,7 @@ def main(filename): level = "low-alert" level_id = 3 else: - break + continue # UI Ordering: dark, semi-dark, base, light, super-light # Conditional: <= >= diff --git a/monitoring/grafana/bin/extract-alerts.sh b/monitoring/grafana/bin/extract-alerts.sh index bf1a848fa40..7da79889cc4 100755 --- a/monitoring/grafana/bin/extract-alerts.sh +++ b/monitoring/grafana/bin/extract-alerts.sh @@ -19,7 +19,7 @@ set -x # refresh all dashboards and do not strip the dashboard ID CLEAR_DASHBOARD_ID=. ./grafana/bin/save-dashboards.sh -json_dashboards=$(find "${GRAFANA_DASHBOARD_DIR}" -name '*.json' -not -name 'library.json') +json_dashboards=$(find "${GRAFANA_DASHBOARD_DIR}" -name '*.json' -not -name 'library.json' -not -name 'folders.json') for json_dashboard in ${json_dashboards} do diff --git a/monitoring/grafana/bin/save-dashboards.sh b/monitoring/grafana/bin/save-dashboards.sh index b28cd43f8d1..63beb413c69 100755 --- a/monitoring/grafana/bin/save-dashboards.sh +++ b/monitoring/grafana/bin/save-dashboards.sh @@ -55,6 +55,23 @@ CLEAR_LIBRARY_PANEL_UPDATED='del(.panels[].libraryPanel.meta.updated)' # wrap the final output in a different format and use overwrite: true, to avoid .id and .version collisions PUSH_FORMATTING='{dashboard: ., overwrite: true}' +# FOLDERS +# ids have to be unique +CLEAR_FOLDER_IDS='del(.[].id)' + +path=grafana/dashboards/folders.json +response=$(curl \ + -s \ + -H "Authorization: Bearer ${BEARER_TOKEN}" \ + -H 'Content-Type: application/json' \ + -H 'Accept: application/json' \ + ${BASE_URL}/api/folders) + +echo ${response} \ + | jq "${CLEAR_FOLDER_IDS}" \ + > "${path}" +echo "Saved to: ${path}" + path=grafana/dashboards/library.json # save all library panels into a single file curl -s "${PASS_URL}/api/library-elements?perPage=100" \ diff --git a/monitoring/grafana/bin/upload-dashboards.sh b/monitoring/grafana/bin/upload-dashboards.sh index 2a55d72e68d..ee6aaad2beb 100755 --- a/monitoring/grafana/bin/upload-dashboards.sh +++ b/monitoring/grafana/bin/upload-dashboards.sh @@ -17,8 +17,30 @@ set +o allexport BASE_URL=http://${GRAFANA_API_URL}:${GRAFANA_API_PORT} +folders=grafana/dashboards/folders.json +cat ${folders} \ + | jq -cr '.[]' \ + | while read -r folder; + do + echo "Updating: ${folder}" + + # ignore stdout since it really only gets run on setup + # mainly needed for Alerts (Production only) + curl \ + -s \ + -H "Authorization: Bearer ${BEARER_TOKEN}" \ + -u ${GRAFANA_USER}:${GRAFANA_PASS} \ + -X POST \ + -H "Content-Type: application/json" \ + -H "Accept: application/json" \ + -d "${folder}" \ + ${BASE_URL}/api/folders \ + | jq . \ + > /dev/null + done + # upload dashboard json files -json_dashboards=$(find "${GRAFANA_DASHBOARD_DIR}" -name '*.json' -not -name 'library.json') +json_dashboards=$(find "${GRAFANA_DASHBOARD_DIR}" -name '*.json' -not -name 'library.json' -not -name 'folders.json') for json_dashboard in ${json_dashboards} do curl \ diff --git a/monitoring/grafana/bin/upload-library-panels.sh b/monitoring/grafana/bin/upload-library-panels.sh index c97e72e1a62..b2e3a1f385e 100755 --- a/monitoring/grafana/bin/upload-library-panels.sh +++ b/monitoring/grafana/bin/upload-library-panels.sh @@ -15,6 +15,28 @@ set +o allexport BASE_URL=http://${GRAFANA_API_URL}:${GRAFANA_API_PORT} # upload all library panels +cat grafana/dashboards/library.json \ + | jq -cr '.[]' \ + | while read -r panel; + do + response=$(curl \ + -s \ + -H "Authorization: Bearer ${BEARER_TOKEN}" \ + -u ${GRAFANA_USER}:${GRAFANA_PASS} \ + -X POST \ + -H "Content-Type: application/json" \ + -H "Accept: application/json" \ + -d "${panel}" \ + ${BASE_URL}/api/library-elements) + message=$(echo ${response} | jq -r '.message // empty') + if [[ "${message}" =~ .*"library element with that name or UID already exists".* ]]; then + echo "Found: $(echo ${panel} | jq -r .uid)" + else + echo "Created: $(echo ${panel} | jq -r .uid)" + echo ${response} | jq . + fi + done + cat grafana/dashboards/library.json \ | jq -cr '.[]' \ | while read -r panel; diff --git a/monitoring/grafana/dashboards/folders.json b/monitoring/grafana/dashboards/folders.json new file mode 100644 index 00000000000..a0ed726f57a --- /dev/null +++ b/monitoring/grafana/dashboards/folders.json @@ -0,0 +1,6 @@ +[ + { + "uid": "zpejR_iVz", + "title": "Alerts" + } +] diff --git a/monitoring/scripts/deploy.sh b/monitoring/scripts/deploy.sh index 383e721ea10..5d83d855add 100755 --- a/monitoring/scripts/deploy.sh +++ b/monitoring/scripts/deploy.sh @@ -7,4 +7,5 @@ PROM_ENV="${1:-local}" docker-compose build --build-arg PROM_ENV=${PROM_ENV} docker-compose down +docker network create audius_dev || true docker-compose up -d