-
Notifications
You must be signed in to change notification settings - Fork 109
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
INF production Grafana snapshot (#3890)
- Loading branch information
1 parent
64fb281
commit fb87b2e
Showing
20 changed files
with
11,474 additions
and
1,459 deletions.
There are no files selected for viewing
186 changes: 186 additions & 0 deletions
186
monitoring/grafana/alerts/audius-content-node-bull-queue_waiting-queue-jobs-prod.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
[ | ||
{ | ||
"annotations": { | ||
"__alertId__": "3VPUdqZVz_039_P3_prod", | ||
"__dashboardUid__": "3VPUdqZVz", | ||
"__panelId__": "39", | ||
"description": "If the number of waiting jobs is skyrocketing, it might mean:\n\n* Too many are being created.\n* The jobs for that specific queue are taking too long to execute.", | ||
"runbook_url": "http://grafana.audius.co/d/3VPUdqZVz?viewPanel=39", | ||
"summary": "The number of bull jobs waiting to be executed." | ||
}, | ||
"condition": "B", | ||
"data": [ | ||
{ | ||
"datasourceUid": "r2_nnDL7z", | ||
"model": { | ||
"datasource": { | ||
"type": "prometheus", | ||
"uid": "r2_nnDL7z" | ||
}, | ||
"expr": "audius_cn_jobs_waiting_total{environment=~\"prod\", queue_name=~\".*\", host=~\".*\"}", | ||
"hide": false, | ||
"intervalMs": 1000, | ||
"maxDataPoints": 43200, | ||
"refId": "A" | ||
}, | ||
"queryType": "", | ||
"refId": "A", | ||
"relativeTimeRange": { | ||
"from": 600, | ||
"to": 0 | ||
} | ||
}, | ||
{ | ||
"datasourceUid": "-100", | ||
"model": { | ||
"conditions": [ | ||
{ | ||
"evaluator": { | ||
"params": [ | ||
1100, | ||
0 | ||
], | ||
"type": "gt" | ||
}, | ||
"operator": { | ||
"type": "or" | ||
}, | ||
"query": { | ||
"params": [ | ||
"A" | ||
] | ||
}, | ||
"reducer": { | ||
"params": [], | ||
"type": "last" | ||
}, | ||
"type": "query" | ||
} | ||
], | ||
"datasource": { | ||
"name": "Expression", | ||
"type": "__expr__", | ||
"uid": "__expr__" | ||
}, | ||
"hide": false, | ||
"intervalMs": 1000, | ||
"maxDataPoints": 43200, | ||
"refId": "B", | ||
"type": "classic_conditions" | ||
}, | ||
"queryType": "", | ||
"refId": "B", | ||
"relativeTimeRange": { | ||
"from": 0, | ||
"to": 0 | ||
} | ||
} | ||
], | ||
"execErrState": "Alerting", | ||
"folderUID": "gtl7D0W4z", | ||
"for": "0s", | ||
"id": 3303931, | ||
"labels": { | ||
"alert": "p3", | ||
"env": "prod", | ||
"team": "content" | ||
}, | ||
"noDataState": "OK", | ||
"orgID": 1, | ||
"ruleGroup": "group", | ||
"title": "PROD P3 | Waiting Queue Jobs", | ||
"uid": "3VPUdqZVz_039_P3_prod" | ||
}, | ||
{ | ||
"annotations": { | ||
"__alertId__": "3VPUdqZVz_039_P2_prod", | ||
"__dashboardUid__": "3VPUdqZVz", | ||
"__panelId__": "39", | ||
"description": "If the number of waiting jobs is skyrocketing, it might mean:\n\n* Too many are being created.\n* The jobs for that specific queue are taking too long to execute.", | ||
"runbook_url": "http://grafana.audius.co/d/3VPUdqZVz?viewPanel=39", | ||
"summary": "The number of bull jobs waiting to be executed." | ||
}, | ||
"condition": "B", | ||
"data": [ | ||
{ | ||
"datasourceUid": "r2_nnDL7z", | ||
"model": { | ||
"datasource": { | ||
"type": "prometheus", | ||
"uid": "r2_nnDL7z" | ||
}, | ||
"expr": "audius_cn_jobs_waiting_total{environment=~\"prod\", queue_name=~\".*\", host=~\".*\"}", | ||
"hide": false, | ||
"intervalMs": 1000, | ||
"maxDataPoints": 43200, | ||
"refId": "A" | ||
}, | ||
"queryType": "", | ||
"refId": "A", | ||
"relativeTimeRange": { | ||
"from": 600, | ||
"to": 0 | ||
} | ||
}, | ||
{ | ||
"datasourceUid": "-100", | ||
"model": { | ||
"conditions": [ | ||
{ | ||
"evaluator": { | ||
"params": [ | ||
5000, | ||
0 | ||
], | ||
"type": "gt" | ||
}, | ||
"operator": { | ||
"type": "or" | ||
}, | ||
"query": { | ||
"params": [ | ||
"A" | ||
] | ||
}, | ||
"reducer": { | ||
"params": [], | ||
"type": "last" | ||
}, | ||
"type": "query" | ||
} | ||
], | ||
"datasource": { | ||
"name": "Expression", | ||
"type": "__expr__", | ||
"uid": "__expr__" | ||
}, | ||
"hide": false, | ||
"intervalMs": 1000, | ||
"maxDataPoints": 43200, | ||
"refId": "B", | ||
"type": "classic_conditions" | ||
}, | ||
"queryType": "", | ||
"refId": "B", | ||
"relativeTimeRange": { | ||
"from": 0, | ||
"to": 0 | ||
} | ||
} | ||
], | ||
"execErrState": "Alerting", | ||
"folderUID": "gtl7D0W4z", | ||
"for": "0s", | ||
"id": 3303921, | ||
"labels": { | ||
"alert": "p2", | ||
"env": "prod", | ||
"team": "content" | ||
}, | ||
"noDataState": "OK", | ||
"orgID": 1, | ||
"ruleGroup": "group", | ||
"title": "PROD P2 | Waiting Queue Jobs", | ||
"uid": "3VPUdqZVz_039_P2_prod" | ||
} | ||
] |
186 changes: 186 additions & 0 deletions
186
monitoring/grafana/alerts/audius-content-node-bull-queue_waiting-queue-jobs-stage.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
[ | ||
{ | ||
"annotations": { | ||
"__alertId__": "3VPUdqZVz_039_P3_stage", | ||
"__dashboardUid__": "3VPUdqZVz", | ||
"__panelId__": "39", | ||
"description": "If the number of waiting jobs is skyrocketing, it might mean:\n\n* Too many are being created.\n* The jobs for that specific queue are taking too long to execute.", | ||
"runbook_url": "http://grafana.audius.co/d/3VPUdqZVz?viewPanel=39", | ||
"summary": "The number of bull jobs waiting to be executed." | ||
}, | ||
"condition": "B", | ||
"data": [ | ||
{ | ||
"datasourceUid": "r2_nnDL7z", | ||
"model": { | ||
"datasource": { | ||
"type": "prometheus", | ||
"uid": "r2_nnDL7z" | ||
}, | ||
"expr": "audius_cn_jobs_waiting_total{environment=~\"stage\", queue_name=~\".*\", host=~\".*\"}", | ||
"hide": false, | ||
"intervalMs": 1000, | ||
"maxDataPoints": 43200, | ||
"refId": "A" | ||
}, | ||
"queryType": "", | ||
"refId": "A", | ||
"relativeTimeRange": { | ||
"from": 600, | ||
"to": 0 | ||
} | ||
}, | ||
{ | ||
"datasourceUid": "-100", | ||
"model": { | ||
"conditions": [ | ||
{ | ||
"evaluator": { | ||
"params": [ | ||
1100, | ||
0 | ||
], | ||
"type": "gt" | ||
}, | ||
"operator": { | ||
"type": "or" | ||
}, | ||
"query": { | ||
"params": [ | ||
"A" | ||
] | ||
}, | ||
"reducer": { | ||
"params": [], | ||
"type": "last" | ||
}, | ||
"type": "query" | ||
} | ||
], | ||
"datasource": { | ||
"name": "Expression", | ||
"type": "__expr__", | ||
"uid": "__expr__" | ||
}, | ||
"hide": false, | ||
"intervalMs": 1000, | ||
"maxDataPoints": 43200, | ||
"refId": "B", | ||
"type": "classic_conditions" | ||
}, | ||
"queryType": "", | ||
"refId": "B", | ||
"relativeTimeRange": { | ||
"from": 0, | ||
"to": 0 | ||
} | ||
} | ||
], | ||
"execErrState": "Alerting", | ||
"folderUID": "gtl7D0W4z", | ||
"for": "0s", | ||
"id": 3303930, | ||
"labels": { | ||
"alert": "p3", | ||
"env": "stage", | ||
"team": "content" | ||
}, | ||
"noDataState": "OK", | ||
"orgID": 1, | ||
"ruleGroup": "group", | ||
"title": "STAGE P3 | Waiting Queue Jobs", | ||
"uid": "3VPUdqZVz_039_P3_stage" | ||
}, | ||
{ | ||
"annotations": { | ||
"__alertId__": "3VPUdqZVz_039_P2_stage", | ||
"__dashboardUid__": "3VPUdqZVz", | ||
"__panelId__": "39", | ||
"description": "If the number of waiting jobs is skyrocketing, it might mean:\n\n* Too many are being created.\n* The jobs for that specific queue are taking too long to execute.", | ||
"runbook_url": "http://grafana.audius.co/d/3VPUdqZVz?viewPanel=39", | ||
"summary": "The number of bull jobs waiting to be executed." | ||
}, | ||
"condition": "B", | ||
"data": [ | ||
{ | ||
"datasourceUid": "r2_nnDL7z", | ||
"model": { | ||
"datasource": { | ||
"type": "prometheus", | ||
"uid": "r2_nnDL7z" | ||
}, | ||
"expr": "audius_cn_jobs_waiting_total{environment=~\"stage\", queue_name=~\".*\", host=~\".*\"}", | ||
"hide": false, | ||
"intervalMs": 1000, | ||
"maxDataPoints": 43200, | ||
"refId": "A" | ||
}, | ||
"queryType": "", | ||
"refId": "A", | ||
"relativeTimeRange": { | ||
"from": 600, | ||
"to": 0 | ||
} | ||
}, | ||
{ | ||
"datasourceUid": "-100", | ||
"model": { | ||
"conditions": [ | ||
{ | ||
"evaluator": { | ||
"params": [ | ||
5000, | ||
0 | ||
], | ||
"type": "gt" | ||
}, | ||
"operator": { | ||
"type": "or" | ||
}, | ||
"query": { | ||
"params": [ | ||
"A" | ||
] | ||
}, | ||
"reducer": { | ||
"params": [], | ||
"type": "last" | ||
}, | ||
"type": "query" | ||
} | ||
], | ||
"datasource": { | ||
"name": "Expression", | ||
"type": "__expr__", | ||
"uid": "__expr__" | ||
}, | ||
"hide": false, | ||
"intervalMs": 1000, | ||
"maxDataPoints": 43200, | ||
"refId": "B", | ||
"type": "classic_conditions" | ||
}, | ||
"queryType": "", | ||
"refId": "B", | ||
"relativeTimeRange": { | ||
"from": 0, | ||
"to": 0 | ||
} | ||
} | ||
], | ||
"execErrState": "Alerting", | ||
"folderUID": "gtl7D0W4z", | ||
"for": "0s", | ||
"id": 3303920, | ||
"labels": { | ||
"alert": "p2", | ||
"env": "stage", | ||
"team": "content" | ||
}, | ||
"noDataState": "OK", | ||
"orgID": 1, | ||
"ruleGroup": "group", | ||
"title": "STAGE P2 | Waiting Queue Jobs", | ||
"uid": "3VPUdqZVz_039_P2_stage" | ||
} | ||
] |
Oops, something went wrong.