From c375c42433d3814dcd7a0ed0c3d122c8ca774223 Mon Sep 17 00:00:00 2001 From: QuantumEnigmaa Date: Tue, 10 Sep 2024 11:43:38 +0200 Subject: [PATCH 01/13] add MimirContinuousTestFailingOnWrites and MimirContinuousTestFailingOnReads alerts --- CHANGELOG.md | 1 + .../atlas/alerting-rules/mimir.rules.yml | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21604b25..444294c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add `MimirHPAReachedMaxReplicas` alert, to detect when Mimir's HPAs have reached maximum capacity. +- Add `MimirContinuousTestFailingOnWrites` and `MimirContinuousTestFailingOnReads` alerts. ### Changed diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 2189daee..b41871b0 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -182,4 +182,38 @@ spec: severity: page team: atlas topic: observability + - alert: MimirContinuousTestFailingOnWrites + annotations: + description: 'Mimir continous-test detected errors in the write path.' + opsrecipe: mimir/ + # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097 + expr: sum by(cluster_id, installation, namespace, pipeline, provider, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + #TODO dashboard: + severity: page + team: atlas + topic: observability + - alert: MimirContinuousTestFailingOnReads + annotations: + description: 'Mimir continous-test detected errors in the write path.' + opsrecipe: mimir/ + # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097 + expr: sum by(cluster_id, installation, namespace, pipeline, provider, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + # TODO dashboard: + severity: page + team: atlas + topic: observability {{- end }} From 36a8c3ffd349ea644feb1026dd10ee7063138f44 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 14:50:05 +0100 Subject: [PATCH 02/13] Update CHANGELOG.md --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71b844d4..a25c139f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add `MimirContinuousTestFailingOnWrites` and `MimirContinuousTestFailingOnReads` alerts. + ### Removed - Remove the `mimir.enabled` property to replace it with the MC flavor as all CAPI MCs now run Mimir. @@ -16,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fix `MonitoringAgentDown` to page when both prometheus-agent and alloy-metrics jobs are missing. - +https://github.com/search?q=repo%3Agiantswarm%2Fmc-bootstrap%20urandom&type=codeCon ## [4.24.0] - 2024-11-12 ### Added @@ -158,7 +162,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add aggregations for slo metrics to export them to grafana cloud - Add `MimirHPAReachedMaxReplicas` alert, to detect when Mimir's HPAs have reached maximum capacity. -- Add `MimirContinuousTestFailingOnWrites` and `MimirContinuousTestFailingOnReads` alerts. ### Changed From 360c3ee95a7c37d75f12cbef19564338bbe8186d Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 15:03:12 +0100 Subject: [PATCH 03/13] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a25c139f..71abab8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fix `MonitoringAgentDown` to page when both prometheus-agent and alloy-metrics jobs are missing. -https://github.com/search?q=repo%3Agiantswarm%2Fmc-bootstrap%20urandom&type=codeCon + ## [4.24.0] - 2024-11-12 ### Added From 70ca9880edca51eb2b8ae0d1bb4d566e6da1f036 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 15:07:04 +0100 Subject: [PATCH 04/13] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml --- .../templates/platform/atlas/alerting-rules/mimir.rules.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index a6d96078..7ee1b926 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -171,6 +171,7 @@ spec: topic: observability - alert: MimirContinuousTestFailingOnWrites annotations: + dashboard: bdxh7hszfgmbkc/mimir-continous-test description: 'Mimir continous-test detected errors in the write path.' opsrecipe: mimir/ # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097 From be82d78511ce3cc7925b7156389ee42d5359e755 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 15:07:30 +0100 Subject: [PATCH 05/13] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml --- .../templates/platform/atlas/alerting-rules/mimir.rules.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 7ee1b926..05eb17f9 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -183,7 +183,6 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" - #TODO dashboard: severity: page team: atlas topic: observability From dc61d6d9b85f133e43736483720d19fbd260f3dc Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 15:07:46 +0100 Subject: [PATCH 06/13] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml --- .../templates/platform/atlas/alerting-rules/mimir.rules.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 05eb17f9..61fe9d33 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -188,6 +188,7 @@ spec: topic: observability - alert: MimirContinuousTestFailingOnReads annotations: + dashboard: bdxh7hszfgmbkc/mimir-continous-test description: 'Mimir continous-test detected errors in the write path.' opsrecipe: mimir/ # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097 From 1d2797ac31b087ea4286758656db75be70d54428 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 15:08:00 +0100 Subject: [PATCH 07/13] Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml --- .../templates/platform/atlas/alerting-rules/mimir.rules.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 61fe9d33..10e68506 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -200,7 +200,6 @@ spec: cancel_if_cluster_status_deleting: "true" cancel_if_cluster_status_updating: "true" cancel_if_outside_working_hours: "true" - # TODO dashboard: severity: page team: atlas topic: observability From 5d3aa9e993ad911222bc4b0e36d91a0f40fc01bb Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 12 Nov 2024 15:15:06 +0100 Subject: [PATCH 08/13] add tests for rules --- .../atlas/alerting-rules/mimir.rules.yml | 6 +- .../atlas/alerting-rules/mimir.rules.test.yml | 62 +++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 10e68506..e42aef59 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -169,9 +169,11 @@ spec: severity: page team: atlas topic: observability + - name: mimir.continuous-test + rules: - alert: MimirContinuousTestFailingOnWrites annotations: - dashboard: bdxh7hszfgmbkc/mimir-continous-test + dashboard: mimir-continous-test/mimir-continous-test description: 'Mimir continous-test detected errors in the write path.' opsrecipe: mimir/ # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097 @@ -188,7 +190,7 @@ spec: topic: observability - alert: MimirContinuousTestFailingOnReads annotations: - dashboard: bdxh7hszfgmbkc/mimir-continous-test + dashboard: mimir-continous-test/mimir-continous-test description: 'Mimir continous-test detected errors in the write path.' opsrecipe: mimir/ # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097 diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml index 6bdfeaea..7e575ac9 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -390,3 +390,65 @@ tests: eval_time: 205m - alertname: MimirCompactorFailedCompaction eval_time: 350m + + # Test for MimirContinuousTestFailingOnWrites alert + - interval: 1m + input_series: + # Test: none, rate > 0, rate = 0 + - series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirContinuousTestFailingOnWrites + eval_time: 40m + - alertname: MimirContinuousTestFailingOnWrites + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continous-test detected errors in the write path." + opsrecipe: "mimir/" + - alertname: MimirContinuousTestFailingOnWrites + eval_time: 160m + + # Test for MimirContinuousTestFailingOnReads alert + - interval: 1m + input_series: + # Test: none, rate > 0, rate = 0 + - series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirContinuousTestFailingOnReads + eval_time: 40m + - alertname: MimirContinuousTestFailingOnReads + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + namespace: mimir + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continous-test detected errors in the write path." + opsrecipe: "mimir/" + - alertname: MimirContinuousTestFailingOnReads + eval_time: 160m From 4963d56a46f1862c9d9b6a33a2c75ecca3ef56e9 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Tue, 12 Nov 2024 15:31:02 +0100 Subject: [PATCH 09/13] fix tests --- .../atlas/alerting-rules/mimir.rules.yml | 25 ++++- .../atlas/alerting-rules/mimir.rules.test.yml | 48 ++++++++- .../atlas/alerting-rules/mimir.rules.test.yml | 102 ++++++++++++++++++ 3 files changed, 167 insertions(+), 8 deletions(-) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index e42aef59..9e39be3e 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -174,9 +174,9 @@ spec: - alert: MimirContinuousTestFailingOnWrites annotations: dashboard: mimir-continous-test/mimir-continous-test - description: 'Mimir continous-test detected errors in the write path.' + description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because writes are failing.`}}' opsrecipe: mimir/ - # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097 + # Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1196 expr: sum by(cluster_id, installation, namespace, pipeline, provider, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 for: 1h labels: @@ -191,9 +191,9 @@ spec: - alert: MimirContinuousTestFailingOnReads annotations: dashboard: mimir-continous-test/mimir-continous-test - description: 'Mimir continous-test detected errors in the write path.' + description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because queries are failing.`}}' opsrecipe: mimir/ - # Query is based on the following upstream mixin alerting rule : https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L1097 + # Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1185 expr: sum by(cluster_id, installation, namespace, pipeline, provider, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 for: 1h labels: @@ -205,4 +205,21 @@ spec: severity: page team: atlas topic: observability + - alert: MimirContinuousTestFailed + annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because queries are failing.`}}' + opsrecipe: mimir/ + # Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/b873372adbf0996bff70de55934f3dd4a10c7b89/operations/mimir-mixin-compiled/alerts.yaml#L1205 + expr: sum by(cluster_id, installation, pipeline, provider, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability {{- end }} diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml index 7e575ac9..c95c1b31 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -395,7 +395,7 @@ tests: - interval: 1m input_series: # Test: none, rate > 0, rate = 0 - - series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + - series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' values: "_x20 1+1x80 0+0x70" alert_rule_test: - alertname: MimirContinuousTestFailingOnWrites @@ -412,12 +412,15 @@ tests: cluster_id: golem installation: golem namespace: mimir + pipeline: testing + provider: capa severity: page team: atlas + test: continuous-test topic: observability exp_annotations: dashboard: mimir-continous-test/mimir-continous-test - description: "Mimir continous-test detected errors in the write path." + description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because writes are failing." opsrecipe: "mimir/" - alertname: MimirContinuousTestFailingOnWrites eval_time: 160m @@ -426,7 +429,7 @@ tests: - interval: 1m input_series: # Test: none, rate > 0, rate = 0 - - series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + - series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' values: "_x20 1+1x80 0+0x70" alert_rule_test: - alertname: MimirContinuousTestFailingOnReads @@ -443,12 +446,49 @@ tests: cluster_id: golem installation: golem namespace: mimir + pipeline: testing + provider: capa severity: page team: atlas + test: continuous-test topic: observability exp_annotations: dashboard: mimir-continous-test/mimir-continous-test - description: "Mimir continous-test detected errors in the write path." + description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing." opsrecipe: "mimir/" - alertname: MimirContinuousTestFailingOnReads eval_time: 160m + + # Test for MimirContinuousTestFailed alert + - interval: 1m + input_series: + # Test: none, rate > 0, rate = 0 + - series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirContinuousTestFailed + eval_time: 40m + - alertname: MimirContinuousTestFailed + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + namespace: mimir + pipeline: testing + provider: capa + severity: page + team: atlas + topic: observability + test: continuous-test + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing." + opsrecipe: "mimir/" + - alertname: MimirContinuousTestFailed + eval_time: 160m diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml index 6bdfeaea..c95c1b31 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -390,3 +390,105 @@ tests: eval_time: 205m - alertname: MimirCompactorFailedCompaction eval_time: 350m + + # Test for MimirContinuousTestFailingOnWrites alert + - interval: 1m + input_series: + # Test: none, rate > 0, rate = 0 + - series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirContinuousTestFailingOnWrites + eval_time: 40m + - alertname: MimirContinuousTestFailingOnWrites + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + namespace: mimir + pipeline: testing + provider: capa + severity: page + team: atlas + test: continuous-test + topic: observability + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because writes are failing." + opsrecipe: "mimir/" + - alertname: MimirContinuousTestFailingOnWrites + eval_time: 160m + + # Test for MimirContinuousTestFailingOnReads alert + - interval: 1m + input_series: + # Test: none, rate > 0, rate = 0 + - series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirContinuousTestFailingOnReads + eval_time: 40m + - alertname: MimirContinuousTestFailingOnReads + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + namespace: mimir + pipeline: testing + provider: capa + severity: page + team: atlas + test: continuous-test + topic: observability + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing." + opsrecipe: "mimir/" + - alertname: MimirContinuousTestFailingOnReads + eval_time: 160m + + # Test for MimirContinuousTestFailed alert + - interval: 1m + input_series: + # Test: none, rate > 0, rate = 0 + - series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + values: "_x20 1+1x80 0+0x70" + alert_rule_test: + - alertname: MimirContinuousTestFailed + eval_time: 40m + - alertname: MimirContinuousTestFailed + eval_time: 95m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: golem + installation: golem + namespace: mimir + pipeline: testing + provider: capa + severity: page + team: atlas + topic: observability + test: continuous-test + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing." + opsrecipe: "mimir/" + - alertname: MimirContinuousTestFailed + eval_time: 160m From 9f1b29ef190953b919c2e6692b92904735365f1e Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Tue, 12 Nov 2024 17:22:15 +0100 Subject: [PATCH 10/13] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71abab8a..380b1abc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add `MimirContinuousTestFailingOnWrites` and `MimirContinuousTestFailingOnReads` alerts. +- Add `MimirContinuousTestFailingOnWrites`, `MimirContinuousTestFailingOnReads` and `MimirContinuousTestFailed` alerts. ### Removed From d42aeecd7f700f649a01c4eda362605388f69b44 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Thu, 14 Nov 2024 17:01:43 +0100 Subject: [PATCH 11/13] add mimir continuous test missing alert --- .../atlas/alerting-rules/mimir.rules.yml | 28 ++++++++++ .../atlas/alerting-rules/mimir.rules.test.yml | 52 +++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 9e39be3e..730f106f 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -222,4 +222,32 @@ spec: severity: page team: atlas topic: observability + - alert: MimirContinuousTestMissing + annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: '{{`Mimir continuous test {{ $labels.cluster_id }} is not producing metrics.`}}' + opsrecipe: mimir/ + expr: | + sum by(cluster_id, installation, pipeline, provider) ( + rate(mimir_continuous_test_writes_total[10m]) == 0 + or absent( + mimir_continuous_test_writes_total{ + cluster_type="management_cluster", + cluster_id="{{ .Values.managementCluster.name }}", + installation="{{ .Values.managementCluster.name }}", + provider="{{ .Values.managementCluster.provider.kind }}", + pipeline="{{ .Values.managementCluster.pipeline }}" + } + ) + ) + for: 1h + labels: + area: platform + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cancel_if_outside_working_hours: "true" + severity: page + team: atlas + topic: observability {{- end }} diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml index c95c1b31..96bfa0d4 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -492,3 +492,55 @@ tests: opsrecipe: "mimir/" - alertname: MimirContinuousTestFailed eval_time: 160m + + # Test for MimirContinuousTestMissing alert + - interval: 1m + input_series: + # Test: none, rate > 0, rate = 0 + - series: 'mimir_continuous_test_writes_total{cluster_id="myinstall", cluster_type="management_cluster", installation="myinstall", namespace="mimir", pipeline="stable", provider="capa"}' + values: "_x80 1+1x80 0+0x80" + alert_rule_test: + - alertname: MimirContinuousTestMissing + eval_time: 40m + - alertname: MimirContinuousTestMissing + eval_time: 70m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: myinstall + installation: myinstall + pipeline: stable + provider: capa + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continuous test myinstall is not producing metrics." + opsrecipe: "mimir/" + - alertname: MimirContinuousTestMissing + eval_time: 150m + - alertname: MimirContinuousTestMissing + eval_time: 205m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: myinstall + installation: myinstall + pipeline: stable + provider: capa + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continuous test myinstall is not producing metrics." + opsrecipe: "mimir/" From a3e3bc3ffdab10276c15f26fa652a7e20e9ec491 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Thu, 14 Nov 2024 17:03:15 +0100 Subject: [PATCH 12/13] add new test --- CHANGELOG.md | 6 +- .../atlas/alerting-rules/mimir.rules.yml | 2 +- .../atlas/alerting-rules/mimir.rules.test.yml | 8 +-- .../atlas/alerting-rules/mimir.rules.test.yml | 60 +++++++++++++++++-- 4 files changed, 66 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 380b1abc..251b6354 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Add `MimirContinuousTestFailingOnWrites`, `MimirContinuousTestFailingOnReads` and `MimirContinuousTestFailed` alerts. +- Add new mimir continuous test alerts: + - `MimirContinuousTestFailingOnWrites` + - `MimirContinuousTestFailingOnReads` + - `MimirContinuousTestMissing` + - `MimirContinuousTestFailing` ### Removed diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml index 730f106f..30b677d4 100644 --- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml +++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml @@ -205,7 +205,7 @@ spec: severity: page team: atlas topic: observability - - alert: MimirContinuousTestFailed + - alert: MimirContinuousTestFailing annotations: dashboard: mimir-continous-test/mimir-continous-test description: '{{`Mimir continuous test {{ $labels.test }} in {{ $labels.cluster_id }}/{{ $labels.namespace }} is not effectively running because queries are failing.`}}' diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml index 96bfa0d4..000c0724 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -459,16 +459,16 @@ tests: - alertname: MimirContinuousTestFailingOnReads eval_time: 160m - # Test for MimirContinuousTestFailed alert + # Test for MimirContinuousTestFailing alert - interval: 1m input_series: # Test: none, rate > 0, rate = 0 - series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' values: "_x20 1+1x80 0+0x70" alert_rule_test: - - alertname: MimirContinuousTestFailed + - alertname: MimirContinuousTestFailing eval_time: 40m - - alertname: MimirContinuousTestFailed + - alertname: MimirContinuousTestFailing eval_time: 95m exp_alerts: - exp_labels: @@ -490,7 +490,7 @@ tests: dashboard: mimir-continous-test/mimir-continous-test description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing." opsrecipe: "mimir/" - - alertname: MimirContinuousTestFailed + - alertname: MimirContinuousTestFailing eval_time: 160m # Test for MimirContinuousTestMissing alert diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml index c95c1b31..000c0724 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -459,16 +459,16 @@ tests: - alertname: MimirContinuousTestFailingOnReads eval_time: 160m - # Test for MimirContinuousTestFailed alert + # Test for MimirContinuousTestFailing alert - interval: 1m input_series: # Test: none, rate > 0, rate = 0 - series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' values: "_x20 1+1x80 0+0x70" alert_rule_test: - - alertname: MimirContinuousTestFailed + - alertname: MimirContinuousTestFailing eval_time: 40m - - alertname: MimirContinuousTestFailed + - alertname: MimirContinuousTestFailing eval_time: 95m exp_alerts: - exp_labels: @@ -490,5 +490,57 @@ tests: dashboard: mimir-continous-test/mimir-continous-test description: "Mimir continuous test continuous-test in golem/mimir is not effectively running because queries are failing." opsrecipe: "mimir/" - - alertname: MimirContinuousTestFailed + - alertname: MimirContinuousTestFailing eval_time: 160m + + # Test for MimirContinuousTestMissing alert + - interval: 1m + input_series: + # Test: none, rate > 0, rate = 0 + - series: 'mimir_continuous_test_writes_total{cluster_id="myinstall", cluster_type="management_cluster", installation="myinstall", namespace="mimir", pipeline="stable", provider="capa"}' + values: "_x80 1+1x80 0+0x80" + alert_rule_test: + - alertname: MimirContinuousTestMissing + eval_time: 40m + - alertname: MimirContinuousTestMissing + eval_time: 70m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: myinstall + installation: myinstall + pipeline: stable + provider: capa + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continuous test myinstall is not producing metrics." + opsrecipe: "mimir/" + - alertname: MimirContinuousTestMissing + eval_time: 150m + - alertname: MimirContinuousTestMissing + eval_time: 205m + exp_alerts: + - exp_labels: + area: platform + cancel_if_outside_working_hours: "true" + cancel_if_cluster_status_creating: "true" + cancel_if_cluster_status_deleting: "true" + cancel_if_cluster_status_updating: "true" + cluster_id: myinstall + installation: myinstall + pipeline: stable + provider: capa + severity: page + team: atlas + topic: observability + exp_annotations: + dashboard: mimir-continous-test/mimir-continous-test + description: "Mimir continuous test myinstall is not producing metrics." + opsrecipe: "mimir/" From 5b82acd77ba2182e0483cc7982b72ad885454439 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Thu, 14 Nov 2024 18:11:51 +0100 Subject: [PATCH 13/13] fix tests --- .../atlas/alerting-rules/mimir.rules.test.yml | 2 +- .../atlas/alerting-rules/mimir.rules.test.yml | 64 +++++++++---------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml index 000c0724..e25ac35e 100644 --- a/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capa/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -525,7 +525,7 @@ tests: - alertname: MimirContinuousTestMissing eval_time: 150m - alertname: MimirContinuousTestMissing - eval_time: 205m + eval_time: 230m exp_alerts: - exp_labels: area: platform diff --git a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml index 000c0724..d6b37c8e 100644 --- a/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml +++ b/test/tests/providers/capi/capz/platform/atlas/alerting-rules/mimir.rules.test.yml @@ -148,24 +148,24 @@ tests: - interval: 1m input_series: # mimir-ingester real memory usage gradually increases until it goes beyond 90% of the memory requests. - - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "8+0x20 11+0x70 8+0x140 11+0x70 8+0x60" - - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "8+0x20 11+0x70 8+0x140 11+0x70 8+0x60" # mimir-ingester memory requests stay the same for the entire duration of the test. - - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "12+0x400" - - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "12+0x400" # mimir-ingester real cpu usage gradually increases until it goes beyond 90% of the cpu requests. - - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "0+60x100 6000+110x70 10400+60x60 14000+110x70 18400+60x60" - - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "0+60x400" # mimir-ingester cpu requests stay the same for the entire duration of the test. - - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "1.5+0x400" - - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "1.5+0x400" alert_rule_test: - alertname: MimirIngesterNeedsToBeScaledUp @@ -182,7 +182,7 @@ tests: cluster_id: golem installation: "golem" pipeline: "testing" - provider: "capa" + provider: "capz" namespace: mimir severity: page team: atlas @@ -204,7 +204,7 @@ tests: cluster_id: golem installation: "golem" pipeline: "testing" - provider: "capa" + provider: "capz" namespace: mimir severity: page team: atlas @@ -226,7 +226,7 @@ tests: cluster_id: golem installation: "golem" pipeline: "testing" - provider: "capa" + provider: "capz" namespace: mimir severity: page team: atlas @@ -240,24 +240,24 @@ tests: - interval: 1m input_series: # mimir-ingester real memory usage gradually decreases until it goes below 30% of the memory requests. - - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60" - - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'container_memory_working_set_bytes{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "8+0x20 2+0x40 8+0x140 2+0x40 8+0x60" # mimir-ingester memory requests stay the same for the entire duration of the test. - - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "12+0x300" - - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="byte", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "12+0x300" # mimir-ingester real cpu usage gradually increases until it goes below 30% of the cpu requests. - - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-0", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "0+60x100 6000+10x40 6400+60x60 10000+10x40 10400+60x60" - - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'container_cpu_usage_seconds_total{pod="mimir-ingester-1", container="ingester", namespace="mimir", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "0+30x300" # mimir-ingester cpu requests stay the same for the entire duration of the test - - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-0", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "1.5+0x300" - - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capa", region="eu-west-2"}' + - series: 'kube_pod_container_resource_requests{pod="mimir-ingester-1", container="ingester", namespace="mimir", unit="core", cluster_type="management_cluster", cluster_id="golem", installation="golem", pipeline="testing", provider="capz", region="eu-west-2"}' values: "1.5+0x300" alert_rule_test: - alertname: MimirIngesterNeedsToBeScaledDown @@ -282,7 +282,7 @@ tests: cluster_id: golem installation: "golem" pipeline: "testing" - provider: "capa" + provider: "capz" namespace: mimir severity: page team: atlas @@ -358,7 +358,7 @@ tests: # Test for MimirCompactorFailedCompaction alert - interval: 1m input_series: - - series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capa"}' + - series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capz"}' values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190" alert_rule_test: - alertname: MimirCompactorFailedCompaction @@ -377,7 +377,7 @@ tests: cluster_id: golem installation: "golem" pipeline: "testing" - provider: "capa" + provider: "capz" namespace: mimir severity: page team: atlas @@ -395,7 +395,7 @@ tests: - interval: 1m input_series: # Test: none, rate > 0, rate = 0 - - series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + - series: 'mimir_continuous_test_writes_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capz"}' values: "_x20 1+1x80 0+0x70" alert_rule_test: - alertname: MimirContinuousTestFailingOnWrites @@ -413,7 +413,7 @@ tests: installation: golem namespace: mimir pipeline: testing - provider: capa + provider: capz severity: page team: atlas test: continuous-test @@ -429,7 +429,7 @@ tests: - interval: 1m input_series: # Test: none, rate > 0, rate = 0 - - series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + - series: 'mimir_continuous_test_queries_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capz"}' values: "_x20 1+1x80 0+0x70" alert_rule_test: - alertname: MimirContinuousTestFailingOnReads @@ -447,7 +447,7 @@ tests: installation: golem namespace: mimir pipeline: testing - provider: capa + provider: capz severity: page team: atlas test: continuous-test @@ -463,7 +463,7 @@ tests: - interval: 1m input_series: # Test: none, rate > 0, rate = 0 - - series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capa"}' + - series: 'mimir_continuous_test_query_result_checks_failed_total{cluster_id="golem", test="continuous-test", installation="golem", namespace="mimir", pipeline="testing", provider="capz"}' values: "_x20 1+1x80 0+0x70" alert_rule_test: - alertname: MimirContinuousTestFailing @@ -481,7 +481,7 @@ tests: installation: golem namespace: mimir pipeline: testing - provider: capa + provider: capz severity: page team: atlas topic: observability @@ -497,7 +497,7 @@ tests: - interval: 1m input_series: # Test: none, rate > 0, rate = 0 - - series: 'mimir_continuous_test_writes_total{cluster_id="myinstall", cluster_type="management_cluster", installation="myinstall", namespace="mimir", pipeline="stable", provider="capa"}' + - series: 'mimir_continuous_test_writes_total{cluster_id="myinstall", cluster_type="management_cluster", installation="myinstall", namespace="mimir", pipeline="stable", provider="capz"}' values: "_x80 1+1x80 0+0x80" alert_rule_test: - alertname: MimirContinuousTestMissing @@ -514,7 +514,7 @@ tests: cluster_id: myinstall installation: myinstall pipeline: stable - provider: capa + provider: capz severity: page team: atlas topic: observability @@ -525,7 +525,7 @@ tests: - alertname: MimirContinuousTestMissing eval_time: 150m - alertname: MimirContinuousTestMissing - eval_time: 205m + eval_time: 230m exp_alerts: - exp_labels: area: platform @@ -536,7 +536,7 @@ tests: cluster_id: myinstall installation: myinstall pipeline: stable - provider: capa + provider: capz severity: page team: atlas topic: observability