add new loki compactor failing alert (#1375)
* add new loki compactor failing alert

* Update test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml

Co-authored-by: Hervé Nicol <hervenicol@users.noreply.github.com>

* Update test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml

* Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml

* Update test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml

* Update test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml

* Add loki compaction failing since startup

* Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml

* Update helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml

* Update test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml

Co-authored-by: Hervé Nicol <hervenicol@users.noreply.github.com>

---------

Co-authored-by: Hervé Nicol <hervenicol@users.noreply.github.com>
QuentinBisson and hervenicol authored Sep 25, 2024
1 parent 5637471 commit 8a49c9f
Showing 5 changed files with 107 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Add `LokiCompactorFailedCompaction` alert that fires when Loki has not managed to run a successful compaction in the last 2 hours.

### Removed

- Remove CRsync alerting rules.
helm/prometheus-rules/templates/platform/atlas/alerting-rules/loki.rules.yml
@@ -135,3 +135,39 @@ spec:
severity: page
team: atlas
topic: observability
- name: loki.compactor
rules:
- alert: LokiCompactorFailedCompaction
annotations:
dashboard: loki-retention/loki-retention
description: 'Loki compactor has been failing compactions for more than 2 hours since last compaction.'
opsrecipe: loki#lokicompactorfailedcompaction
# This alert checks whether Loki's last successful compaction run is older than 2 hours.
expr: (time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds > 0) > 60 * 60 * 2)
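# For illustration (the numbers here are hypothetical, not taken from the rule): if the last successful run
# was recorded at t=1000s and time() is now t=9000s, the difference is 8000s, which exceeds 7200s (2h), so
# the expression returns a result and the alert pages once it has been true for the 1h `for` duration.
# The `> 0` filter drops series that have never recorded a success; that start-up case is handled by the
# next rule below.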
for: 1h
labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
- alert: LokiCompactorFailedCompaction
annotations:
dashboard: loki-retention/loki-retention
description: 'Loki compactor has been failing compactions for more than 2 hours since start-up.'
opsrecipe: loki#lokicompactorfailedcompaction
# This alert covers the special case at compactor start-up, where the "normal" alert above would always consider the `0` timestamp to be more than 2 hours in the past, while we still want to grant the compactor 2 hours + the `for` duration before paging.
expr: max(max_over_time(loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[2h])) by (cluster_id, installation, provider, pipeline) == 0
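# In other words: the timestamp metric stays at 0 from compactor start-up until the first successful
# compaction, so a 2-hour max_over_time that is still 0 means nothing succeeded during that window;
# together with the 1h `for`, the compactor gets roughly 3 hours after start-up before this pages.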
for: 1h
labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
severity: page
team: atlas
topic: observability
helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.rules.yml
@@ -166,11 +166,13 @@ spec:
severity: page
team: atlas
topic: observability
- name: mimir.compactor
rules:
- alert: MimirCompactorFailedCompaction
annotations:
dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources
description: 'Mimir compactor has been failing its compactions for 2 hours.'
opsrecipe: mimir/
opsrecipe: mimir#mimircompactorfailedcompaction
# Query is based on the following upstream mixin alerting rule: https://github.com/grafana/mimir/blob/main/operations/mimir-mixin-compiled/alerts.yaml#L858
expr: sum(increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h])) by (cluster_id, installation, namespace, pipeline, provider) > 2
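# In plain terms (an editorial reading of the expression, not part of the upstream mixin): sum the increase
# in failed compactor runs (ignoring shutdown-caused failures) over the last 2 hours per cluster, and fire
# only once more than 2 runs have failed.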
labels:
test/tests/providers/capi/capa-mimir/platform/atlas/alerting-rules/mimir.rules.test.yml
@@ -387,9 +387,8 @@ tests:
# Test for MimirCompactorFailedCompaction alert
- interval: 1m
input_series:
# mimir-ingester real memory usage gradually decreases until it goes below 30% of the memory requests.
- series: 'cortex_compactor_runs_failed_total{reason="error", installation="golem", cluster_id="golem", namespace="mimir", pipeline="testing", provider="capa"}'
values: "8+0x20 1+0x40 0+0x20 4+0x130 0+0x190"
alert_rule_test:
- alertname: MimirCompactorFailedCompaction
eval_time: 15m
@@ -415,7 +414,7 @@
exp_annotations:
dashboard: 09a5c49e9cdb2f2b24c6d184574a07fd/mimir-compactor-resources
description: Mimir compactor has been failing its compactions for 2 hours.
opsrecipe: "mimir/"
opsrecipe: "mimir#mimircompactorfailedcompaction"
- alertname: MimirCompactorFailedCompaction
eval_time: 205m
- alertname: MimirCompactorFailedCompaction
test/tests/providers/global/platform/atlas/alerting-rules/loki.rules.test.yml
@@ -227,3 +227,65 @@ tests:
opsrecipe: loki/
- alertname: LokiHpaReachedMaxReplicas
eval_time: 515m

# Test for LokiCompactorFailedCompaction since last compaction alert
- interval: 1m
input_series:
- series: 'loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{cluster_id="golem", installation="golem", pipeline="testing", provider="capa"}'
values: "1x240 14400+60x100" # compactions worked once at the first second the does not work for the first 240 minutes so the timestamp stays still, then it gets continuously updated after 240 minutes to a valid timestamp (which is number of seconds since start for the test).
alert_rule_test:
- alertname: LokiCompactorFailedCompaction
eval_time: 15m
- alertname: LokiCompactorFailedCompaction
eval_time: 230m
exp_alerts:
- exp_labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: golem
installation: "golem"
pipeline: "testing"
provider: "capa"
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: loki-retention/loki-retention
description: Loki compactor has been failing compactions for more than 2 hours since last compaction.
opsrecipe: "loki#lokicompactorfailedcompaction"
- alertname: LokiCompactorFailedCompaction
eval_time: 300m

# Test for LokiCompactorFailedCompaction since start-up alert
- interval: 1m
input_series:
- series: 'loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{cluster_id="grizzly", installation="grizzly", pipeline="testing", provider="capz"}'
values: "0x240 14400+60x100" # compactions did not work since start-up for the first 240 minutes so the timestamp stays at 0, then it gets continuously updated after 240 minutes to a valid timestamp (which is number of seconds since start for the test).
alert_rule_test:
- alertname: LokiCompactorFailedCompaction
eval_time: 15m
- alertname: LokiCompactorFailedCompaction
eval_time: 230m
exp_alerts:
- exp_labels:
area: platform
cancel_if_cluster_status_creating: "true"
cancel_if_cluster_status_deleting: "true"
cancel_if_cluster_status_updating: "true"
cancel_if_outside_working_hours: "true"
cluster_id: grizzly
installation: "grizzly"
pipeline: "testing"
provider: "capz"
severity: page
team: atlas
topic: observability
exp_annotations:
dashboard: loki-retention/loki-retention
description: Loki compactor has been failing compactions for more than 2 hours since start-up.
opsrecipe: "loki#lokicompactorfailedcompaction"
- alertname: LokiCompactorFailedCompaction
eval_time: 300m
