diff --git a/monitoring/lifecycle/alerts.test.yaml b/monitoring/lifecycle/alerts.test.yaml
index 1df3a5de5..3ec5b0fc9 100644
--- a/monitoring/lifecycle/alerts.test.yaml
+++ b/monitoring/lifecycle/alerts.test.yaml
@@ -117,3 +117,105 @@ tests:
               description: "Less than 50% of lifecycle object processors for expiration are up and healthy"
               summary: "Degraded lifecycle object processor"
 
+  # The bucket-processor series is non-zero from 2m through 8m, so with the rule's 5m `For`
+  # the alert is firing at 7m and 8m only. The single blip on object-processor never fires.
+  - name: KafkaConsumerSlowTask
+    interval: 1m
+    input_series:
+      - series: s3_zenko_queue_slowTasks_count{namespace="zenko",job="artesca-data-backbeat-object-processor-headless"}
+        values: 0 0 0 0 1 0 0 0 0 0
+      - series: s3_zenko_queue_slowTasks_count{namespace="zenko",job="artesca-data-backbeat-bucket-processor-headless"}
+        values: 0 0 1 1 1 1 1 1 1 0
+    alert_rule_test:
+      - alertname: KafkaConsumerSlowTask
+        eval_time: 1m
+        exp_alerts: []
+      - alertname: KafkaConsumerSlowTask
+        eval_time: 3m
+        exp_alerts: []
+      - alertname: KafkaConsumerSlowTask
+        eval_time: 5m
+        exp_alerts: []
+      - alertname: KafkaConsumerSlowTask
+        eval_time: 6m
+        exp_alerts: []
+      - alertname: KafkaConsumerSlowTask
+        eval_time: 7m
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              job: artesca-data-backbeat-bucket-processor-headless
+            exp_annotations:
+              description: >-
+                Some tasks are taking too long to process in artesca-data-backbeat-bucket-processor-headless. This is not expected, and
+                may be a sign that other components are not behaving nominally or may need to be scaled.
+
+                If this alert lasts, it may mean the task is blocked, and that the consumer should be
+                restarted.
+              summary: Some Kafka messages are taking too long to process
+      - alertname: KafkaConsumerSlowTask
+        eval_time: 8m
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              job: artesca-data-backbeat-bucket-processor-headless
+            exp_annotations:
+              description: >-
+                Some tasks are taking too long to process in artesca-data-backbeat-bucket-processor-headless. This is not expected, and
+                may be a sign that other components are not behaving nominally or may need to be scaled.
+
+                If this alert lasts, it may mean the task is blocked, and that the consumer should be
+                restarted.
+              summary: Some Kafka messages are taking too long to process
+      - alertname: KafkaConsumerSlowTask
+        eval_time: 9m
+        exp_alerts: []
+
+  # Pod foo reports a rebalance timeout at 1m and its series goes stale at 4m;
+  # pod bar only reports drained rebalances, so it must never trigger the alert.
+  - name: KafkaConsumerRebalance
+    interval: 1m
+    input_series:
+      - series: s3_zenko_queue_rebalance_total_count{namespace="zenko",job="artesca-data-backbeat-object-processor-headless",status="drained",pod="foo"}
+        values: 1 2 _ _ stale
+      - series: s3_zenko_queue_rebalance_total_count{namespace="zenko",job="artesca-data-backbeat-object-processor-headless",status="timeout",pod="foo"}
+        values: _ 1 _ _ stale
+      - series: s3_zenko_queue_rebalance_total_count{namespace="zenko",job="artesca-data-backbeat-object-processor-headless",status="drained",pod="bar"}
+        values: _ _ 1 2 3 4 5 6 7
+      - series: s3_zenko_queue_rebalance_total_count{namespace="zenko",job="artesca-data-backbeat-object-processor-headless",status="timeout",pod="bar"}
+        values: _ _ 0 0 0 0 0 0 0
+    alert_rule_test:
+      - alertname: KafkaConsumerRebalanceTimeout
+        eval_time: 0m
+        exp_alerts: []
+      - alertname: KafkaConsumerRebalanceTimeout
+        eval_time: 1m
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              pod: "foo"
+            exp_annotations:
+              summary: Kafka consumer has stopped consuming messages
+              description: Kafka rebalance has timed out for pod `foo`, which indicates that the consumer is not working anymore, and should be restarted.
+      - alertname: KafkaConsumerRebalanceTimeout
+        eval_time: 2m
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              pod: "foo"
+            exp_annotations:
+              summary: Kafka consumer has stopped consuming messages
+              description: Kafka rebalance has timed out for pod `foo`, which indicates that the consumer is not working anymore, and should be restarted.
+      - alertname: KafkaConsumerRebalanceTimeout
+        eval_time: 3m
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              pod: "foo"
+            exp_annotations:
+              summary: Kafka consumer has stopped consuming messages
+              description: Kafka rebalance has timed out for pod `foo`, which indicates that the consumer is not working anymore, and should be restarted.
+      # foo's timeout series is stale at 4m, so the alert must have cleared.
+      - alertname: KafkaConsumerRebalanceTimeout
+        eval_time: 4m
+        exp_alerts: []
diff --git a/monitoring/lifecycle/alerts.yaml b/monitoring/lifecycle/alerts.yaml
index 38254b645..3558a2460 100644
--- a/monitoring/lifecycle/alerts.yaml
+++ b/monitoring/lifecycle/alerts.yaml
@@ -181,3 +181,30 @@ groups:
       description: "More than 5% of Kafka messages failed to publish to the lifecycle object topic"
       summary: "High rate of failed messages to the object topic"
 
+  # Warn, per job, when the slow-task count stays above zero for 5 minutes.
+  - alert: KafkaConsumerSlowTask
+    Expr: |
+      sum(s3_zenko_queue_slowTasks_count{namespace="${namespace}"}) by(job) > 0
+    For: "5m"
+    Labels:
+      severity: warning
+    Annotations:
+      description: >-
+        Some tasks are taking too long to process in {{ $labels.job }}. This is not expected, and
+        may be a sign that other components are not behaving nominally or may need to be scaled.
+
+        If this alert lasts, it may mean the task is blocked, and that the consumer should be
+        restarted.
+      summary: "Some Kafka messages are taking too long to process"
+
+  # No `For` clause: fires, per pod, as soon as a rebalance timeout is counted.
+  - alert: KafkaConsumerRebalanceTimeout
+    Expr: |
+      sum(s3_zenko_queue_rebalance_total_count{namespace="${namespace}", status="timeout"}) by(pod) > 0
+    Labels:
+      severity: critical
+    Annotations:
+      description: >-
+        Kafka rebalance has timed out for pod `{{ $labels.pod }}`, which indicates that the consumer
+        is not working anymore, and should be restarted.
+      summary: "Kafka consumer has stopped consuming messages"