From 4899b551619eb98037355b916685ea32d9291d35 Mon Sep 17 00:00:00 2001
From: Yaguang Tang
Date: Tue, 8 Oct 2024 18:49:30 +0800
Subject: [PATCH] Add support for node down and softirq alert

Add three Prometheus alerting rules to the legacy jsonnet rule set:

- NodeHighLoad (info): the 1-minute load average divided by the number
  of CPUs stays above 0.5 for 5 minutes.
- NodeDown (P1): the node-exporter target reports up == 0 for 5 minutes.
- NodeSoftirqRcu (warning): the RCU softirq rate stays above 10000/s
  for 5 minutes.

Every new rule carries summary/description annotations so that the
resulting notification is readable without looking up the expression.

---
Review notes (this section is ignored by git am):
- Line breaks and indentation were reconstructed; confirm the context
  lines against the target file before applying.
- The commit id and index line were not regenerated after the
  annotations were added to NodeHighLoad and NodeDown.
- NodeSoftirqRcu depends on node_softirqs_total; verify that the
  node-exporter softirqs collector is enabled in this deployment.

 .../files/jsonnet/legacy.libsonnet            | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/roles/kube_prometheus_stack/files/jsonnet/legacy.libsonnet b/roles/kube_prometheus_stack/files/jsonnet/legacy.libsonnet
index b11d8a6f0..955da8312 100644
--- a/roles/kube_prometheus_stack/files/jsonnet/legacy.libsonnet
+++ b/roles/kube_prometheus_stack/files/jsonnet/legacy.libsonnet
@@ -142,6 +142,18 @@
               labels: {
                 severity: 'warning',
               },
+            },
+            {
+              alert: 'NodeHighLoad',
+              expr: 'node_load1 / count(node_cpu_seconds_total{mode="system"}) without (cpu, mode) > 0.5',
+              'for': '5m',
+              labels: {
+                severity: 'info',
+              },
+              annotations: {
+                summary: 'High load on node {{ $labels.instance }}: {{ $value }}',
+                description: 'The 1-minute load average per CPU has been above 0.5 for 5 minutes.',
+              },
             },
             {
               alert: 'NodeHighMemoryUsage',
@@ -174,6 +186,18 @@
                 severity: 'P5',
               },
             },
+            {
+              alert: 'NodeDown',
+              expr: 'up{job="node-exporter"} == 0',
+              'for': '5m',
+              labels: {
+                severity: 'P1',
+              },
+              annotations: {
+                summary: 'Node {{ $labels.instance }} is down',
+                description: 'The node-exporter target has reported up == 0 for 5 minutes, so the node or its exporter is unreachable.',
+              },
+            },
           ],
         },
         {
@@ -226,6 +250,23 @@
             alertRule('dropped', '0', '0.75'),
           ],
         },
+        {
+          name: 'softirq',
+          rules: [
+            {
+              alert: 'NodeSoftirqRcu',
+              expr: 'rate(node_softirqs_total{vector="rcu"}[1m]) > 10000',
+              'for': '5m',
+              labels: {
+                severity: 'warning',
+              },
+              annotations: {
+                summary: 'High softirq rcu on node {{ $labels.instance }}: {{ $value }} ',
+                description: 'This can result in high software interrupt load on the node which can bring system performance down.',
+              },
+            },
+          ],
+        },
       ],
     },
   },