From b2fa0b54f19c22bb87835df4f9f7b9c7cfc99d81 Mon Sep 17 00:00:00 2001 From: suiguoxin Date: Tue, 10 Nov 2020 15:41:01 +0800 Subject: [PATCH] fix NodeGpuCountChanged definition issue --- src/prometheus/deploy/alerting/gpu.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prometheus/deploy/alerting/gpu.rules b/src/prometheus/deploy/alerting/gpu.rules index 90e670f947..d6dae182eb 100644 --- a/src/prometheus/deploy/alerting/gpu.rules +++ b/src/prometheus/deploy/alerting/gpu.rules @@ -71,7 +71,7 @@ groups: summary: "found nvidia used by zombie container in {{$labels.instance}}" - alert: NodeGpuCountChanged - expr: changes(node:gpu_utilization:count[5m]) > 0 + expr: node:gpu_utilization:count != on (instance) configured_gpu_count labels: severity: fatal annotations: