From b1304fbd8e3d5228a3eefb266a1a60f5ad77440b Mon Sep 17 00:00:00 2001 From: Cezary Zawadka Date: Wed, 15 Feb 2023 10:54:50 +0100 Subject: [PATCH] improve L4FailedHealthCheckCount metric (do not report the same case multiple times) --- pkg/l4lb/l4controller.go | 6 +++++- pkg/l4lb/l4netlbcontroller.go | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pkg/l4lb/l4controller.go b/pkg/l4lb/l4controller.go index dd06818091..4e29d8433b 100644 --- a/pkg/l4lb/l4controller.go +++ b/pkg/l4lb/l4controller.go @@ -153,11 +153,15 @@ func (l4c *L4Controller) checkHealth() error { syncTimeLatest := lastEnqueueTime.Add(enqueueToSyncDelayThreshold) controllerHealth := l4metrics.ControllerHealthyStatus if lastSyncTime.After(syncTimeLatest) { - msg := fmt.Sprintf("L4 ILB Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold) + msg := fmt.Sprintf("L4 ILB Sync happened at time %v, %v after enqueue time, last enqueue time %v, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), lastEnqueueTime, enqueueToSyncDelayThreshold) // Log here, context/http handler do no log the error. klog.Error(msg) l4metrics.PublishL4FailedHealthCheckCount(l4ILBControllerName) controllerHealth = l4metrics.ControllerUnhealthyStatus + // Reset trackers. 
Otherwise, if there is nothing in the queue then it will report the FailedHealthCheckCount every time checkHealth is called + // If checkHealth returned an error (as it is meant to), the container would be restarted and the trackers would be reset anyway + l4c.enqueueTracker.Track() + l4c.syncTracker.Track() } if l4c.enableDualStack { l4metrics.PublishL4ControllerHealthCheckStatus(l4ILBDualStackControllerName, controllerHealth) diff --git a/pkg/l4lb/l4netlbcontroller.go b/pkg/l4lb/l4netlbcontroller.go index 8c528a766a..4230b05e12 100644 --- a/pkg/l4lb/l4netlbcontroller.go +++ b/pkg/l4lb/l4netlbcontroller.go @@ -359,11 +359,15 @@ func (lc *L4NetLBController) checkHealth() error { syncTimeLatest := lastEnqueueTime.Add(enqueueToSyncDelayThreshold) controllerHealth := metrics.ControllerHealthyStatus if lastSyncTime.After(syncTimeLatest) { - msg := fmt.Sprintf("L4 External LoadBalancer Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold) + msg := fmt.Sprintf("L4 NetLB Sync happened at time %v, %v after enqueue time, last enqueue time %v, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), lastEnqueueTime, enqueueToSyncDelayThreshold) // Log here, context/http handler do no log the error. klog.Error(msg) metrics.PublishL4FailedHealthCheckCount(l4NetLBControllerName) controllerHealth = metrics.ControllerUnhealthyStatus + // Reset trackers. Otherwise, if there is nothing in the queue then it will report the FailedHealthCheckCount every time checkHealth is called + // If checkHealth returned an error (as it is meant to), the container would be restarted and the trackers would be reset anyway + lc.enqueueTracker.Track() + lc.syncTracker.Track() } if lc.enableDualStack { metrics.PublishL4ControllerHealthCheckStatus(l4NetLBDualStackControllerName, controllerHealth)