From b1304fbd8e3d5228a3eefb266a1a60f5ad77440b Mon Sep 17 00:00:00 2001
From: Cezary Zawadka <czawadka@google.com>
Date: Wed, 15 Feb 2023 10:54:50 +0100
Subject: [PATCH] improve L4FailedHealthCheckCount metric (do not report
 the same case multiple times)

---
 pkg/l4lb/l4controller.go      | 6 +++++-
 pkg/l4lb/l4netlbcontroller.go | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/pkg/l4lb/l4controller.go b/pkg/l4lb/l4controller.go
index dd06818091..4e29d8433b 100644
--- a/pkg/l4lb/l4controller.go
+++ b/pkg/l4lb/l4controller.go
@@ -153,11 +153,15 @@ func (l4c *L4Controller) checkHealth() error {
 	syncTimeLatest := lastEnqueueTime.Add(enqueueToSyncDelayThreshold)
 	controllerHealth := l4metrics.ControllerHealthyStatus
 	if lastSyncTime.After(syncTimeLatest) {
-		msg := fmt.Sprintf("L4 ILB Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold)
+		msg := fmt.Sprintf("L4 ILB Sync happened at time %v, %v after enqueue time, last enqueue time %v, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), lastEnqueueTime, enqueueToSyncDelayThreshold)
 		// Log here, context/http handler do no log the error.
 		klog.Error(msg)
 		l4metrics.PublishL4FailedHealthCheckCount(l4ILBControllerName)
 		controllerHealth = l4metrics.ControllerUnhealthyStatus
+		// Reset trackers. Otherwise, if there is nothing in the queue, FailedHealthCheckCount would be reported every time checkHealth is called.
+		// If checkHealth returned an error (as it is meant to), the container would be restarted and the trackers would be reset anyway.
+		l4c.enqueueTracker.Track()
+		l4c.syncTracker.Track()
 	}
 	if l4c.enableDualStack {
 		l4metrics.PublishL4ControllerHealthCheckStatus(l4ILBDualStackControllerName, controllerHealth)
diff --git a/pkg/l4lb/l4netlbcontroller.go b/pkg/l4lb/l4netlbcontroller.go
index 8c528a766a..4230b05e12 100644
--- a/pkg/l4lb/l4netlbcontroller.go
+++ b/pkg/l4lb/l4netlbcontroller.go
@@ -359,11 +359,15 @@ func (lc *L4NetLBController) checkHealth() error {
 	syncTimeLatest := lastEnqueueTime.Add(enqueueToSyncDelayThreshold)
 	controllerHealth := metrics.ControllerHealthyStatus
 	if lastSyncTime.After(syncTimeLatest) {
-		msg := fmt.Sprintf("L4 External LoadBalancer Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold)
+		msg := fmt.Sprintf("L4 NetLB Sync happened at time %v, %v after enqueue time, last enqueue time %v, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), lastEnqueueTime, enqueueToSyncDelayThreshold)
 		// Log here, context/http handler do no log the error.
 		klog.Error(msg)
 		metrics.PublishL4FailedHealthCheckCount(l4NetLBControllerName)
 		controllerHealth = metrics.ControllerUnhealthyStatus
+		// Reset trackers. Otherwise, if there is nothing in the queue, FailedHealthCheckCount would be reported every time checkHealth is called.
+		// If checkHealth returned an error (as it is meant to), the container would be restarted and the trackers would be reset anyway.
+		lc.enqueueTracker.Track()
+		lc.syncTracker.Track()
 	}
 	if lc.enableDualStack {
 		metrics.PublishL4ControllerHealthCheckStatus(l4NetLBDualStackControllerName, controllerHealth)