From 2b95524c67202166769b1f24fc10a4db953da03b Mon Sep 17 00:00:00 2001
From: Cezary Zawadka <czawadka@google.com>
Date: Mon, 11 Apr 2022 16:18:33 +0200
Subject: [PATCH] Do not restart controller after L4 healthcheck failure,
 publish metric instead

After discussion offline we decide that restarting controller on single healthckeck failure may be too invasive and reduce the reliability. This PR undoes the https://github.com/kubernetes/ingress-gce/pull/1683 and replaces healthcheck failure error with metric to count the errors.
---
 pkg/l4lb/l4controller.go      |  5 +++--
 pkg/l4lb/l4netlbcontroller.go |  6 ++++--
 pkg/l4lb/metrics/metrics.go   | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/pkg/l4lb/l4controller.go b/pkg/l4lb/l4controller.go
index d0ba85ad24..f08b0650a8 100644
--- a/pkg/l4lb/l4controller.go
+++ b/pkg/l4lb/l4controller.go
@@ -45,6 +45,7 @@ import (
 const (
 	// The max tolerated delay between update being enqueued and sync being invoked.
 	enqueueToSyncDelayThreshold = 15 * time.Minute
+	l4ILBLBControllerName       = "l4-ilb-subsetting-controller"
 )
 
 // L4Controller manages the create/update delete of all L4 Internal LoadBalancer services.
@@ -132,7 +133,7 @@ func NewILBController(ctx *context.ControllerContext, stopCh chan struct{}) *L4C
 	})
 	// TODO enhance this by looking at some metric from service controller to ensure it is up.
 	// We cannot use existence of a backend service or other resource, since those are on a per-service basis.
-	ctx.AddHealthCheck("l4-ilb-subsetting-controller", l4c.checkHealth)
+	ctx.AddHealthCheck(l4ILBLBControllerName, l4c.checkHealth)
 	return l4c
 }
 
@@ -146,7 +147,7 @@ func (l4c *L4Controller) checkHealth() error {
 		msg := fmt.Sprintf("L4 ILB Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold)
 		// Log here, context/http handler do no log the error.
 		klog.Error(msg)
-		return fmt.Errorf(msg)
+		l4metrics.PublishLL4FailedHealthCheckCount(l4ILBLBControllerName)
 	}
 	return nil
 }
diff --git a/pkg/l4lb/l4netlbcontroller.go b/pkg/l4lb/l4netlbcontroller.go
index 4818f8f6c9..63b47c0c52 100644
--- a/pkg/l4lb/l4netlbcontroller.go
+++ b/pkg/l4lb/l4netlbcontroller.go
@@ -40,6 +40,8 @@ import (
 	"k8s.io/klog"
 )
 
+const l4NetLBControllerName = "l4netlb-controller"
+
 type L4NetLBController struct {
 	ctx           *context.ControllerContext
 	svcQueue      utils.TaskQueue
@@ -109,7 +111,7 @@ func NewL4NetLBController(
 			}
 		},
 	})
-	ctx.AddHealthCheck("l4netlb-controller", l4netLBc.checkHealth)
+	ctx.AddHealthCheck(l4NetLBControllerName, l4netLBc.checkHealth)
 	return l4netLBc
 }
 
@@ -265,7 +267,7 @@ func (lc *L4NetLBController) checkHealth() error {
 		msg := fmt.Sprintf("L4 External LoadBalancer Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold)
 		// Log here, context/http handler do no log the error.
 		klog.Error(msg)
-		return fmt.Errorf(msg)
+		l4metrics.PublishLL4FailedHealthCheckCount(l4NetLBControllerName)
 	}
 	return nil
 }
diff --git a/pkg/l4lb/metrics/metrics.go b/pkg/l4lb/metrics/metrics.go
index fcd44e93bd..0879b9acbd 100644
--- a/pkg/l4lb/metrics/metrics.go
+++ b/pkg/l4lb/metrics/metrics.go
@@ -30,6 +30,7 @@ const (
 	L4ilbErrorMetricName     = "l4_ilb_sync_error_count"
 	L4netlbLatencyMetricName = "l4_netlb_sync_duration_seconds"
 	L4netlbErrorMetricName   = "l4_netlb_sync_error_count"
+	l4failedHealthCheckName  = "l4_failed_healthcheck_count"
 )
 
 var (
@@ -80,6 +81,13 @@ var (
 		},
 		l4LBSyncErrorMetricLabels,
 	)
+	l4FailedHealthCheckCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: l4failedHealthCheckName,
+			Help: "Count l4 controller healthcheck failures",
+		},
+		[]string{"controller_name"},
+	)
 )
 
 // init registers l4 ilb nad netlb sync metrics.
@@ -88,6 +96,8 @@ func init() {
 	prometheus.MustRegister(l4ILBSyncLatency, l4ILBSyncErrorCount)
 	klog.V(3).Infof("Registering L4 NetLB controller metrics %v, %v", l4NetLBSyncLatency, l4NetLBSyncErrorCount)
 	prometheus.MustRegister(l4NetLBSyncLatency, l4NetLBSyncErrorCount)
+	klog.V(3).Infof("Registering L4  healthcheck failures count metric: %v", l4FailedHealthCheckCount)
+	prometheus.MustRegister(l4FailedHealthCheckCount)
 }
 
 // PublishL4ILBSyncMetrics exports metrics related to the L4 ILB sync.
@@ -133,3 +143,8 @@ func publishL4NetLBSyncLatency(success bool, syncType string, startTime time.Tim
 func publishL4NetLBSyncErrorCount(syncType, gceResource, errorType string) {
 	l4NetLBSyncErrorCount.WithLabelValues(syncType, gceResource, errorType).Inc()
 }
+
+// publishLL4FailedHealthCheckCount observers failed healt check from controller.
+func PublishLL4FailedHealthCheckCount(controllerName string) {
+	l4FailedHealthCheckCount.WithLabelValues(controllerName).Inc()
+}