diff --git a/pkg/l4lb/l4controller.go b/pkg/l4lb/l4controller.go index d0ba85ad24..d1c36ff5cc 100644 --- a/pkg/l4lb/l4controller.go +++ b/pkg/l4lb/l4controller.go @@ -45,6 +45,7 @@ import ( const ( // The max tolerated delay between update being enqueued and sync being invoked. enqueueToSyncDelayThreshold = 15 * time.Minute + l4ILBControllerName = "l4-ilb-subsetting-controller" ) // L4Controller manages the create/update delete of all L4 Internal LoadBalancer services. @@ -132,7 +133,7 @@ func NewILBController(ctx *context.ControllerContext, stopCh chan struct{}) *L4C }) // TODO enhance this by looking at some metric from service controller to ensure it is up. // We cannot use existence of a backend service or other resource, since those are on a per-service basis. - ctx.AddHealthCheck("l4-ilb-subsetting-controller", l4c.checkHealth) + ctx.AddHealthCheck(l4ILBControllerName, l4c.checkHealth) return l4c } @@ -146,7 +147,7 @@ func (l4c *L4Controller) checkHealth() error { msg := fmt.Sprintf("L4 ILB Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold) // Log here, context/http handler do no log the error. 
klog.Error(msg) - return fmt.Errorf(msg) + l4metrics.PublishL4FailedHealthCheckCount(l4ILBControllerName) } return nil } diff --git a/pkg/l4lb/l4netlbcontroller.go b/pkg/l4lb/l4netlbcontroller.go index 4818f8f6c9..2182ebb077 100644 --- a/pkg/l4lb/l4netlbcontroller.go +++ b/pkg/l4lb/l4netlbcontroller.go @@ -40,6 +40,8 @@ import ( "k8s.io/klog" ) +const l4NetLBControllerName = "l4netlb-controller" + type L4NetLBController struct { ctx *context.ControllerContext svcQueue utils.TaskQueue @@ -109,7 +111,7 @@ func NewL4NetLBController( } }, }) - ctx.AddHealthCheck("l4netlb-controller", l4netLBc.checkHealth) + ctx.AddHealthCheck(l4NetLBControllerName, l4netLBc.checkHealth) return l4netLBc } @@ -265,7 +267,7 @@ func (lc *L4NetLBController) checkHealth() error { msg := fmt.Sprintf("L4 External LoadBalancer Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold) // Log here, context/http handler do no log the error. klog.Error(msg) - return fmt.Errorf(msg) + l4metrics.PublishL4FailedHealthCheckCount(l4NetLBControllerName) } return nil } diff --git a/pkg/l4lb/metrics/metrics.go b/pkg/l4lb/metrics/metrics.go index fcd44e93bd..0bd7629e47 100644 --- a/pkg/l4lb/metrics/metrics.go +++ b/pkg/l4lb/metrics/metrics.go @@ -30,6 +30,7 @@ const ( L4ilbErrorMetricName = "l4_ilb_sync_error_count" L4netlbLatencyMetricName = "l4_netlb_sync_duration_seconds" L4netlbErrorMetricName = "l4_netlb_sync_error_count" + l4failedHealthCheckName = "l4_failed_healthcheck_count" ) var ( @@ -80,6 +81,13 @@ var ( }, l4LBSyncErrorMetricLabels, ) + l4FailedHealthCheckCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: l4failedHealthCheckName, + Help: "Count l4 controller healthcheck failures", + }, + []string{"controller_name"}, + ) ) // init registers l4 ilb nad netlb sync metrics. 
@@ -88,6 +96,8 @@ func init() { prometheus.MustRegister(l4ILBSyncLatency, l4ILBSyncErrorCount) klog.V(3).Infof("Registering L4 NetLB controller metrics %v, %v", l4NetLBSyncLatency, l4NetLBSyncErrorCount) prometheus.MustRegister(l4NetLBSyncLatency, l4NetLBSyncErrorCount) + klog.V(3).Infof("Registering L4 healthcheck failures count metric: %v", l4FailedHealthCheckCount) + prometheus.MustRegister(l4FailedHealthCheckCount) } // PublishL4ILBSyncMetrics exports metrics related to the L4 ILB sync. @@ -133,3 +143,8 @@ func publishL4NetLBSyncLatency(success bool, syncType string, startTime time.Tim func publishL4NetLBSyncErrorCount(syncType, gceResource, errorType string) { l4NetLBSyncErrorCount.WithLabelValues(syncType, gceResource, errorType).Inc() } + +// PublishL4FailedHealthCheckCount observes a failed health check from the controller. +func PublishL4FailedHealthCheckCount(controllerName string) { + l4FailedHealthCheckCount.WithLabelValues(controllerName).Inc() +}