Skip to content

Commit

Permalink
Do not restart controller after L4 healthcheck failure, publish metri…
Browse files Browse the repository at this point in the history
…c instead

After discussion offline we decide that restarting controller on single healthckeck failure may be too invasive and reduce the reliability. This PR undoes the kubernetes#1683 and replaces healthcheck failure error with metric to count the errors.
  • Loading branch information
cezarygerard committed Apr 13, 2022
1 parent e9fec8e commit 5439667
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 4 deletions.
5 changes: 3 additions & 2 deletions pkg/l4lb/l4controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import (
const (
// The max tolerated delay between update being enqueued and sync being invoked.
enqueueToSyncDelayThreshold = 15 * time.Minute
l4ILBControllerName = "l4-ilb-subsetting-controller"
)

// L4Controller manages the create/update delete of all L4 Internal LoadBalancer services.
Expand Down Expand Up @@ -132,7 +133,7 @@ func NewILBController(ctx *context.ControllerContext, stopCh chan struct{}) *L4C
})
// TODO enhance this by looking at some metric from service controller to ensure it is up.
// We cannot use existence of a backend service or other resource, since those are on a per-service basis.
ctx.AddHealthCheck("l4-ilb-subsetting-controller", l4c.checkHealth)
ctx.AddHealthCheck(l4ILBControllerName, l4c.checkHealth)
return l4c
}

Expand All @@ -146,7 +147,7 @@ func (l4c *L4Controller) checkHealth() error {
msg := fmt.Sprintf("L4 ILB Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold)
// Log here, context/http handler do no log the error.
klog.Error(msg)
return fmt.Errorf(msg)
l4metrics.PublishL4FailedHealthCheckCount(l4ILBControllerName)
}
return nil
}
Expand Down
6 changes: 4 additions & 2 deletions pkg/l4lb/l4netlbcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ import (
"k8s.io/klog"
)

const l4NetLBControllerName = "l4netlb-controller"

type L4NetLBController struct {
ctx *context.ControllerContext
svcQueue utils.TaskQueue
Expand Down Expand Up @@ -109,7 +111,7 @@ func NewL4NetLBController(
}
},
})
ctx.AddHealthCheck("l4netlb-controller", l4netLBc.checkHealth)
ctx.AddHealthCheck(l4NetLBControllerName, l4netLBc.checkHealth)
return l4netLBc
}

Expand Down Expand Up @@ -265,7 +267,7 @@ func (lc *L4NetLBController) checkHealth() error {
msg := fmt.Sprintf("L4 External LoadBalancer Sync happened at time %v - %v after enqueue time, threshold is %v", lastSyncTime, lastSyncTime.Sub(lastEnqueueTime), enqueueToSyncDelayThreshold)
// Log here, context/http handler do no log the error.
klog.Error(msg)
return fmt.Errorf(msg)
l4metrics.PublishL4FailedHealthCheckCount(l4NetLBControllerName)
}
return nil
}
Expand Down
15 changes: 15 additions & 0 deletions pkg/l4lb/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const (
L4ilbErrorMetricName = "l4_ilb_sync_error_count"
L4netlbLatencyMetricName = "l4_netlb_sync_duration_seconds"
L4netlbErrorMetricName = "l4_netlb_sync_error_count"
l4failedHealthCheckName = "l4_failed_healthcheck_count"
)

var (
Expand Down Expand Up @@ -80,6 +81,13 @@ var (
},
l4LBSyncErrorMetricLabels,
)
l4FailedHealthCheckCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: l4failedHealthCheckName,
Help: "Count l4 controller healthcheck failures",
},
[]string{"controller_name"},
)
)

// init registers l4 ilb nad netlb sync metrics.
Expand All @@ -88,6 +96,8 @@ func init() {
prometheus.MustRegister(l4ILBSyncLatency, l4ILBSyncErrorCount)
klog.V(3).Infof("Registering L4 NetLB controller metrics %v, %v", l4NetLBSyncLatency, l4NetLBSyncErrorCount)
prometheus.MustRegister(l4NetLBSyncLatency, l4NetLBSyncErrorCount)
klog.V(3).Infof("Registering L4 healthcheck failures count metric: %v", l4FailedHealthCheckCount)
prometheus.MustRegister(l4FailedHealthCheckCount)
}

// PublishL4ILBSyncMetrics exports metrics related to the L4 ILB sync.
Expand Down Expand Up @@ -133,3 +143,8 @@ func publishL4NetLBSyncLatency(success bool, syncType string, startTime time.Tim
func publishL4NetLBSyncErrorCount(syncType, gceResource, errorType string) {
l4NetLBSyncErrorCount.WithLabelValues(syncType, gceResource, errorType).Inc()
}

// PublishL4FailedHealthCheckCount observers failed healt check from controller.
func PublishL4FailedHealthCheckCount(controllerName string) {
l4FailedHealthCheckCount.WithLabelValues(controllerName).Inc()
}

0 comments on commit 5439667

Please sign in to comment.