Skip to content

Commit

Permalink
Add L4 DualStack Sync Latency metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
panslava committed Feb 16, 2023
1 parent e8e35b5 commit 76c5640
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 3 deletions.
6 changes: 5 additions & 1 deletion pkg/l4lb/l4controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -505,11 +505,12 @@ func (l4c *L4Controller) publishMetrics(result *loadbalancers.L4ILBSyncResult, n
case loadbalancers.SyncTypeCreate, loadbalancers.SyncTypeUpdate:
klog.V(2).Infof("Internal L4 Loadbalancer for Service %s ensured, updating its state %v in metrics cache", namespacedName, result.MetricsState)
l4c.ctx.ControllerMetrics.SetL4ILBService(namespacedName, result.MetricsState)
l4metrics.PublishILBSyncMetrics(result.Error == nil, result.SyncType, result.GCEResourceInError, utils.GetErrorType(result.Error), result.StartTime)
if l4c.enableDualStack {
klog.V(2).Infof("Internal L4 DualStack Loadbalancer for Service %s ensured, updating its state %v in metrics cache", namespacedName, result.DualStackMetricsState)
l4c.ctx.ControllerMetrics.SetL4ILBDualStackService(namespacedName, result.DualStackMetricsState)
l4metrics.PublishL4ILBDualStackSyncLatency(result.Error == nil, result.SyncType, result.DualStackMetricsState.IPFamilies, result.StartTime)
}
l4metrics.PublishILBSyncMetrics(result.Error == nil, result.SyncType, result.GCEResourceInError, utils.GetErrorType(result.Error), result.StartTime)

case loadbalancers.SyncTypeDelete:
// if service is successfully deleted, remove it from cache
Expand All @@ -522,6 +523,9 @@ func (l4c *L4Controller) publishMetrics(result *loadbalancers.L4ILBSyncResult, n
}
}
l4metrics.PublishILBSyncMetrics(result.Error == nil, result.SyncType, result.GCEResourceInError, utils.GetErrorType(result.Error), result.StartTime)
if l4c.enableDualStack {
l4metrics.PublishL4ILBDualStackSyncLatency(result.Error == nil, result.SyncType, result.DualStackMetricsState.IPFamilies, result.StartTime)
}
default:
klog.Warningf("Unknown sync type %q, skipping metrics", result.SyncType)
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/l4lb/l4netlbcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -670,4 +670,7 @@ func (lc *L4NetLBController) publishSyncMetrics(result *loadbalancers.L4NetLBSyn
return
}
metrics.PublishL4NetLBSyncError(result.SyncType, result.GCEResourceInError, utils.GetErrorType(result.Error), result.StartTime)
if lc.enableDualStack {
metrics.PublishL4NetLBDualStackSyncLatency(result.Error == nil, result.SyncType, result.DualStackMetricsState.IPFamilies, result.StartTime)
}
}
45 changes: 43 additions & 2 deletions pkg/l4lb/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ const (
statusSuccess = "success"
statusError = "error"
L4ilbLatencyMetricName = "l4_ilb_sync_duration_seconds"
L4ILBDualStackLatencyMetricName = "l4_ilb_dualstack_sync_duration_seconds"
L4ilbErrorMetricName = "l4_ilb_sync_error_count"
L4netlbLatencyMetricName = "l4_netlb_sync_duration_seconds"
L4NetLBDualStackLatencyMetricName = "l4_netlb_dualstack_sync_duration_seconds"
L4netlbErrorMetricName = "l4_netlb_sync_error_count"
L4netlbLegacyToRBSMigrationPreventedMetricName = "l4_netlb_legacy_to_rbs_migration_prevented_count"
l4failedHealthCheckName = "l4_failed_healthcheck_count"
Expand All @@ -40,7 +42,8 @@ var (
"sync_result", // result of the sync
"sync_type", // whether this is a new service, update or delete
}
l4LBSyncErrorMetricLabels = []string{
l4LBDualStackSyncLatencyMetricsLabels = append(l4LBSyncLatencyMetricsLabels, "ip_families")
l4LBSyncErrorMetricLabels = []string{
"sync_type", // whether this is a new service, update or delete
"gce_resource", // The GCE resource whose update caused the error
// max number of values for error_type = 18 k8s error reasons + 60 http status errors.
Expand All @@ -57,14 +60,22 @@ var (
},
l4LBSyncLatencyMetricsLabels,
)
// l4ILBDualStackSyncLatency is a histogram of L4 ILB DualStack sync durations, in seconds.
// It is labeled with the sync result, the sync type and the service IP families.
l4ILBDualStackSyncLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: L4ILBDualStackLatencyMetricName,
Help: "Latency of an L4 ILB DualStack Sync",
// 12 exponential buckets: the first upper bound is 0.5 and each next one doubles.
Buckets: prometheus.ExponentialBuckets(0.5, 2, 12),
},
l4LBDualStackSyncLatencyMetricsLabels,
)
l4ILBSyncErrorCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: L4ilbErrorMetricName,
Help: "Count of L4 ILB Sync errors",
},
l4LBSyncErrorMetricLabels,
)
// l4ILBSyncLatency is a metric that represents the time spent processing L4NetLB service.
// l4NetLBSyncLatency is a metric that represents the time spent processing L4NetLB service.
// The metric is labeled with synchronization type and its result.
l4NetLBSyncLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Expand All @@ -76,6 +87,14 @@ var (
},
l4LBSyncLatencyMetricsLabels,
)
// l4NetLBDualStackSyncLatency is a histogram of L4 NetLB DualStack sync durations, in seconds.
// It is labeled with the sync result, the sync type and the service IP families.
l4NetLBDualStackSyncLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: L4NetLBDualStackLatencyMetricName,
Help: "Latency of an L4 NetLB DualStack Sync",
// 12 exponential buckets: the first upper bound is 0.5 and each next one doubles.
Buckets: prometheus.ExponentialBuckets(0.5, 2, 12),
},
l4LBDualStackSyncLatencyMetricsLabels,
)
// l4NetLBSyncErrorCount is a metric that counts number of L4NetLB services in Error state.
// The metric is labeled with synchronization type, the type of error and the name of gce resource that is in error.
l4NetLBSyncErrorCount = prometheus.NewCounterVec(
Expand Down Expand Up @@ -112,8 +131,12 @@ var (
func init() {
klog.V(3).Infof("Registering L4 ILB controller metrics %v, %v", l4ILBSyncLatency, l4ILBSyncErrorCount)
prometheus.MustRegister(l4ILBSyncLatency, l4ILBSyncErrorCount)
klog.V(3).Infof("Registering L4 ILB DualStack controller metrics %v", l4ILBDualStackSyncLatency)
prometheus.MustRegister(l4ILBDualStackSyncLatency)
klog.V(3).Infof("Registering L4 NetLB controller metrics %v, %v", l4NetLBSyncLatency, l4NetLBSyncErrorCount)
prometheus.MustRegister(l4NetLBSyncLatency, l4NetLBSyncErrorCount)
klog.V(3).Infof("Registering L4 NetLB DualStack controller metrics %v", l4NetLBDualStackSyncLatency)
prometheus.MustRegister(l4NetLBDualStackSyncLatency)
klog.V(3).Infof("Registering L4 healthcheck failures count metric: %v", l4FailedHealthCheckCount)
prometheus.MustRegister(l4FailedHealthCheckCount)
klog.V(3).Infof("Registering L4 controller healthcheck metric: %v", l4ControllerHealthCheck)
Expand All @@ -137,6 +160,15 @@ func publishL4ILBSyncLatency(success bool, syncType string, startTime time.Time)
l4ILBSyncLatency.WithLabelValues(status, syncType).Observe(time.Since(startTime).Seconds())
}

// PublishL4ILBDualStackSyncLatency exports the given sync latency datapoint.
func PublishL4ILBDualStackSyncLatency(success bool, syncType, ipFamilies string, startTime time.Time) {
status := statusSuccess
if !success {
status = statusError
}
l4ILBDualStackSyncLatency.WithLabelValues(status, syncType, ipFamilies).Observe(time.Since(startTime).Seconds())
}

// publishL4ILBSyncErrorCount increments the L4 ILB sync error counter for the given labels.
func publishL4ILBSyncErrorCount(syncType, gceResource, errorType string) {
l4ILBSyncErrorCount.WithLabelValues(syncType, gceResource, errorType).Inc()
Expand All @@ -147,6 +179,15 @@ func PublishL4NetLBSyncSuccess(syncType string, startTime time.Time) {
l4NetLBSyncLatency.WithLabelValues(statusSuccess, syncType).Observe(time.Since(startTime).Seconds())
}

// PublishL4NetLBDualStackSyncLatency exports the given sync latency datapoint.
func PublishL4NetLBDualStackSyncLatency(success bool, syncType, ipFamilies string, startTime time.Time) {
status := statusSuccess
if !success {
status = statusError
}
l4NetLBDualStackSyncLatency.WithLabelValues(status, syncType, ipFamilies).Observe(time.Since(startTime).Seconds())
}

// PublishL4NetLBSyncError exports latency and error count metrics for L4 NetLB after error sync.
func PublishL4NetLBSyncError(syncType, gceResource, errType string, startTime time.Time) {
l4NetLBSyncLatency.WithLabelValues(statusError, syncType).Observe(time.Since(startTime).Seconds())
Expand Down

0 comments on commit 76c5640

Please sign in to comment.