diff --git a/pkg/l4lb/l4controller.go b/pkg/l4lb/l4controller.go index 9ce7ff62ae..df459744cd 100644 --- a/pkg/l4lb/l4controller.go +++ b/pkg/l4lb/l4controller.go @@ -499,11 +499,12 @@ func (l4c *L4Controller) publishMetrics(result *loadbalancers.L4ILBSyncResult, n case loadbalancers.SyncTypeCreate, loadbalancers.SyncTypeUpdate: klog.V(6).Infof("Internal L4 Loadbalancer for Service %s ensured, updating its state %v in metrics cache", namespacedName, result.MetricsState) l4c.ctx.ControllerMetrics.SetL4ILBService(namespacedName, result.MetricsState) + l4metrics.PublishILBSyncMetrics(result.Error == nil, result.SyncType, result.GCEResourceInError, utils.GetErrorType(result.Error), result.StartTime) if l4c.enableDualStack { klog.V(6).Infof("Internal L4 DualStack Loadbalancer for Service %s ensured, updating its state %v in metrics cache", namespacedName, result.MetricsState) l4c.ctx.ControllerMetrics.SetL4ILBDualStackService(namespacedName, result.DualStackMetricsState) + l4metrics.PublishL4ILBDualStackSyncLatency(result.Error == nil, result.SyncType, result.DualStackMetricsState.IPFamilies, result.StartTime) } - l4metrics.PublishILBSyncMetrics(result.Error == nil, result.SyncType, result.GCEResourceInError, utils.GetErrorType(result.Error), result.StartTime) case loadbalancers.SyncTypeDelete: // if service is successfully deleted, remove it from cache @@ -516,6 +517,9 @@ func (l4c *L4Controller) publishMetrics(result *loadbalancers.L4ILBSyncResult, n } } l4metrics.PublishILBSyncMetrics(result.Error == nil, result.SyncType, result.GCEResourceInError, utils.GetErrorType(result.Error), result.StartTime) + if l4c.enableDualStack { + l4metrics.PublishL4ILBDualStackSyncLatency(result.Error == nil, result.SyncType, result.DualStackMetricsState.IPFamilies, result.StartTime) + } default: klog.Warningf("Unknown sync type %q, skipping metrics", result.SyncType) } diff --git a/pkg/l4lb/metrics/metrics.go b/pkg/l4lb/metrics/metrics.go index 274696ef48..57f4a64c19 100644 --- 
a/pkg/l4lb/metrics/metrics.go +++ b/pkg/l4lb/metrics/metrics.go @@ -27,6 +27,7 @@ const ( statusSuccess = "success" statusError = "error" L4ilbLatencyMetricName = "l4_ilb_sync_duration_seconds" + L4ILBDualStackLatencyMetricName = "l4_ilb_dualstack_sync_duration_seconds" L4ilbErrorMetricName = "l4_ilb_sync_error_count" L4netlbLatencyMetricName = "l4_netlb_sync_duration_seconds" L4netlbErrorMetricName = "l4_netlb_sync_error_count" @@ -39,7 +40,8 @@ var ( "sync_result", // result of the sync "sync_type", // whether this is a new service, update or delete } - l4LBSyncErrorMetricLabels = []string{ + l4LBDualStackSyncLatencyMetricsLabels = append(l4LBSyncLatencyMetricsLabels, "ipfamilies") + l4LBSyncErrorMetricLabels = []string{ "sync_type", // whether this is a new service, update or delete "gce_resource", // The GCE resource whose update caused the error // max number of values for error_type = 18 k8s error reasons + 60 http status errors. @@ -56,6 +58,16 @@ var ( }, l4LBSyncLatencyMetricsLabels, ) + l4ILBDualStackSyncLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: L4ILBDualStackLatencyMetricName, + Help: "Latency of an L4 ILB DualStack Sync", + // custom buckets (12 exponential bounds, start 0.9375s, factor 2) - [0.9375s, 1.875s, 3.75s, 7.5s, 15s, 30s, 60s, 120s, 240s(4min), 480s(8min), 960s(16min), 1920s(32min), +Inf] + // using funny starter bucket, 0.9375s will only add buckets to existing metric, this is a safe operation in most time series db + Buckets: prometheus.ExponentialBuckets(0.9375, 2, 12), + }, + l4LBDualStackSyncLatencyMetricsLabels, + ) l4ILBSyncErrorCount = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: L4ilbErrorMetricName, @@ -63,7 +75,7 @@ var ( }, l4LBSyncErrorMetricLabels, ) - // l4ILBSyncLatency is a metric that represents the time spent processing L4NetLB service. + // l4NetLBSyncLatency is a metric that represents the time spent processing L4NetLB service. // The metric is labeled with synchronization type and its result. 
l4NetLBSyncLatency = prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -104,6 +116,8 @@ var ( func init() { klog.V(3).Infof("Registering L4 ILB controller metrics %v, %v", l4ILBSyncLatency, l4ILBSyncErrorCount) prometheus.MustRegister(l4ILBSyncLatency, l4ILBSyncErrorCount) + klog.V(3).Infof("Registering L4 ILB DualStack controller metrics %v", l4ILBDualStackSyncLatency) + prometheus.MustRegister(l4ILBDualStackSyncLatency) klog.V(3).Infof("Registering L4 NetLB controller metrics %v, %v", l4NetLBSyncLatency, l4NetLBSyncErrorCount) prometheus.MustRegister(l4NetLBSyncLatency, l4NetLBSyncErrorCount) klog.V(3).Infof("Registering L4 healthcheck failures count metric: %v", l4FailedHealthCheckCount) @@ -127,6 +141,15 @@ func publishL4ILBSyncLatency(success bool, syncType string, startTime time.Time) l4ILBSyncLatency.WithLabelValues(status, syncType).Observe(time.Since(startTime).Seconds()) } +// PublishL4ILBDualStackSyncLatency observes the seconds elapsed since startTime in the L4 ILB DualStack sync latency histogram, labeled with the sync result (success or error), syncType and ipFamilies. +func PublishL4ILBDualStackSyncLatency(success bool, syncType, ipFamilies string, startTime time.Time) { + status := statusSuccess + if !success { + status = statusError + } + l4ILBDualStackSyncLatency.WithLabelValues(status, syncType, ipFamilies).Observe(time.Since(startTime).Seconds()) +} + // publishL4ILBSyncLatency exports the given sync latency datapoint. func publishL4ILBSyncErrorCount(syncType, gceResource, errorType string) { l4ILBSyncErrorCount.WithLabelValues(syncType, gceResource, errorType).Inc()