Skip to content

Commit

Permalink
Add metrics to track endpointslice staleness
Browse files Browse the repository at this point in the history
Added metrics to track the sync staleness of endpoint slices, where
staleness is defined as the time elapsed since a slice was last processed.
  • Loading branch information
sawsa307 committed Feb 13, 2023
1 parent b68db51 commit 91de2da
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 1 deletion.
17 changes: 17 additions & 0 deletions pkg/neg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const (
negOpLatencyKey = "neg_operation_duration_seconds"
negOpEndpointsKey = "neg_operation_endpoints"
lastSyncTimestampKey = "sync_timestamp"
epsStalenessKey = "endpointslice_staleness"

resultSuccess = "success"
resultError = "error"
Expand Down Expand Up @@ -127,6 +128,17 @@ var (
Help: "The timestamp of the last execution of NEG controller sync loop.",
},
)

// EPSStaleness is a histogram that tracks, for every endpoint slice, how long
// it has been since the slice was last processed by a syncer. Samples are
// recorded in seconds through PublishNegEPSStalenessMetrics.
EPSStaleness = prometheus.NewHistogram(
prometheus.HistogramOpts{
Subsystem: negControllerSubsystem,
Name: epsStalenessKey,
Help: "The duration for an endpoint slice since it was last processed by syncer",
// Custom buckets: 13 upper bounds that double from 1s up to 4096s -
// [1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s(~4min), 512s(~8min),
// 1024s(~17min), 2048s(~34min), 4096s(~68min)]. The +Inf bucket is not
// part of this list; Prometheus adds it implicitly.
Buckets: prometheus.ExponentialBuckets(1, 2, 13),
},
)
)

var register sync.Once
Expand All @@ -139,6 +151,7 @@ func RegisterMetrics() {
prometheus.MustRegister(SyncerSyncLatency)
prometheus.MustRegister(LastSyncTimestamp)
prometheus.MustRegister(InitializationLatency)
prometheus.MustRegister(EPSStaleness)

RegisterSyncerMetrics()
})
Expand Down Expand Up @@ -170,6 +183,10 @@ func PublishNegInitializationMetrics(latency time.Duration) {
InitializationLatency.Observe(latency.Seconds())
}

// PublishNegEPSStalenessMetrics records a single endpoint slice staleness
// sample in the EPSStaleness histogram. The duration is converted to seconds
// before it is observed.
func PublishNegEPSStalenessMetrics(epsStaleness time.Duration) {
	seconds := epsStaleness.Seconds()
	EPSStaleness.Observe(seconds)
}

func getResult(err error) string {
if err != nil {
return resultError
Expand Down
22 changes: 21 additions & 1 deletion pkg/neg/syncers/transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,29 @@ func (s *transactionSyncer) syncInternalImpl() error {
return nil
}
endpointSlices := make([]*discovery.EndpointSlice, len(slices))
negCR, err := getNegFromStore(s.svcNegLister, s.Namespace, s.NegSyncerKey.NegName)
if err != nil {
s.logger.Error(err, "unable to retrieve neg from the store", "neg", klog.KRef(s.Namespace, s.NegName))
return err
}
for i, slice := range slices {
endpointSlices[i] = slice.(*discovery.EndpointSlice)
endpointslice := slice.(*discovery.EndpointSlice)
endpointSlices[i] = endpointslice

var epsStaleness time.Duration
lastSyncTimestamp := negCR.Status.LastSyncTime
epsCreationTimestamp := endpointslice.ObjectMeta.CreationTimestamp

// if this endpoint slice is newly created/created after last sync
if lastSyncTimestamp.Before(&epsCreationTimestamp) {
epsStaleness = time.Since(epsCreationTimestamp.Time)
} else {
epsStaleness = time.Since(lastSyncTimestamp.Time)
}
metrics.PublishNegEPSStalenessMetrics(epsStaleness)
s.logger.V(3).Info("Endpoint slice syncs", "Namespace", endpointslice.Namespace, "Name", endpointslice.Name, "staleness", epsStaleness)
}

endpointsData := negtypes.EndpointsDataFromEndpointSlices(endpointSlices)
targetMap, endpointPodMap, dupCount, err = s.endpointsCalculator.CalculateEndpoints(endpointsData, currentMap)
if !s.isValidEPField(err) || !s.isValidEndpointInfo(endpointsData, endpointPodMap, dupCount) {
Expand Down

0 comments on commit 91de2da

Please sign in to comment.