From 9bc00085a9a37d7bec01574b6ef76f3108ff41c2 Mon Sep 17 00:00:00 2001 From: Alex Katsman Date: Thu, 18 May 2023 18:53:32 +0200 Subject: [PATCH] Avoid double delays when using throttling strategy --- pkg/neg/readiness/poller.go | 20 ++++++++++++-------- pkg/neg/syncers/syncer.go | 5 ++++- pkg/neg/syncers/transaction.go | 10 +++++++--- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/pkg/neg/readiness/poller.go b/pkg/neg/readiness/poller.go index 36f71aff81..fbf520a365 100644 --- a/pkg/neg/readiness/poller.go +++ b/pkg/neg/readiness/poller.go @@ -161,14 +161,18 @@ func (p *poller) Poll(key negMeta) (retry bool, err error) { // TODO(freehan): filter the NEs that are in interest once the API supports it res, err := p.negCloud.ListNetworkEndpoints(key.Name, key.Zone /*showHealthStatus*/, true, key.SyncerKey.GetAPIVersion()) if err != nil { - // On receiving GCE API error, do not retry immediately. This is to prevent the reflector to overwhelm the GCE NEG API when - // rate limiting is in effect. This will prevent readiness reflector to overwhelm the GCE NEG API and cause NEG syncers to backoff. - // This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready - // until the next status poll is executed. However, the pods are not marked as Ready and still passes the LB health check will - // serve LB traffic. The side effect during the delay period is the workload (depending on rollout strategy) might slow down rollout. - // TODO(freehan): enable exponential backoff. - p.logger.Error(err, "Failed to ListNetworkEndpoint in NEG. Retrying after some time.", "neg", key.String(), "retryDelay", retryDelay.String()) - <-p.clock.After(retryDelay) + if negtypes.IsStrategyQuotaError(err) { + p.logger.V(4).Error(err, "Failed to ListNetworkEndpoints in NEG", "neg", key.String()) + } else { + // On receiving GCE API error, do not retry immediately. This is to prevent the reflector to overwhelm the GCE NEG API when + // rate limiting is in effect. This will prevent readiness reflector to overwhelm the GCE NEG API and cause NEG syncers to backoff. + // This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready + // until the next status poll is executed. However, the pods are not marked as Ready and still passes the LB health check will + // serve LB traffic. The side effect during the delay period is the workload (depending on rollout strategy) might slow down rollout. + // TODO(freehan): enable exponential backoff. + p.logger.Error(err, "Failed to ListNetworkEndpoints in NEG. Retrying after some time.", "neg", key.String(), "retryDelay", retryDelay.String()) + <-p.clock.After(retryDelay) + } return true, err } diff --git a/pkg/neg/syncers/syncer.go b/pkg/neg/syncers/syncer.go index 0736a65cb0..65fdb58d2c 100644 --- a/pkg/neg/syncers/syncer.go +++ b/pkg/neg/syncers/syncer.go @@ -90,7 +90,10 @@ func (s *syncer) Start() error { retryCh := make(<-chan time.Time) err := s.core.sync() if err != nil { - delay, retryErr := s.backoff.NextDelay() + delay, retryErr := time.Duration(0), error(nil) + if !negtypes.IsStrategyQuotaError(err) { + delay, retryErr = s.backoff.NextDelay() + } retryMsg := "" if retryErr == backoff.ErrRetriesExceeded { retryMsg = "(will not retry)" diff --git a/pkg/neg/syncers/transaction.go b/pkg/neg/syncers/transaction.go index a40e81bc64..ed0f01e7b5 100644 --- a/pkg/neg/syncers/transaction.go +++ b/pkg/neg/syncers/transaction.go @@ -250,7 +250,7 @@ func (s *transactionSyncer) syncInternalImpl() error { currentMap, currentPodLabelMap, err := retrieveExistingZoneNetworkEndpointMap(s.NegSyncerKey.NegName, s.zoneGetter, s.cloud, s.NegSyncerKey.GetAPIVersion(), s.endpointsCalculator.Mode(), s.enableDualStackNEG) if err != nil { - return fmt.Errorf("%w: %v", negtypes.ErrCurrentNegEPNotFound, err) + return fmt.Errorf("%w: %w", negtypes.ErrCurrentNegEPNotFound, err) } s.logStats(currentMap, "current NEG endpoints") @@ -592,8 +592,12 @@ func (s *transactionSyncer) commitTransaction(err error, networkEndpointMap map[ } if needRetry { - if retryErr := s.retry.Retry(); retryErr != nil { - s.recordEvent(apiv1.EventTypeWarning, "RetryFailed", fmt.Sprintf("Failed to retry NEG sync for %q: %v", s.NegSyncerKey.String(), retryErr)) + if negtypes.IsStrategyQuotaError(err) { + s.syncer.Sync() + } else { + if retryErr := s.retry.Retry(); retryErr != nil { + s.recordEvent(apiv1.EventTypeWarning, "RetryFailed", fmt.Sprintf("Failed to retry NEG sync for %q: %v", s.NegSyncerKey.String(), retryErr)) + } } return }