diff --git a/pkg/neg/readiness/poller.go b/pkg/neg/readiness/poller.go
index 509e88b112..b901f658a3 100644
--- a/pkg/neg/readiness/poller.go
+++ b/pkg/neg/readiness/poller.go
@@ -162,14 +162,18 @@ func (p *poller) Poll(key negMeta) (retry bool, err error) {
 	// TODO(freehan): filter the NEs that are in interest once the API supports it
 	res, err := p.negCloud.ListNetworkEndpoints(key.Name, key.Zone /*showHealthStatus*/, true, key.SyncerKey.GetAPIVersion())
 	if err != nil {
-		// On receiving GCE API error, do not retry immediately. This is to prevent the reflector to overwhelm the GCE NEG API when
-		// rate limiting is in effect. This will prevent readiness reflector to overwhelm the GCE NEG API and cause NEG syncers to backoff.
-		// This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready
-		// until the next status poll is executed. However, the pods are not marked as Ready and still passes the LB health check will
-		// serve LB traffic. The side effect during the delay period is the workload (depending on rollout strategy) might slow down rollout.
-		// TODO(freehan): enable exponential backoff.
-		p.logger.Error(err, "Failed to ListNetworkEndpoint in NEG. Retrying after some time.", "neg", key.String(), "retryDelay", retryDelay.String())
-		<-p.clock.After(retryDelay)
+		if negtypes.IsStrategyQuotaError(err) {
+			p.logger.V(4).Error(err, "Failed to ListNetworkEndpoints in NEG", "neg", key.String())
+		} else {
+			// On receiving a GCE API error, do not retry immediately. This prevents the readiness reflector from
+			// overwhelming the GCE NEG API when rate limiting is in effect and causing NEG syncers to back off.
+			// It effectively batches NEG health status updates for 100s. Pods added into the NEG during this window
+			// are not marked Ready until the next status poll executes. However, pods that are not yet marked Ready
+			// but still pass the LB health check will serve LB traffic; the delay may slow down rollout (depending on rollout strategy).
+			// TODO(freehan): enable exponential backoff.
+			p.logger.Error(err, "Failed to ListNetworkEndpoints in NEG. Retrying after some time.", "neg", key.String(), "retryDelay", retryDelay.String())
+			<-p.clock.After(retryDelay)
+		}
 		return true, err
 	}
 
diff --git a/pkg/neg/syncers/syncer.go b/pkg/neg/syncers/syncer.go
index 964fd85519..5169572bcc 100644
--- a/pkg/neg/syncers/syncer.go
+++ b/pkg/neg/syncers/syncer.go
@@ -92,7 +92,10 @@ func (s *syncer) Start() error {
 			err := s.core.sync()
 			if err != nil {
 				metrics.PublishNegControllerErrorCountMetrics(err, false)
-				delay, retryErr := s.backoff.NextDelay()
+				delay, retryErr := time.Duration(0), error(nil)
+				if !negtypes.IsStrategyQuotaError(err) {
+					delay, retryErr = s.backoff.NextDelay()
+				}
 				retryMsg := ""
 				if retryErr == backoff.ErrRetriesExceeded {
 					retryMsg = "(will not retry)"
diff --git a/pkg/neg/syncers/transaction.go b/pkg/neg/syncers/transaction.go
index 5dc4b5e9eb..4287142973 100644
--- a/pkg/neg/syncers/transaction.go
+++ b/pkg/neg/syncers/transaction.go
@@ -251,7 +251,7 @@ func (s *transactionSyncer) syncInternalImpl() error {
 
 	currentMap, currentPodLabelMap, err := retrieveExistingZoneNetworkEndpointMap(s.NegSyncerKey.NegName, s.zoneGetter, s.cloud, s.NegSyncerKey.GetAPIVersion(), s.endpointsCalculator.Mode(), s.enableDualStackNEG)
 	if err != nil {
-		return fmt.Errorf("%w: %v", negtypes.ErrCurrentNegEPNotFound, err)
+		return fmt.Errorf("%w: %w", negtypes.ErrCurrentNegEPNotFound, err)
 	}
 	s.logStats(currentMap, "current NEG endpoints")
 
@@ -594,9 +594,13 @@ func (s *transactionSyncer) commitTransaction(err error, networkEndpointMap map[
 	}
 
 	if needRetry {
-		if retryErr := s.retry.Retry(); retryErr != nil {
-			s.recordEvent(apiv1.EventTypeWarning, "RetryFailed", fmt.Sprintf("Failed to retry NEG sync for %q: %v", s.NegSyncerKey.String(), retryErr))
-			metrics.PublishNegControllerErrorCountMetrics(retryErr, false)
+		if negtypes.IsStrategyQuotaError(err) {
+			s.syncer.Sync()
+		} else {
+			if retryErr := s.retry.Retry(); retryErr != nil {
+				s.recordEvent(apiv1.EventTypeWarning, "RetryFailed", fmt.Sprintf("Failed to retry NEG sync for %q: %v", s.NegSyncerKey.String(), retryErr))
+				metrics.PublishNegControllerErrorCountMetrics(retryErr, false)
+			}
 		}
 		return
 	}
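
Note (illustration, not part of the patch): the patch keys off negtypes.IsStrategyQuotaError to decide whether to skip the retry delay and backoff. The actual negtypes implementation is not shown in this diff; a minimal sketch of how such a predicate could work, assuming a sentinel error that a quota-aware rate-limiting strategy wraps into the errors it returns and that is matched with errors.Is, looks roughly like this:

package main

import (
	"errors"
	"fmt"
)

// ErrQuotaExceeded is a hypothetical sentinel that a quota-aware rate-limiting
// strategy would wrap into errors it returns; the real negtypes error may differ.
var ErrQuotaExceeded = errors.New("strategy quota exceeded")

// IsStrategyQuotaError reports whether err has ErrQuotaExceeded in its chain.
func IsStrategyQuotaError(err error) bool {
	return errors.Is(err, ErrQuotaExceeded)
}

func main() {
	wrapped := fmt.Errorf("listing network endpoints: %w", ErrQuotaExceeded)
	fmt.Println(IsStrategyQuotaError(wrapped))                 // true
	fmt.Println(IsStrategyQuotaError(errors.New("api error"))) // false
}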
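
Note (illustration, not part of the patch): the %v -> %w change in syncInternalImpl relies on fmt.Errorf accepting multiple %w verbs (Go 1.20+), so the underlying error stays in the wrap chain and checks such as a quota-error predicate can still match it through the negtypes.ErrCurrentNegEPNotFound wrapper. A small self-contained example with placeholder sentinels:

package main

import (
	"errors"
	"fmt"
)

// Placeholder sentinels standing in for negtypes.ErrCurrentNegEPNotFound and a quota error.
var (
	errCurrentNegEPNotFound = errors.New("current NEG endpoints not found")
	errQuotaExceeded        = errors.New("strategy quota exceeded")
)

func main() {
	// With two %w verbs (Go 1.20+) both errors remain matchable in the chain.
	double := fmt.Errorf("%w: %w", errCurrentNegEPNotFound, errQuotaExceeded)
	fmt.Println(errors.Is(double, errCurrentNegEPNotFound)) // true
	fmt.Println(errors.Is(double, errQuotaExceeded))        // true

	// With %v the inner error is flattened to text and can no longer be matched.
	single := fmt.Errorf("%w: %v", errCurrentNegEPNotFound, errQuotaExceeded)
	fmt.Println(errors.Is(single, errQuotaExceeded)) // false
}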