Skip to content

Commit

Permalink
Avoid double delays when using throttling strategy
Browse files Browse the repository at this point in the history
  • Loading branch information
Alex Katsman committed May 18, 2023
1 parent 04e089b commit 9bc0008
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 12 deletions.
20 changes: 12 additions & 8 deletions pkg/neg/readiness/poller.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,18 @@ func (p *poller) Poll(key negMeta) (retry bool, err error) {
// TODO(freehan): filter the NEs that are in interest once the API supports it
res, err := p.negCloud.ListNetworkEndpoints(key.Name, key.Zone /*showHealthStatus*/, true, key.SyncerKey.GetAPIVersion())
if err != nil {
// On receiving GCE API error, do not retry immediately. This is to prevent the reflector to overwhelm the GCE NEG API when
// rate limiting is in effect. This will prevent readiness reflector to overwhelm the GCE NEG API and cause NEG syncers to backoff.
// This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready
// until the next status poll is executed. However, the pods are not marked as Ready and still passes the LB health check will
// serve LB traffic. The side effect during the delay period is the workload (depending on rollout strategy) might slow down rollout.
// TODO(freehan): enable exponential backoff.
p.logger.Error(err, "Failed to ListNetworkEndpoint in NEG. Retrying after some time.", "neg", key.String(), "retryDelay", retryDelay.String())
<-p.clock.After(retryDelay)
if negtypes.IsStrategyQuotaError(err) {
p.logger.V(4).Error(err, "Failed to ListNetworkEndpoints in NEG", "neg", key.String())
} else {
// On receiving GCE API error, do not retry immediately. This is to prevent the reflector to overwhelm the GCE NEG API when
// rate limiting is in effect. This will prevent readiness reflector to overwhelm the GCE NEG API and cause NEG syncers to backoff.
// This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready
// until the next status poll is executed. However, the pods are not marked as Ready and still passes the LB health check will
// serve LB traffic. The side effect during the delay period is the workload (depending on rollout strategy) might slow down rollout.
// TODO(freehan): enable exponential backoff.
p.logger.Error(err, "Failed to ListNetworkEndpoints in NEG. Retrying after some time.", "neg", key.String(), "retryDelay", retryDelay.String())
<-p.clock.After(retryDelay)
}
return true, err
}

Expand Down
5 changes: 4 additions & 1 deletion pkg/neg/syncers/syncer.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,10 @@ func (s *syncer) Start() error {
retryCh := make(<-chan time.Time)
err := s.core.sync()
if err != nil {
delay, retryErr := s.backoff.NextDelay()
delay, retryErr := time.Duration(0), error(nil)
if !negtypes.IsStrategyQuotaError(err) {
delay, retryErr = s.backoff.NextDelay()
}
retryMsg := ""
if retryErr == backoff.ErrRetriesExceeded {
retryMsg = "(will not retry)"
Expand Down
10 changes: 7 additions & 3 deletions pkg/neg/syncers/transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ func (s *transactionSyncer) syncInternalImpl() error {

currentMap, currentPodLabelMap, err := retrieveExistingZoneNetworkEndpointMap(s.NegSyncerKey.NegName, s.zoneGetter, s.cloud, s.NegSyncerKey.GetAPIVersion(), s.endpointsCalculator.Mode(), s.enableDualStackNEG)
if err != nil {
return fmt.Errorf("%w: %v", negtypes.ErrCurrentNegEPNotFound, err)
return fmt.Errorf("%w: %w", negtypes.ErrCurrentNegEPNotFound, err)
}
s.logStats(currentMap, "current NEG endpoints")

Expand Down Expand Up @@ -592,8 +592,12 @@ func (s *transactionSyncer) commitTransaction(err error, networkEndpointMap map[
}

if needRetry {
if retryErr := s.retry.Retry(); retryErr != nil {
s.recordEvent(apiv1.EventTypeWarning, "RetryFailed", fmt.Sprintf("Failed to retry NEG sync for %q: %v", s.NegSyncerKey.String(), retryErr))
if negtypes.IsStrategyQuotaError(err) {
s.syncer.Sync()
} else {
if retryErr := s.retry.Retry(); retryErr != nil {
s.recordEvent(apiv1.EventTypeWarning, "RetryFailed", fmt.Sprintf("Failed to retry NEG sync for %q: %v", s.NegSyncerKey.String(), retryErr))
}
}
return
}
Expand Down

0 comments on commit 9bc0008

Please sign in to comment.