Avoid double delays when using throttling strategy
Alex Katsman committed Jun 19, 2023
1 parent ea8e24c commit c06446c
Showing 4 changed files with 25 additions and 14 deletions.
20 changes: 12 additions & 8 deletions pkg/neg/readiness/poller.go
@@ -162,14 +162,18 @@ func (p *poller) Poll(key negMeta) (retry bool, err error) {
 	// TODO(freehan): filter the NEs that are in interest once the API supports it
 	res, err := p.negCloud.ListNetworkEndpoints(key.Name, key.Zone /*showHealthStatus*/, true, key.SyncerKey.GetAPIVersion())
 	if err != nil {
-		// On receiving GCE API error, do not retry immediately. This is to prevent the reflector to overwhelm the GCE NEG API when
-		// rate limiting is in effect. This will prevent readiness reflector to overwhelm the GCE NEG API and cause NEG syncers to backoff.
-		// This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready
-		// until the next status poll is executed. However, the pods are not marked as Ready and still passes the LB health check will
-		// serve LB traffic. The side effect during the delay period is the workload (depending on rollout strategy) might slow down rollout.
-		// TODO(freehan): enable exponential backoff.
-		p.logger.Error(err, "Failed to ListNetworkEndpoint in NEG. Retrying after some time.", "neg", key.String(), "retryDelay", retryDelay.String())
-		<-p.clock.After(retryDelay)
+		if negtypes.IsStrategyQuotaError(err) {
+			p.logger.V(4).Error(err, "Failed to ListNetworkEndpoints in NEG", "neg", key.String())
+		} else {
+			// On receiving GCE API error, do not retry immediately. This is to prevent the reflector to overwhelm the GCE NEG API when
+			// rate limiting is in effect. This will prevent readiness reflector to overwhelm the GCE NEG API and cause NEG syncers to backoff.
+			// This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready
+			// until the next status poll is executed. However, the pods are not marked as Ready and still passes the LB health check will
+			// serve LB traffic. The side effect during the delay period is the workload (depending on rollout strategy) might slow down rollout.
+			// TODO(freehan): enable exponential backoff.
+			p.logger.Error(err, "Failed to ListNetworkEndpoints in NEG. Retrying after some time.", "neg", key.String(), "retryDelay", retryDelay.String())
+			<-p.clock.After(retryDelay)
+		}
 		return true, err
 	}

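The check that gates the new branch, negtypes.IsStrategyQuotaError, is defined elsewhere in the codebase and is not part of this diff. Below is a minimal sketch of how such a helper is commonly written in Go, assuming a dedicated error type; the type name and fields are illustrative only, not the controller's actual implementation.

```go
// Illustrative sketch only: the real negtypes.IsStrategyQuotaError is not shown in this commit.
package negtypes

import "errors"

// StrategyQuotaError is an assumed marker type for errors that the
// client-side throttling strategy has already delayed for (e.g. a
// quota/rate-limit response from the GCE API).
type StrategyQuotaError struct {
	Err error
}

func (e *StrategyQuotaError) Error() string { return "strategy quota error: " + e.Err.Error() }
func (e *StrategyQuotaError) Unwrap() error { return e.Err }

// IsStrategyQuotaError reports whether err, or any error it wraps, is a
// StrategyQuotaError, letting callers skip their own retry delay.
func IsStrategyQuotaError(err error) bool {
	var sqe *StrategyQuotaError
	return errors.As(err, &sqe)
}
```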
5 changes: 4 additions & 1 deletion pkg/neg/syncers/syncer.go
@@ -92,7 +92,10 @@ func (s *syncer) Start() error {
 			err := s.core.sync()
 			if err != nil {
 				metrics.PublishNegControllerErrorCountMetrics(err, false)
-				delay, retryErr := s.backoff.NextDelay()
+				delay, retryErr := time.Duration(0), error(nil)
+				if !negtypes.IsStrategyQuotaError(err) {
+					delay, retryErr = s.backoff.NextDelay()
+				}
 				retryMsg := ""
 				if retryErr == backoff.ErrRetriesExceeded {
 					retryMsg = "(will not retry)"
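With this change the syncer computes a zero delay for strategy-quota errors, so the retry fires immediately and the only pacing left is the throttling strategy itself. A toy, standard-library-only sketch of what a zero delay means for scheduling; scheduleRetry is a made-up helper, not the controller's actual retry plumbing.

```go
package main

import (
	"fmt"
	"time"
)

// scheduleRetry mimics how a computed delay feeds the next sync attempt.
// For strategy-quota errors the delay is zero, so the resync is immediate;
// other errors still wait out the backoff delay first.
func scheduleRetry(delay time.Duration, sync func()) *time.Timer {
	return time.AfterFunc(delay, sync)
}

func main() {
	done := make(chan struct{})
	scheduleRetry(0, func() { // quota error: no extra wait on top of throttling
		fmt.Println("resync scheduled immediately")
		close(done)
	})
	<-done
}
```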
12 changes: 8 additions & 4 deletions pkg/neg/syncers/transaction.go
@@ -251,7 +251,7 @@ func (s *transactionSyncer) syncInternalImpl() error {
 
 	currentMap, currentPodLabelMap, err := retrieveExistingZoneNetworkEndpointMap(s.NegSyncerKey.NegName, s.zoneGetter, s.cloud, s.NegSyncerKey.GetAPIVersion(), s.endpointsCalculator.Mode(), s.enableDualStackNEG)
 	if err != nil {
-		return fmt.Errorf("%w: %v", negtypes.ErrCurrentNegEPNotFound, err)
+		return fmt.Errorf("%w: %w", negtypes.ErrCurrentNegEPNotFound, err)
 	}
 	s.logStats(currentMap, "current NEG endpoints")
 
@@ -595,9 +595,13 @@ func (s *transactionSyncer) commitTransaction(err error, networkEndpointMap map[
 	}
 
 	if needRetry {
-		if retryErr := s.retry.Retry(); retryErr != nil {
-			s.recordEvent(apiv1.EventTypeWarning, "RetryFailed", fmt.Sprintf("Failed to retry NEG sync for %q: %v", s.NegSyncerKey.String(), retryErr))
-			metrics.PublishNegControllerErrorCountMetrics(retryErr, false)
+		if negtypes.IsStrategyQuotaError(err) {
+			s.syncer.Sync()
+		} else {
+			if retryErr := s.retry.Retry(); retryErr != nil {
+				s.recordEvent(apiv1.EventTypeWarning, "RetryFailed", fmt.Sprintf("Failed to retry NEG sync for %q: %v", s.NegSyncerKey.String(), retryErr))
+				metrics.PublishNegControllerErrorCountMetrics(retryErr, false)
+			}
 		}
 		return
 	}
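This retry branch is the other half of the double-delay fix: a quota error triggers an immediate resync through the syncer, while every other error still goes through the backoff-based retry handler. A hedged sketch of that decision in isolation; the interface, helper, and error names below are assumptions, not the controller's real types, and errors.Is stands in for negtypes.IsStrategyQuotaError.

```go
package main

import (
	"errors"
	"fmt"
)

// errQuota is an assumed stand-in for an error already delayed by the
// throttling strategy.
var errQuota = errors.New("quota exceeded (already delayed by throttling)")

type retryHandler interface{ Retry() error }

type noopRetry struct{}

func (noopRetry) Retry() error { return nil }

// requeue shows the pattern: quota errors resync immediately because the
// throttling strategy has already paced the call; other errors are handed
// to the backoff-based retry handler.
func requeue(err error, resync func(), retry retryHandler) error {
	if errors.Is(err, errQuota) { // stand-in for negtypes.IsStrategyQuotaError(err)
		resync()
		return nil
	}
	return retry.Retry()
}

func main() {
	_ = requeue(errQuota, func() { fmt.Println("immediate resync") }, noopRetry{})
}
```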
2 changes: 1 addition & 1 deletion pkg/neg/syncers/utils.go
@@ -672,7 +672,7 @@ func retrieveExistingZoneNetworkEndpointMap(negName string, zoneGetter negtypes.
 			metrics.PublishNegControllerErrorCountMetrics(err, true)
 			continue
 		}
-		return nil, nil, fmt.Errorf("Failed to lookup NEG in zone %q, candidate zones %v, err - %v", zone, candidateZonesMap, err)
+		return nil, nil, fmt.Errorf("Failed to lookup NEG in zone %q, candidate zones %v, err - %w", zone, candidateZonesMap, err)
 	}
 	zoneNetworkEndpointMap[zone] = negtypes.NewNetworkEndpointSet()
 	for _, ne := range networkEndpointsWithHealthStatus {
