Don't CommitUpdate twice in a single reconcile loop (#1684)
Multiple updates in a single reconcile loop mean that there are multiple
changes with new resourceVersions floating around. This then leads to
races where you might end up reconciling on the resource from after
update1 when you really want the change from after update2.

Since the two-update pattern was just an optimization, it has been removed.
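To make the race concrete, here is a minimal sketch of the before/after commit pattern. The call ordering is hypothetical (the real changes are in azure_deployment_reconciler.go below), but the setter and CommitUpdate names are from this diff:

	// Before: two commits per reconcile. Each CommitUpdate bumps the object's
	// resourceVersion, so a watch event carrying the first version can race
	// with, and be reconciled instead of, the second.
	r.SetResourceProvisioningState(state)
	_ = r.CommitUpdate(ctx) // update 1 -> resourceVersion N+1
	r.SetDeploymentName("")
	_ = r.CommitUpdate(ctx) // update 2 -> resourceVersion N+2

	// After: accumulate all annotation changes in memory, then commit once,
	// so only a single new resourceVersion is ever in flight.
	r.SetResourceProvisioningState(state)
	r.SetDeploymentName("")
	_ = r.CommitUpdate(ctx) // single update -> one new resourceVersion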
matthchr authored Aug 4, 2021
1 parent 189aae5 commit 1db5905
Showing 3 changed files with 79 additions and 112 deletions.
20 changes: 9 additions & 11 deletions hack/generated/controllers/generic_controller.go
@@ -39,7 +39,7 @@ type GenericReconciler struct {
 	Name                 string
 	GVK                  schema.GroupVersionKind
 	Controller           controller.Controller
-	RequeueDelay         time.Duration
+	RequeueDelayOverride time.Duration
 	CreateDeploymentName func(obj metav1.Object) (string, error)
 }

@@ -54,11 +54,6 @@ type Options struct {
 }

 func (options *Options) setDefaults() {
-	// default requeue delay to 5 seconds
-	if options.RequeueDelay == 0 {
-		options.RequeueDelay = 5 * time.Second
-	}
-
 	// override deployment name generator, if provided
 	if options.CreateDeploymentName == nil {
 		options.CreateDeploymentName = createDeploymentName
@@ -135,7 +130,7 @@ func register(mgr ctrl.Manager, reconciledResourceLookup map[schema.GroupKind]sc
 		Log:      log.WithName(controllerName),
 		Recorder: mgr.GetEventRecorderFor(controllerName),
 		GVK:      gvk,
-		RequeueDelay:         options.RequeueDelay,
+		RequeueDelayOverride: options.RequeueDelay,
 		CreateDeploymentName: options.CreateDeploymentName,
 	}
@@ -225,10 +220,13 @@ func (gr *GenericReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
 		return ctrl.Result{}, err
 	}

-	// TODO: This actually means we don't get exponential backoff which Kubernetes does by default,
-	// TODO: but we need this for test today to get fast reconciles.
-	if result.Requeue && result.RequeueAfter == time.Duration(0) {
-		result.RequeueAfter = gr.RequeueDelay
+	// If we have a requeue delay override, set it for all situations where
+	// we are requeueing.
+	hasRequeueDelayOverride := gr.RequeueDelayOverride != time.Duration(0)
+	isRequeueing := result.Requeue || result.RequeueAfter > time.Duration(0)
+	if hasRequeueDelayOverride && isRequeueing {
+		result.RequeueAfter = gr.RequeueDelayOverride
+		result.Requeue = true
 	}

 	return result, nil
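A standalone sketch of the override semantics introduced above, assuming only controller-runtime's ctrl.Result type (the applyOverride helper is illustrative, not part of the codebase): any requeue request, whether flag-based or timed, is normalized to the fixed override delay, while a zero override leaves controller-runtime's default backoff behavior untouched.

	package main

	import (
		"fmt"
		"time"

		ctrl "sigs.k8s.io/controller-runtime"
	)

	// applyOverride mirrors the logic added to GenericReconciler.Reconcile.
	func applyOverride(result ctrl.Result, override time.Duration) ctrl.Result {
		isRequeueing := result.Requeue || result.RequeueAfter > 0
		if override != 0 && isRequeueing {
			result.RequeueAfter = override
			result.Requeue = true
		}
		return result
	}

	func main() {
		override := 100 * time.Millisecond
		// A bare requeue flag gets the fixed delay.
		fmt.Println(applyOverride(ctrl.Result{Requeue: true}, override))
		// A timed requeue (e.g. an ARM retry-after) is also forced to the override.
		fmt.Println(applyOverride(ctrl.Result{RequeueAfter: 5 * time.Second}, override))
		// No override: the result passes through untouched, preserving default backoff.
		fmt.Println(applyOverride(ctrl.Result{Requeue: true}, 0))
	}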
169 changes: 69 additions & 100 deletions hack/generated/pkg/reconcilers/azure_deployment_reconciler.go
@@ -12,7 +12,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
-	"strconv"
 	"strings"
 	"time"

@@ -47,8 +46,6 @@ const (
 	ResourceStateAnnotation  = "resource-state.infra.azure.com"
 	ResourceErrorAnnotation  = "resource-error.infra.azure.com"
 	ResourceSigAnnotationKey = "resource-sig.infra.azure.com"
-	// PreserveDeploymentAnnotation is the key which tells the applier to keep or delete the deployment
-	PreserveDeploymentAnnotation = "x-preserve-deployment"
 )

 // TODO: Do we actually want this at the controller level or this level?
@@ -186,23 +183,6 @@ func (r *AzureDeploymentReconciler) SetDeploymentName(name string) {
 	genruntime.AddAnnotation(r.obj, DeploymentNameAnnotation, name)
 }

-func (r *AzureDeploymentReconciler) GetShouldPreserveDeployment() bool {
-	preserveDeploymentString, ok := r.obj.GetAnnotations()[PreserveDeploymentAnnotation]
-	if !ok {
-		return false
-	}
-
-	preserveDeployment, err := strconv.ParseBool(preserveDeploymentString)
-	// Anything other than an error is assumed to be false...
-	// TODO: Would we rather have any usage of this key imply true (regardless of value?)
-	if err != nil {
-		// TODO: Log here
-		return false
-	}
-
-	return preserveDeployment
-}
-
 func (r *AzureDeploymentReconciler) SetResourceProvisioningState(state armclient.ProvisioningState) {
 	// TODO: It's almost certainly not safe to use this as our serialized format as it's not guaranteed backwards compatible?
 	genruntime.AddAnnotation(r.obj, ResourceStateAnnotation, string(state))
@@ -497,7 +477,7 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.Result, error) {

 	err = r.CommitUpdate(ctx)
 	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 		return ctrl.Result{}, client.IgnoreNotFound(err)
 	}
@@ -531,7 +511,7 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.Result, error) {
 	err = r.CommitUpdate(ctx)

 	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 		return ctrl.Result{}, client.IgnoreNotFound(err)
 	}
@@ -560,7 +540,40 @@ func (r *AzureDeploymentReconciler) CreateDeployment(ctx context.Context) (ctrl.Result, error) {
 	return result, err
 }

+// TODO: There's a bit too much duplicated code between this and create deployment -- should be a good way to combine them?
+func (r *AzureDeploymentReconciler) handleDeploymentFinished(ctx context.Context, deployment *armclient.Deployment) (ctrl.Result, error) {
+	var status genruntime.FromARMConverter
+	if deployment.IsSuccessful() {
+		// TODO: There's some overlap here with what Update does
+		if len(deployment.Properties.OutputResources) == 0 {
+			return ctrl.Result{}, errors.Errorf("template deployment didn't have any output resources")
+		}
+
+		resourceID, err := deployment.ResourceID()
+		if err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "getting resource ID from resource")
+		}
+
+		status, _, err = r.getStatus(ctx, resourceID)
+		if err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "getting status from ARM")
+		}
+	}
+
+	err := r.Update(deployment, status)
+	if err != nil {
+		return ctrl.Result{}, errors.Wrap(err, "updating obj")
+	}
+
+	err = r.CommitUpdate(ctx)
+	if err != nil {
+		// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	return ctrl.Result{}, nil
+}
+
 func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl.Result, error) {
 	deploymentID, deploymentIDOk := r.GetDeploymentID()
 	if !deploymentIDOk {
@@ -576,7 +589,7 @@ func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl.Result, error) {
 		r.SetDeploymentName("")
 		err = r.CommitUpdate(ctx)
 		if err != nil {
-			// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
+			// NotFound is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
 			// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
 			return ctrl.Result{}, client.IgnoreNotFound(err)
 		}
@@ -593,90 +606,46 @@ func (r *AzureDeploymentReconciler) MonitorDeployment(ctx context.Context) (ctrl.Result, error) {
 		return ctrl.Result{}, errors.Wrapf(err, "getting deployment %q from ARM", deploymentID)
 	}

-	var status genruntime.FromARMConverter
-	if deployment.IsSuccessful() {
-		// TODO: There's some overlap here with what Update does
-		if len(deployment.Properties.OutputResources) == 0 {
-			return ctrl.Result{}, errors.Errorf("template deployment didn't have any output resources")
-		}
-
-		resourceID, idErr := deployment.ResourceID()
-		if idErr != nil {
-			return ctrl.Result{}, errors.Wrap(idErr, "getting resource ID from resource")
-		}
-
-		s, _, statusErr := r.getStatus(ctx, resourceID)
-		if statusErr != nil {
-			return ctrl.Result{}, errors.Wrap(statusErr, "getting status from ARM")
-		}
-
-		status = s
-	}
-
-	err = r.Update(deployment, status)
-	if err != nil {
-		return ctrl.Result{}, errors.Wrap(err, "updating obj")
-	}
-
-	err = r.CommitUpdate(ctx)
+	r.log.V(4).Info(
+		"Monitoring deployment",
+		"action", string(CreateOrUpdateActionMonitorDeployment),
+		"id", deploymentID,
+		"state", deployment.ProvisioningStateOrUnknown())
+	r.recorder.Event(
+		r.obj,
+		v1.EventTypeNormal,
+		string(CreateOrUpdateActionMonitorDeployment),
+		fmt.Sprintf("Monitoring Azure deployment ID=%q, state=%q", deploymentID, deployment.ProvisioningStateOrUnknown()))
+
+	// If the deployment isn't done yet, there's nothing to do just bail out
+	if !deployment.IsTerminalProvisioningState() {
+		r.log.V(3).Info("Deployment still running")
+		return ctrl.Result{Requeue: true, RequeueAfter: retryAfter}, nil
+	}
+
+	// The deployment is in a terminal state - let's handle it
+	r.log.Info(
+		"Deployment in terminal state",
+		"DeploymentID", deployment.ID,
+		"State", deployment.ProvisioningStateOrUnknown(),
+		"Error", deployment.ErrorOrEmpty())
+
+	// It is possible that we delete the deployment here and then are unable to persist the details of the created
+	// resource to etcd below. If this happens, a subsequent reconciliation will attempt to GET the deployment which will
+	// fail. That will trigger us to throw the deployment ID away and create a new one (which will end up being a no-op
+	// because the Azure resource already exists). Since it's expected that this sequence of events is rare, we don't
+	// try to optimize for preventing it with some sort of two phase commit or anything.
+	// TODO: Create a unit test that forces this specific sequence of events
+	r.log.V(4).Info("Deleting deployment", "DeploymentID", deployment.ID)
+	_, err = r.ARMClient.DeleteDeployment(ctx, deployment.ID)
 	if err != nil {
-		// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
-		// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
-		return ctrl.Result{}, client.IgnoreNotFound(err)
+		return ctrl.Result{}, errors.Wrapf(err, "failed deleting deployment %q", deployment.ID)
 	}

-	// TODO: Could somehow have a method that grouped both of these calls
-	currentState := r.GetResourceProvisioningState()
-	r.log.V(4).Info("Monitoring deployment", "action", string(CreateOrUpdateActionMonitorDeployment), "id", deployment.ID, "state", currentState)
-	r.recorder.Event(r.obj, v1.EventTypeNormal, string(CreateOrUpdateActionMonitorDeployment), fmt.Sprintf("Monitoring Azure deployment ID=%q, state=%q", deployment.ID, currentState))
-
-	// We do two patches here because if we remove the deployment before we've actually confirmed we persisted
-	// the resource ID, then we will be unable to get the resource ID the next time around. Only once we have
-	// persisted the resource ID can we safely delete the deployment
-	retryAfter = time.Duration(0) // ARM can tell us how long to check after issuing DELETE
-	if deployment.IsTerminalProvisioningState() && !r.GetShouldPreserveDeployment() {
-		r.log.Info(
-			"Deployment in terminal state",
-			"DeploymentID", deployment.ID,
-			"State", deployment.ProvisioningStateOrUnknown(),
-			"Error", deployment.ErrorOrEmpty())
-
-		r.log.V(4).Info("Deleting deployment", "DeploymentID", deployment.ID)
-
-		retryAfter, err = r.ARMClient.DeleteDeployment(ctx, deployment.ID)
-		if err != nil {
-			return ctrl.Result{}, errors.Wrapf(err, "failed deleting deployment %q", deployment.ID)
-		}
-
-		deployment.ID = ""
-		deployment.Name = ""
-
-		err = r.Update(deployment, status)
-		if err != nil {
-			return ctrl.Result{}, errors.Wrap(err, "updating obj")
-		}
-
-		err = r.CommitUpdate(ctx)
-		if err != nil {
-			// This is a superfluous error as per https://github.com/kubernetes-sigs/controller-runtime/issues/377
-			// The correct handling is just to ignore it and we will get an event shortly with the updated version to patch
-			return ctrl.Result{}, client.IgnoreNotFound(err)
-		}
-
-		r.log.V(4).Info(
-			"Cleared deployment info from resource",
-			"DeploymentID", r.GetDeploymentIDOrDefault(),
-			"DeploymentName", r.GetDeploymentNameOrDefault())
-	}
-
-	if deployment.IsTerminalProvisioningState() {
-		// we are done
-		return ctrl.Result{}, nil
-	}
+	deployment.ID = ""
+	deployment.Name = ""

-	r.log.V(3).Info("Deployment still running")
-	return ctrl.Result{Requeue: true, RequeueAfter: retryAfter}, err
+	return r.handleDeploymentFinished(ctx, deployment)
 }

 func (r *AzureDeploymentReconciler) ManageOwnership(ctx context.Context) (ctrl.Result, error) {
2 changes: 1 addition & 1 deletion hack/generated/pkg/testcommon/kube_test_context_envtest.go
@@ -65,7 +65,7 @@ func createEnvtestContext(perTestContext PerTestContext) (*KubeBaseTestContext, error) {
 		return nil, errors.Wrapf(err, "creating controller-runtime manager")
 	}

-	var requeueDelay time.Duration // defaults to 5s when zero is passed
+	var requeueDelay time.Duration
 	if perTestContext.AzureClientRecorder.Mode() == recorder.ModeReplaying {
 		perTestContext.T.Log("Minimizing requeue delay")
 		// skip requeue delays when replaying
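For context, a hedged sketch of what this change means for test wiring (the package and field names below are assumed from the generic_controller.go diff above, and the fragment is illustrative rather than verbatim): a zero requeueDelay now means "no override, keep controller-runtime's default backoff" instead of "default to 5 seconds", while replay-mode tests still pass a small fixed delay for fast, deterministic requeues.

	// Illustrative wiring only; names assumed from the diffs above.
	var requeueDelay time.Duration // zero => no override; default backoff applies
	if replaying {                 // e.g. recorder.ModeReplaying in the envtest context
		requeueDelay = 100 * time.Millisecond // fast, fixed requeues during replay
	}
	options := controllers.Options{RequeueDelay: requeueDelay}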
