From 8cfa568adb65756547fe2791e3113337852bc7d5 Mon Sep 17 00:00:00 2001 From: Dave Protasowski Date: Thu, 28 Apr 2022 17:15:52 -0400 Subject: [PATCH] ServerlessService's initial operation mode is now Proxy (#12842) Prior the initial operation mode was Serve. This would create a narrow window where the revision's pods directly receive requests from the ingress. The size of this window depends on when the Revision's Pod becomes ready and the time it takes the autoscaler to make an initial decision on scale an excess burst capacity. If there is a continuous traffic this could overwhelm Revisions with a low container concurrency causing the queue-proxy to return 503s since it's queue is full. --- pkg/reconciler/autoscaling/kpa/kpa.go | 24 +++++++++++++--------- pkg/reconciler/autoscaling/kpa/kpa_test.go | 17 ++++++++------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/pkg/reconciler/autoscaling/kpa/kpa.go b/pkg/reconciler/autoscaling/kpa/kpa.go index d0d18712f373..5f38600e585e 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa.go +++ b/pkg/reconciler/autoscaling/kpa/kpa.go @@ -98,7 +98,7 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, pa *autoscalingv1alpha1. if sks == nil || sks.Status.PrivateServiceName == "" { // Before we can reconcile decider and get real number of activators // we start with default of 2. - if _, err = c.ReconcileSKS(ctx, pa, nv1alpha1.SKSOperationModeServe, minActivators); err != nil { + if _, err = c.ReconcileSKS(ctx, pa, nv1alpha1.SKSOperationModeProxy, minActivators); err != nil { return fmt.Errorf("error reconciling SKS: %w", err) } pa.Status.MarkSKSNotReady(noPrivateServiceName) // In both cases this is true. @@ -123,22 +123,26 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, pa *autoscalingv1alpha1. return fmt.Errorf("error scaling target: %w", err) } - mode := nv1alpha1.SKSOperationModeServe + mode := nv1alpha1.SKSOperationModeProxy + switch { // When activator CA is enabled, force activator always in path. // TODO: This is a temporary state and to be fixed. // See also issues/11906 and issues/12797. case len(config.FromContext(ctx).Network.ActivatorCA) > 0: mode = nv1alpha1.SKSOperationModeProxy - // We put activator in the serving path in the following cases: - // 1. The revision is scaled to 0: - // a. want == 0 - // b. want == -1 && PA is inactive (Autoscaler has no previous knowledge of - // this revision, e.g. after a restart) but PA status is inactive (it was - // already scaled to 0). - // 2. The excess burst capacity is negative. - case want == 0 || decider.Status.ExcessBurstCapacity < 0 || want == scaleUnknown && pa.Status.IsInactive(): + + // If the want == -1 and PA is inactive that implies the autoscaler + // has no knowledge of the revision (due to restart) but it was previously + // scaled down (inactive). In this instance we want to remain in Proxy Mode + case want == scaleUnknown && pa.Status.IsInactive(): mode = nv1alpha1.SKSOperationModeProxy + + // We remove the activator from the serving path when + // we want the revision's scale to be greater than 0 + // and we have excess burst capacity (>=0) + case want > 0 && decider.Status.ExcessBurstCapacity >= 0: + mode = nv1alpha1.SKSOperationModeServe } // Compare the desired and observed resources to determine our situation. diff --git a/pkg/reconciler/autoscaling/kpa/kpa_test.go b/pkg/reconciler/autoscaling/kpa/kpa_test.go index 30d818f67e3d..7f8739d0480d 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa_test.go +++ b/pkg/reconciler/autoscaling/kpa/kpa_test.go @@ -607,7 +607,7 @@ func TestReconcile(t *testing.T) { WithObservedGeneration(1)), }}, WantCreates: []runtime.Object{ - sks(testNamespace, testRevision, WithDeployRef(deployName), WithNumActivators(minActivators), sksNoConds), + sks(testNamespace, testRevision, WithDeployRef(deployName), WithNumActivators(minActivators), WithProxyMode, sksNoConds), }, }, { Name: "sks is out of whack", @@ -642,7 +642,7 @@ func TestReconcile(t *testing.T) { }, WantErr: true, WantCreates: []runtime.Object{ - sks(testNamespace, testRevision, WithDeployRef(deployName), WithNumActivators(minActivators), sksNoConds), + sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithNumActivators(minActivators), sksNoConds), }, WantEvents: []string{ Eventf(corev1.EventTypeWarning, "InternalError", @@ -653,7 +653,7 @@ func TestReconcile(t *testing.T) { Key: key, Objects: []runtime.Object{ kpa(testNamespace, testRevision, withScales(1, defaultScale), WithPAMetricsService(privateSvc), WithTraffic, WithObservedGeneration(1)), - sks(testNamespace, testRevision, WithDeployRef("bar")), + sks(testNamespace, testRevision, WithProxyMode, WithDeployRef("bar")), metric(testNamespace, testRevision), defaultDeployment, }, @@ -662,7 +662,7 @@ func TestReconcile(t *testing.T) { }, WantErr: true, WantUpdates: []clientgotesting.UpdateActionImpl{{ - Object: sks(testNamespace, testRevision, WithDeployRef(deployName), WithNumActivators(minActivators)), + Object: sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithNumActivators(minActivators)), }}, WantEvents: []string{ Eventf(corev1.EventTypeWarning, "InternalError", "error reconciling SKS: error updating SKS test-revision: inducing failure for update serverlessservices"), @@ -1059,7 +1059,7 @@ func TestReconcile(t *testing.T) { kpa(testNamespace, testRevision, withScales(0, -1), WithReachabilityReachable, WithPAMetricsService(privateSvc), WithPASKSNotReady(noPrivateServiceName)), // SKS won't be ready bc no ready endpoints, but private service name will be populated. - sks(testNamespace, testRevision, WithDeployRef(deployName), WithPrivateService, WithPubService), + sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithPrivateService, WithPubService), metric(testNamespace, testRevision), deploy(testNamespace, testRevision, func(d *appsv1.Deployment) { d.Spec.Replicas = ptr.Int32(0) @@ -1083,7 +1083,7 @@ func TestReconcile(t *testing.T) { kpa(testNamespace, testRevision, withScales(0, -1), WithReachabilityReachable, WithPAMetricsService(privateSvc), WithPASKSNotReady(noPrivateServiceName)), // SKS won't be ready bc no ready endpoints, but private service name will be populated. - sks(testNamespace, testRevision, WithDeployRef(deployName), WithPrivateService), + sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithPrivateService), metric(testNamespace, testRevision), deploy(testNamespace, testRevision, func(d *appsv1.Deployment) { d.Spec.Replicas = ptr.Int32(0) @@ -1101,13 +1101,12 @@ func TestReconcile(t *testing.T) { Name: "initial scale zero: stay at zero", Key: key, Ctx: context.WithValue(context.WithValue(context.Background(), asConfigKey{}, initialScaleZeroASConfig()), deciderKey{}, - decider(testNamespace, testRevision, -1, /* desiredScale */ - 0 /* ebc */)), + decider(testNamespace, testRevision, -1 /* desiredScale */, 0 /* ebc */)), Objects: []runtime.Object{ kpa(testNamespace, testRevision, markScaleTargetInitialized, withScales(0, scaleUnknown), WithReachabilityReachable, WithPAMetricsService(privateSvc), WithPASKSNotReady(""), ), - sks(testNamespace, testRevision, WithDeployRef(deployName), WithPrivateService), + sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithPrivateService), metric(testNamespace, testRevision), deploy(testNamespace, testRevision, func(d *appsv1.Deployment) { d.Spec.Replicas = ptr.Int32(0)