From 2fe0c7634cc43cc0321abf0a020c570400eee6e3 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 22 Feb 2023 11:12:56 -0800 Subject: [PATCH] Prevent eviction hanging due to do-not-evict (#220) --- pkg/controllers/deprovisioning/drift.go | 12 +- pkg/controllers/deprovisioning/emptiness.go | 9 +- pkg/controllers/deprovisioning/expiration.go | 14 +- pkg/controllers/deprovisioning/helpers.go | 29 +-- pkg/controllers/deprovisioning/suite_test.go | 3 +- pkg/controllers/inflightchecks/termination.go | 9 - .../machine/terminator/terminator.go | 3 - pkg/controllers/termination/suite_test.go | 217 ------------------ 8 files changed, 31 insertions(+), 265 deletions(-) diff --git a/pkg/controllers/deprovisioning/drift.go b/pkg/controllers/deprovisioning/drift.go index 308a831be7..7998d27361 100644 --- a/pkg/controllers/deprovisioning/drift.go +++ b/pkg/controllers/deprovisioning/drift.go @@ -19,6 +19,7 @@ import ( "errors" "fmt" + "github.com/samber/lo" v1 "k8s.io/api/core/v1" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" @@ -61,13 +62,12 @@ func (d *Drift) ComputeCommand(ctx context.Context, candidates ...CandidateNode) if err != nil { return Command{}, fmt.Errorf("tracking PodDisruptionBudgets, %w", err) } - for _, candidate := range candidates { - // is this a node that we can terminate? This check is meant to be fast so we can save the expense of simulated - // scheduling unless its really needed - if _, canTerminate := canBeTerminated(candidate, pdbs); !canTerminate { - continue - } + candidates = lo.Filter(candidates, func(n CandidateNode, _ int) bool { + _, canTerminate := canBeTerminated(n, pdbs) + return canTerminate + }) + for _, candidate := range candidates { // Check if we need to create any nodes. newNodes, allPodsScheduled, err := simulateScheduling(ctx, d.kubeClient, d.cluster, d.provisioner, candidate) if err != nil { diff --git a/pkg/controllers/deprovisioning/emptiness.go b/pkg/controllers/deprovisioning/emptiness.go index cf0e2ce457..df02d95673 100644 --- a/pkg/controllers/deprovisioning/emptiness.go +++ b/pkg/controllers/deprovisioning/emptiness.go @@ -18,9 +18,8 @@ import ( "context" "time" - "k8s.io/utils/clock" - v1 "k8s.io/api/core/v1" + "k8s.io/utils/clock" "knative.dev/pkg/logging" "knative.dev/pkg/ptr" @@ -66,7 +65,11 @@ func (e *Emptiness) ShouldDeprovision(ctx context.Context, n *state.Node, provis // ComputeCommand generates a deprovisioning command given deprovisionable nodes func (e *Emptiness) ComputeCommand(_ context.Context, nodes ...CandidateNode) (Command, error) { - emptyNodes := lo.Filter(nodes, func(n CandidateNode, _ int) bool { return len(n.pods) == 0 }) + emptyNodes := lo.Filter(nodes, func(n CandidateNode, _ int) bool { + _, canTerminate := canBeTerminated(n, nil) + return len(n.pods) == 0 && canTerminate + }) + if len(emptyNodes) == 0 { return Command{action: actionDoNothing}, nil } diff --git a/pkg/controllers/deprovisioning/expiration.go b/pkg/controllers/deprovisioning/expiration.go index 3bc7edd372..c3f9319b62 100644 --- a/pkg/controllers/deprovisioning/expiration.go +++ b/pkg/controllers/deprovisioning/expiration.go @@ -21,6 +21,7 @@ import ( "sort" "time" + "github.com/samber/lo" "k8s.io/utils/clock" v1 "k8s.io/api/core/v1" @@ -72,13 +73,12 @@ func (e *Expiration) ComputeCommand(ctx context.Context, candidates ...Candidate if err != nil { return Command{}, fmt.Errorf("tracking PodDisruptionBudgets, %w", err) } - for _, candidate := range candidates { - // is this a node that we can terminate? This check is meant to be fast so we can save the expense of simulated - // scheduling unless its really needed - if _, canBeTerminated := canBeTerminated(candidate, pdbs); !canBeTerminated { - continue - } + candidates = lo.Filter(candidates, func(n CandidateNode, _ int) bool { + _, canTerminate := canBeTerminated(n, pdbs) + return canTerminate + }) + for _, candidate := range candidates { // Check if we need to create any nodes. newNodes, allPodsScheduled, err := simulateScheduling(ctx, e.kubeClient, e.cluster, e.provisioner, candidate) if err != nil { @@ -93,7 +93,7 @@ func (e *Expiration) ComputeCommand(ctx context.Context, candidates ...Candidate logging.FromContext(ctx).With("node", candidate.Name).Debugf("continuing to expire node after scheduling simulation failed to schedule all pods") } - logging.FromContext(ctx).With("expirationTTL", time.Duration(ptr.Int64Value(candidates[0].provisioner.Spec.TTLSecondsUntilExpired))*time.Second). + logging.FromContext(ctx).With("ttl", time.Duration(ptr.Int64Value(candidates[0].provisioner.Spec.TTLSecondsUntilExpired))*time.Second). With("delay", time.Since(getExpirationTime(candidates[0].Node, candidates[0].provisioner))).Infof("triggering termination for expired node after TTL") // were we able to schedule all the pods on the inflight nodes? diff --git a/pkg/controllers/deprovisioning/helpers.go b/pkg/controllers/deprovisioning/helpers.go index d776fd1fca..0d86943794 100644 --- a/pkg/controllers/deprovisioning/helpers.go +++ b/pkg/controllers/deprovisioning/helpers.go @@ -329,27 +329,18 @@ func canBeTerminated(node CandidateNode, pdbs *PDBLimits) (string, bool) { if !node.DeletionTimestamp.IsZero() { return "in the process of deletion", false } - if pdb, ok := pdbs.CanEvictPods(node.pods); !ok { - return fmt.Sprintf("pdb %s prevents pod evictions", pdb), false - } - - if reason, ok := PodsPreventEviction(node.pods); ok { - return reason, false + if pdbs != nil { + if pdb, ok := pdbs.CanEvictPods(node.pods); !ok { + return fmt.Sprintf("pdb %s prevents pod evictions", pdb), false + } } - return "", true -} - -// PodsPreventEviction returns true if there are pods that would prevent eviction -func PodsPreventEviction(pods []*v1.Pod) (string, bool) { - for _, p := range pods { - // don't care about pods that are finishing, finished or owned by the node + if p, ok := lo.Find(node.pods, func(p *v1.Pod) bool { if pod.IsTerminating(p) || pod.IsTerminal(p) || pod.IsOwnedByNode(p) { - continue - } - - if pod.HasDoNotEvict(p) { - return fmt.Sprintf("pod %s/%s has do not evict annotation", p.Namespace, p.Name), true + return false } + return pod.HasDoNotEvict(p) + }); ok { + return fmt.Sprintf("pod %s/%s has do not evict annotation", p.Namespace, p.Name), false } - return "", false + return "", true } diff --git a/pkg/controllers/deprovisioning/suite_test.go b/pkg/controllers/deprovisioning/suite_test.go index ab7aba3b32..5b2cad9106 100644 --- a/pkg/controllers/deprovisioning/suite_test.go +++ b/pkg/controllers/deprovisioning/suite_test.go @@ -932,7 +932,7 @@ var _ = Describe("Delete Node", func() { ExpectNotFound(ctx, env.Client, node1) }) It("can delete nodes, considers do-not-evict", func() { - // create our RS so we can link a pod to it + // create our RS, so we can link a pod to it rs := test.ReplicaSet() ExpectApplied(ctx, env.Client, rs) Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(rs), rs)).To(Succeed()) @@ -1000,6 +1000,7 @@ var _ = Describe("Delete Node", func() { ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node2)) fakeClock.Step(10 * time.Minute) + var wg sync.WaitGroup ExpectTriggerVerifyAction(&wg) _, err := deprovisioningController.Reconcile(ctx, reconcile.Request{}) diff --git a/pkg/controllers/inflightchecks/termination.go b/pkg/controllers/inflightchecks/termination.go index bd2c98c6b9..30e5840541 100644 --- a/pkg/controllers/inflightchecks/termination.go +++ b/pkg/controllers/inflightchecks/termination.go @@ -42,7 +42,6 @@ func (t *Termination) Check(ctx context.Context, node *v1.Node, provisioner *v1a if node.DeletionTimestamp.IsZero() { return nil, nil } - pods, err := nodeutils.GetNodePods(ctx, t.kubeClient, node) if err != nil { return nil, err @@ -54,13 +53,5 @@ func (t *Termination) Check(ctx context.Context, node *v1.Node, provisioner *v1a message: fmt.Sprintf("Can't drain node, PDB %s is blocking evictions", pdb), }) } - - if reason, ok := deprovisioning.PodsPreventEviction(pods); ok { - issues = append(issues, Issue{ - node: node, - message: fmt.Sprintf("Can't drain node, %s", reason), - }) - } - return issues, nil } diff --git a/pkg/controllers/machine/terminator/terminator.go b/pkg/controllers/machine/terminator/terminator.go index 02ecd4b4cb..b8ebc88bdd 100644 --- a/pkg/controllers/machine/terminator/terminator.go +++ b/pkg/controllers/machine/terminator/terminator.go @@ -76,9 +76,6 @@ func (t *Terminator) Drain(ctx context.Context, node *v1.Node) error { var podsToEvict []*v1.Pod // Skip node due to pods that are not able to be evicted for _, p := range pods { - if podutil.HasDoNotEvict(p) { - return NewNodeDrainError(fmt.Errorf("pod %s/%s has do-not-evict annotation", p.Namespace, p.Name)) - } // Ignore if unschedulable is tolerated, since they will reschedule if podutil.ToleratesUnschedulableTaint(p) { continue diff --git a/pkg/controllers/termination/suite_test.go b/pkg/controllers/termination/suite_test.go index 750b312785..3affde4703 100644 --- a/pkg/controllers/termination/suite_test.go +++ b/pkg/controllers/termination/suite_test.go @@ -43,7 +43,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" . "knative.dev/pkg/logging/testing" - "knative.dev/pkg/ptr" . "github.com/aws/karpenter-core/pkg/test/expectations" ) @@ -166,143 +165,6 @@ var _ = Describe("Termination", func() { ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) ExpectNotFound(ctx, env.Client, node) }) - It("should not delete nodes that have a do-not-evict pod", func() { - podEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - podNoEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}, - OwnerReferences: defaultOwnerRefs, - }, - }) - - ExpectApplied(ctx, env.Client, node, podEvict, podNoEvict) - - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - - // Expect no pod to be enqueued for eviction - ExpectNotEnqueuedForEviction(evictionQueue, podEvict, podNoEvict) - - // Expect node to exist and be draining - ExpectNodeDraining(env.Client, node.Name) - - // Delete do-not-evict pod - ExpectDeleted(ctx, env.Client, podNoEvict) - - // Reconcile node to evict pod - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - - // Expect podEvict to be enqueued for eviction then be successful - ExpectEvicted(env.Client, podEvict) - - // Delete pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not delete nodes that have a do-not-evict pod that tolerates an unschedulable taint", func() { - podEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - podNoEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - Tolerations: []v1.Toleration{{Key: v1.TaintNodeUnschedulable, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoSchedule}}, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}, - OwnerReferences: defaultOwnerRefs, - }, - }) - - ExpectApplied(ctx, env.Client, node, podEvict, podNoEvict) - - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - - // Expect no pod to be enqueued for eviction - ExpectNotEnqueuedForEviction(evictionQueue, podEvict, podNoEvict) - - // Expect node to exist and be draining - ExpectNodeDraining(env.Client, node.Name) - - // Delete do-not-evict pod - ExpectDeleted(ctx, env.Client, podNoEvict) - - // Reconcile node to evict pod - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - - // Expect podEvict to be enqueued for eviction then be successful - ExpectEvicted(env.Client, podEvict) - - // Delete pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - ExpectNotFound(ctx, env.Client, node) - }) - It("should not delete nodes that have a do-not-evict static pod", func() { - ExpectApplied(ctx, env.Client, node) - node = ExpectNodeExists(ctx, env.Client, node.Name) - podEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }) - podNoEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}, - OwnerReferences: []metav1.OwnerReference{{ - APIVersion: "v1", - Kind: "Node", - Name: node.Name, - UID: node.UID, - }}, - }, - }) - - ExpectApplied(ctx, env.Client, podEvict, podNoEvict) - - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - - // Expect no pod to be enqueued for eviction - ExpectNotEnqueuedForEviction(evictionQueue, podEvict, podNoEvict) - - // Expect node to exist and be draining - ExpectNodeDraining(env.Client, node.Name) - - // Delete do-not-evict pod - ExpectDeleted(ctx, env.Client, podNoEvict) - - // Reconcile node to evict pod - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - - // Expect podEvict to be enqueued for eviction then be successful - ExpectEvicted(env.Client, podEvict) - - // Delete pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - ExpectNotFound(ctx, env.Client, node) - }) It("should delete nodes that have pods without an ownerRef", func() { pod := test.Pod(test.PodOptions{ NodeName: node.Name, @@ -332,50 +194,6 @@ var _ = Describe("Termination", func() { // Reconcile to delete node ExpectNotFound(ctx, env.Client, node) }) - It("should not delete nodes that have a do-not-delete pod without an ownerRef", func() { - podEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - OwnerReferences: nil, - }, - }) - podNoEvict := test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}, - OwnerReferences: nil, - }, - }) - - ExpectApplied(ctx, env.Client, node, podEvict, podNoEvict) - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - - // Expect no pod to be enqueued for eviction - ExpectNotEnqueuedForEviction(evictionQueue, podEvict, podNoEvict) - - // Expect node to exist and be draining - ExpectNodeDraining(env.Client, node.Name) - - // Delete do-not-evict pod - ExpectDeleted(ctx, env.Client, podNoEvict) - - // Reconcile node to evict pod - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - - // Expect podEvict to be enqueued for eviction then be successful - ExpectEvicted(env.Client, podEvict) - - // Delete pod to simulate successful eviction - ExpectDeleted(ctx, env.Client, podEvict) - - // Reconcile to delete node - node = ExpectNodeExists(ctx, env.Client, node.Name) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - ExpectNotFound(ctx, env.Client, node) - }) It("should delete nodes with terminal pods", func() { podEvictPhaseSucceeded := test.Pod(test.PodOptions{ NodeName: node.Name, @@ -393,41 +211,6 @@ var _ = Describe("Termination", func() { ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) ExpectNotFound(ctx, env.Client, node) }) - It("should delete nodes that have do-not-evict on pods for which it does not apply", func() { - pods := []*v1.Pod{ - test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}, - OwnerReferences: defaultOwnerRefs, - }, - }), - test.Pod(test.PodOptions{ - NodeName: node.Name, - Tolerations: []v1.Toleration{{Operator: v1.TolerationOpExists}}, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }), - test.Pod(test.PodOptions{ - NodeName: node.Name, - ObjectMeta: metav1.ObjectMeta{OwnerReferences: defaultOwnerRefs}, - }), - } - ExpectApplied(ctx, env.Client, node) - for _, pod := range pods { - ExpectApplied(ctx, env.Client, pod) - } - // Trigger eviction - Expect(env.Client.Delete(ctx, node)).To(Succeed()) - for _, pod := range pods { - Expect(env.Client.Delete(ctx, pod, &client.DeleteOptions{GracePeriodSeconds: ptr.Int64(30)})).To(Succeed()) - } - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - node = ExpectNodeExists(ctx, env.Client, node.Name) - // Simulate stuck terminating - fakeClock.Step(2 * time.Minute) - ExpectReconcileSucceeded(ctx, terminationController, client.ObjectKeyFromObject(node)) - ExpectNotFound(ctx, env.Client, node) - }) It("should fail to evict pods that violate a PDB", func() { minAvailable := intstr.FromInt(1) labelSelector := map[string]string{test.RandomName(): test.RandomName()}