Skip to content

Commit

Permalink
feat: require and then remove karpenter startup taint (#1336)
Browse files Browse the repository at this point in the history
Co-authored-by: Nick Tran <10810510+njtran@users.noreply.github.com>
  • Loading branch information
rschalo and njtran authored Jul 3, 2024
1 parent 715c446 commit e40f4b0
Show file tree
Hide file tree
Showing 11 changed files with 74 additions and 22 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ delete: ## Delete the controller from your ~/.kube/config cluster
test: ## Run tests
go test ./pkg/... \
-race \
-timeout 15m \
-timeout 20m \
--ginkgo.focus="${FOCUS}" \
--ginkgo.randomize-all \
--ginkgo.v \
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ require (
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/prometheus/client_golang v1.19.1
github.com/prometheus/client_model v0.6.1
github.com/samber/lo v1.43.0
github.com/samber/lo v1.44.0
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
golang.org/x/sync v0.7.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,8 @@ github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFR
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/samber/lo v1.43.0 h1:ts0VhPi8+ZQZFVLv/2Vkgt2Cds05FM2v3Enmv+YMBtg=
github.com/samber/lo v1.43.0/go.mod h1:w7R6fO7h2lrnx/s0bWcZ55vXJI89p5UPM6+kyDL373E=
github.com/samber/lo v1.44.0 h1:5il56KxRE+GHsm1IR+sZ/6J42NODigFiqCWpSc2dybA=
github.com/samber/lo v1.44.0/go.mod h1:RmDH9Ct32Qy3gduHQuKJ3gW1fMHAnE/fAzQuf6He5cU=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/v1beta1/taints.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
const (
DisruptionTaintKey = apis.Group + "/disruption"
DisruptingNoScheduleTaintValue = "disrupting"
UnregisteredTaintKey = apis.Group + "/unregistered"
)

var (
Expand All @@ -36,6 +37,10 @@ var (
Effect: v1.TaintEffectNoSchedule,
Value: DisruptingNoScheduleTaintValue,
}
UnregisteredNoExecuteTaint = v1.Taint{
Key: UnregisteredTaintKey,
Effect: v1.TaintEffectNoExecute,
}
)

func IsDisruptingTaint(taint v1.Taint) bool {
Expand Down
2 changes: 1 addition & 1 deletion pkg/cloudprovider/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ func (its InstanceTypes) SatisfiesMinValues(requirements scheduling.Requirements
// Truncate truncates the InstanceTypes based on the passed-in requirements
// It returns an error if it isn't possible to truncate the instance types on maxItems without violating minValues
func (its InstanceTypes) Truncate(requirements scheduling.Requirements, maxItems int) (InstanceTypes, error) {
truncatedInstanceTypes := InstanceTypes(lo.Slice(its.OrderByPrice(requirements), 0, maxItems))
truncatedInstanceTypes := lo.Slice(its.OrderByPrice(requirements), 0, maxItems)
// Only check for a validity of NodeClaim if its requirement has minValues in it.
if requirements.HasMinValues() {
if _, err := truncatedInstanceTypes.SatisfiesMinValues(requirements); err != nil {
Expand Down
3 changes: 2 additions & 1 deletion pkg/controllers/nodeclaim/disruption/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ var _ = Describe("Disruption", func() {
Expect(nodeClaim.StatusConditions().Get(v1beta1.ConditionTypeEmpty).IsTrue()).To(BeTrue())
})
It("should remove multiple disruption conditions simultaneously", func() {
cp.Drifted = ""
nodePool.Spec.Disruption.ExpireAfter.Duration = nil
nodePool.Spec.Disruption.ConsolidateAfter = &v1beta1.NillableDuration{Duration: nil}

Expand All @@ -124,7 +125,7 @@ var _ = Describe("Disruption", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimDisruptionController, nodeClaim)

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(nodeClaim.StatusConditions().Get(v1beta1.ConditionTypeDrifted).IsTrue()).To(BeTrue())
Expect(nodeClaim.StatusConditions().Get(v1beta1.ConditionTypeDrifted)).To(BeNil())
Expect(nodeClaim.StatusConditions().Get(v1beta1.ConditionTypeEmpty)).To(BeNil())
})
})
19 changes: 16 additions & 3 deletions pkg/controllers/nodeclaim/lifecycle/initialization_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,12 @@ var _ = Describe("Initialization", func() {

node := test.Node(test.NodeOptions{
ProviderID: nodeClaim.Status.ProviderID,
Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint},
})
ExpectApplied(ctx, env.Client, node)
ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint

ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expect(ExpectStatusConditionExists(nodeClaim, v1beta1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionTrue))
Expand Down Expand Up @@ -122,10 +123,11 @@ var _ = Describe("Initialization", func() {
v1.ResourceMemory: resource.MustParse("80Mi"),
v1.ResourcePods: resource.MustParse("110"),
},
Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint},
})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint

ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)

node = ExpectExists(ctx, env.Client, node)
Expand Down Expand Up @@ -165,7 +167,9 @@ var _ = Describe("Initialization", func() {
v1.ResourcePods: resource.MustParse("110"),
},
ReadyStatus: v1.ConditionFalse,
Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint},
})
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)

Expand Down Expand Up @@ -213,10 +217,11 @@ var _ = Describe("Initialization", func() {
v1.ResourceMemory: resource.MustParse("80Mi"),
v1.ResourcePods: resource.MustParse("110"),
},
Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint},
})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint

ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)

nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
Expand Down Expand Up @@ -263,8 +268,10 @@ var _ = Describe("Initialization", func() {
v1.ResourceMemory: resource.MustParse("80Mi"),
v1.ResourcePods: resource.MustParse("110"),
},
Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint},
})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint

ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
Expand Down Expand Up @@ -330,8 +337,10 @@ var _ = Describe("Initialization", func() {
v1.ResourceMemory: resource.MustParse("80Mi"),
v1.ResourcePods: resource.MustParse("110"),
},
Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint},
})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint

// Should add the startup taints to the node
Expand Down Expand Up @@ -401,8 +410,10 @@ var _ = Describe("Initialization", func() {
v1.ResourceMemory: resource.MustParse("80Mi"),
v1.ResourcePods: resource.MustParse("110"),
},
Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint},
})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint

// Shouldn't consider the node ready since the startup taints still exist
Expand Down Expand Up @@ -480,6 +491,7 @@ var _ = Describe("Initialization", func() {
Effect: v1.TaintEffectNoSchedule,
Value: "true",
},
v1beta1.UnregisteredNoExecuteTaint,
},
})
ExpectApplied(ctx, env.Client, node)
Expand Down Expand Up @@ -549,6 +561,7 @@ var _ = Describe("Initialization", func() {
Effect: v1.TaintEffectNoSchedule,
Value: "true",
},
v1beta1.UnregisteredNoExecuteTaint,
},
})
ExpectApplied(ctx, env.Client, node)
Expand Down
18 changes: 15 additions & 3 deletions pkg/controllers/nodeclaim/lifecycle/registration.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,17 @@ func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1beta1.NodeCla
}
return reconcile.Result{}, fmt.Errorf("getting node for nodeclaim, %w", err)
}
_, hasStartupTaint := lo.Find(node.Spec.Taints, func(t v1.Taint) bool {
return t.MatchTaint(&v1beta1.UnregisteredNoExecuteTaint)
})
// check if sync succeeded but setting the registered status condition failed
// if sync succeeded, then the label will be present and the taint will be gone
if _, ok := node.Labels[v1beta1.NodeRegisteredLabelKey]; !ok && !hasStartupTaint {
return reconcile.Result{}, fmt.Errorf("missing required startup taint, %s", v1beta1.UnregisteredTaintKey)
}
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", node.Name)))
if err = r.syncNode(ctx, nodeClaim, node); err != nil {
return reconcile.Result{}, fmt.Errorf("syncing node, %w", err)
return reconcile.Result{}, err
}
log.FromContext(ctx).Info("registered nodeclaim")
nodeClaim.StatusConditions().SetTrue(v1beta1.ConditionTypeRegistered)
Expand All @@ -88,12 +96,16 @@ func (r *Registration) syncNode(ctx context.Context, nodeClaim *v1beta1.NodeClai
// Sync all taints inside NodeClaim into the Node taints
node.Spec.Taints = scheduling.Taints(node.Spec.Taints).Merge(nodeClaim.Spec.Taints)
node.Spec.Taints = scheduling.Taints(node.Spec.Taints).Merge(nodeClaim.Spec.StartupTaints)
// Remove karpenter.sh/unregistered taint
node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t v1.Taint, _ int) bool {
return t.MatchTaint(&v1beta1.UnregisteredNoExecuteTaint)
})
node.Labels = lo.Assign(node.Labels, nodeClaim.Labels, map[string]string{
v1beta1.NodeRegisteredLabelKey: "true",
})
if !equality.Semantic.DeepEqual(stored, node) {
if err := r.kubeClient.Patch(ctx, node, client.StrategicMergeFrom(stored)); err != nil {
return fmt.Errorf("syncing node labels, %w", err)
if err := r.kubeClient.Update(ctx, node); err != nil {
return fmt.Errorf("syncing node, %w", err)
}
}
return nil
Expand Down
35 changes: 26 additions & 9 deletions pkg/controllers/nodeclaim/lifecycle/registration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ var _ = Describe("Registration", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint}})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)

Expand All @@ -66,14 +66,14 @@ var _ = Describe("Registration", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint}})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)

node = ExpectExists(ctx, env.Client, node)
ExpectOwnerReferenceExists(node, nodeClaim)
})
It("should sync the karpenter.sh/registered label to the Node when the Node comes online", func() {
It("should sync the karpenter.sh/registered label to the Node and remove the karpenter.sh/unregistered taint when the Node comes online", func() {
nodeClaim := test.NodeClaim(v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
Expand All @@ -85,11 +85,28 @@ var _ = Describe("Registration", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint}})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
node = ExpectExists(ctx, env.Client, node)
Expect(node.Labels).To(HaveKeyWithValue(v1beta1.NodeRegisteredLabelKey, "true"))
Expect(node.Spec.Taints).To(Not(ContainElement(v1beta1.UnregisteredNoExecuteTaint)))
})
It("should fail registration if the karpenter.sh/unregistered taint is not present on the node and the node isn't labeled as registered", func() {
nodeClaim := test.NodeClaim(v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
v1beta1.NodePoolLabelKey: nodePool.Name,
},
},
})
ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
ExpectApplied(ctx, env.Client, node)
_ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim)
})
It("should sync the labels to the Node when the Node comes online", func() {
nodeClaim := test.NodeClaim(v1beta1.NodeClaim{
Expand All @@ -107,7 +124,7 @@ var _ = Describe("Registration", func() {
Expect(nodeClaim.Labels).To(HaveKeyWithValue("custom-label", "custom-value"))
Expect(nodeClaim.Labels).To(HaveKeyWithValue("other-custom-label", "other-custom-value"))

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint}})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
node = ExpectExists(ctx, env.Client, node)
Expand Down Expand Up @@ -135,7 +152,7 @@ var _ = Describe("Registration", func() {
Expect(nodeClaim.Annotations).To(HaveKeyWithValue(v1beta1.DoNotDisruptAnnotationKey, "true"))
Expect(nodeClaim.Annotations).To(HaveKeyWithValue("my-custom-annotation", "my-custom-value"))

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint}})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
node = ExpectExists(ctx, env.Client, node)
Expand Down Expand Up @@ -183,7 +200,7 @@ var _ = Describe("Registration", func() {
},
))

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint}})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
node = ExpectExists(ctx, env.Client, node)
Expand Down Expand Up @@ -251,7 +268,7 @@ var _ = Describe("Registration", func() {
},
))

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint}})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
node = ExpectExists(ctx, env.Client, node)
Expand Down Expand Up @@ -305,7 +322,7 @@ var _ = Describe("Registration", func() {
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)

node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID})
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []v1.Taint{v1beta1.UnregisteredNoExecuteTaint}})
ExpectApplied(ctx, env.Client, node)
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
node = ExpectExists(ctx, env.Client, node)
Expand Down
2 changes: 2 additions & 0 deletions pkg/test/expectations/expectations.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,7 @@ func ExpectNodeClaimDeployed(ctx context.Context, c client.Client, cloudProvider

// Mock the nodeclaim launch and node joining at the apiserver
node := test.NodeClaimLinkedNode(nc)
node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t v1.Taint, _ int) bool { return t.MatchTaint(&v1beta1.UnregisteredNoExecuteTaint) })
node.Labels = lo.Assign(node.Labels, map[string]string{v1beta1.NodeRegisteredLabelKey: "true"})
ExpectApplied(ctx, c, nc, node)
return nc, node, nil
Expand Down Expand Up @@ -408,6 +409,7 @@ func ExpectMakeNodesInitialized(ctx context.Context, c client.Client, nodes ...*
ExpectMakeNodesReady(ctx, c, nodes...)

for i := range nodes {
nodes[i].Spec.Taints = lo.Reject(nodes[i].Spec.Taints, func(t v1.Taint, _ int) bool { return t.MatchTaint(&v1beta1.UnregisteredNoExecuteTaint) })
nodes[i].Labels[v1beta1.NodeRegisteredLabelKey] = "true"
nodes[i].Labels[v1beta1.NodeInitializedLabelKey] = "true"
ExpectApplied(ctx, c, nodes[i])
Expand Down
4 changes: 3 additions & 1 deletion pkg/test/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func Node(overrides ...NodeOptions) *v1.Node {
}

func NodeClaimLinkedNode(nodeClaim *v1beta1.NodeClaim) *v1.Node {
return Node(
n := Node(
NodeOptions{
ObjectMeta: metav1.ObjectMeta{
Labels: nodeClaim.Labels,
Expand All @@ -81,4 +81,6 @@ func NodeClaimLinkedNode(nodeClaim *v1beta1.NodeClaim) *v1.Node {
ProviderID: nodeClaim.Status.ProviderID,
},
)
n.Spec.Taints = append(n.Spec.Taints, v1beta1.UnregisteredNoExecuteTaint)
return n
}

0 comments on commit e40f4b0

Please sign in to comment.