Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: Remove ttlAfterNotRegistered value from settings #250

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions charts/karpenter-core/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,5 @@ settings:
# Setting driftEnabled to true enables the drift deprovisioner to watch for drift between currently deployed nodes
# and the desired state of nodes set in provisioners and node templates
driftEnabled: false
# -- ttlAfterNotRegistered is in ALPHA and is planned to be removed in the future, pending a design for handling nodes that fail to launch.
# The maximum length of time to wait for the machine to register to the cluster before terminating the machine.
# Generally, the default of 15m should be sufficient but raising or lowering this may be necessary depending
# on how slowly or quickly nodes join the cluster.
# Setting this value to an empty string disables the ttlAfterNotRegistered deprovisioning
ttlAfterNotRegistered: 15m


16 changes: 5 additions & 11 deletions pkg/apis/settings/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,15 @@ type settingsKeyType struct{}
var ContextKey = settingsKeyType{}

var defaultSettings = &Settings{
BatchMaxDuration: &metav1.Duration{Duration: time.Second * 10},
BatchIdleDuration: &metav1.Duration{Duration: time.Second * 1},
TTLAfterNotRegistered: &metav1.Duration{Duration: time.Minute * 15},
DriftEnabled: false,
BatchMaxDuration: &metav1.Duration{Duration: time.Second * 10},
BatchIdleDuration: &metav1.Duration{Duration: time.Second * 1},
DriftEnabled: false,
}

// +k8s:deepcopy-gen=true
type Settings struct {
BatchMaxDuration *metav1.Duration
BatchIdleDuration *metav1.Duration
TTLAfterNotRegistered *metav1.Duration
BatchMaxDuration *metav1.Duration
BatchIdleDuration *metav1.Duration
// This feature flag is temporary and will be removed in the near future.
DriftEnabled bool
}
Expand All @@ -56,7 +54,6 @@ func (*Settings) Inject(ctx context.Context, cm *v1.ConfigMap) (context.Context,
if err := configmap.Parse(cm.Data,
AsMetaDuration("batchMaxDuration", &s.BatchMaxDuration),
AsMetaDuration("batchIdleDuration", &s.BatchIdleDuration),
AsMetaDuration("ttlAfterNotRegistered", &s.TTLAfterNotRegistered),
configmap.AsBool("featureGates.driftEnabled", &s.DriftEnabled),
); err != nil {
return ctx, fmt.Errorf("parsing settings, %w", err)
Expand All @@ -78,9 +75,6 @@ func (in *Settings) Validate() (err error) {
} else if in.BatchIdleDuration.Duration <= 0 {
err = multierr.Append(err, fmt.Errorf("batchIdleDuration cannot be negative"))
}
if in.TTLAfterNotRegistered != nil && in.TTLAfterNotRegistered.Duration <= 0 {
err = multierr.Append(err, fmt.Errorf("ttlAfterNotRegistered cannot be negative"))
}
return err
}

Expand Down
29 changes: 0 additions & 29 deletions pkg/apis/settings/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,13 @@ var _ = Describe("Validation", func() {
Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 10))
Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second))
Expect(s.DriftEnabled).To(BeFalse())
Expect(s.TTLAfterNotRegistered.Duration).To(Equal(time.Minute * 15))
})
It("should succeed to set custom values", func() {
cm := &v1.ConfigMap{
Data: map[string]string{
"batchMaxDuration": "30s",
"batchIdleDuration": "5s",
"featureGates.driftEnabled": "true",
"ttlAfterNotRegistered": "30m",
},
}
ctx, err := (&settings.Settings{}).Inject(ctx, cm)
Expand All @@ -63,24 +61,6 @@ var _ = Describe("Validation", func() {
Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 30))
Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second * 5))
Expect(s.DriftEnabled).To(BeTrue())
Expect(s.TTLAfterNotRegistered.Duration).To(Equal(time.Minute * 30))
})
It("should succeed to disable ttlAfterNotRegistered", func() {
cm := &v1.ConfigMap{
Data: map[string]string{
"batchMaxDuration": "30s",
"batchIdleDuration": "5s",
"featureGates.driftEnabled": "true",
"ttlAfterNotRegistered": "",
},
}
ctx, err := (&settings.Settings{}).Inject(ctx, cm)
Expect(err).ToNot(HaveOccurred())
s := settings.FromContext(ctx)
Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 30))
Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second * 5))
Expect(s.DriftEnabled).To(BeTrue())
Expect(s.TTLAfterNotRegistered).To(BeNil())
})
It("should fail validation when batchMaxDuration is negative", func() {
cm := &v1.ConfigMap{
Expand Down Expand Up @@ -127,13 +107,4 @@ var _ = Describe("Validation", func() {
_, err := (&settings.Settings{}).Inject(ctx, cm)
Expect(err).To(HaveOccurred())
})
It("should fail validation when ttlAfterNotRegistered is negative", func() {
cm := &v1.ConfigMap{
Data: map[string]string{
"ttlAfterNotRegistered": "-10s",
},
}
_, err := (&settings.Settings{}).Inject(ctx, cm)
Expect(err).To(HaveOccurred())
})
})
5 changes: 0 additions & 5 deletions pkg/apis/settings/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions pkg/controllers/machine/liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/aws/karpenter-core/pkg/apis/settings"
"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/metrics"
"github.com/aws/karpenter-core/pkg/utils/result"
Expand All @@ -40,6 +39,10 @@ type Liveness struct {
// If we don't succeed within this time, then we should delete the machine and try again through some other mechanism
const launchTTL = time.Minute * 2

// registrationTTL is a heuristic time that we expect the node to register within
// If we don't see the node within this time, then we should delete the machine and try again
const registrationTTL = time.Minute * 15

func (l *Liveness) Reconcile(ctx context.Context, machine *v1alpha5.Machine) (reconcile.Result, error) {
creationRes, creationErr := l.launchTTL(ctx, machine)
registrationRes, registrationErr := l.registrationTTL(ctx, machine)
Expand All @@ -57,7 +60,7 @@ func (l *Liveness) launchTTL(ctx context.Context, machine *v1alpha5.Machine) (re
if err := l.kubeClient.Delete(ctx, machine); err != nil {
return reconcile.Result{}, client.IgnoreNotFound(err)
}
logging.FromContext(ctx).With("ttl", time.Minute).Debugf("terminating machine since node hasn't created within creation ttl")
logging.FromContext(ctx).With("ttl", launchTTL).Debugf("terminating machine since node hasn't created within creation ttl")
metrics.MachinesTerminatedCounter.With(prometheus.Labels{
metrics.ReasonLabel: "machine_creation_timeout",
metrics.ProvisionerLabel: machine.Labels[v1alpha5.ProvisionerNameLabelKey],
Expand All @@ -66,20 +69,17 @@ func (l *Liveness) launchTTL(ctx context.Context, machine *v1alpha5.Machine) (re
}

func (l *Liveness) registrationTTL(ctx context.Context, machine *v1alpha5.Machine) (reconcile.Result, error) {
if settings.FromContext(ctx).TTLAfterNotRegistered == nil {
return reconcile.Result{}, nil
}
if machine.StatusConditions().GetCondition(v1alpha5.MachineRegistered).IsTrue() {
return reconcile.Result{}, nil
}
if machine.CreationTimestamp.IsZero() || l.clock.Since(machine.CreationTimestamp.Time) < settings.FromContext(ctx).TTLAfterNotRegistered.Duration {
return reconcile.Result{RequeueAfter: settings.FromContext(ctx).TTLAfterNotRegistered.Duration - l.clock.Since(machine.CreationTimestamp.Time)}, nil
if machine.CreationTimestamp.IsZero() || l.clock.Since(machine.CreationTimestamp.Time) < registrationTTL {
return reconcile.Result{RequeueAfter: registrationTTL - l.clock.Since(machine.CreationTimestamp.Time)}, nil
}
// Delete the machine if we believe it won't register, since we haven't seen the node within the registration TTL
if err := l.kubeClient.Delete(ctx, machine); err != nil {
return reconcile.Result{}, client.IgnoreNotFound(err)
}
logging.FromContext(ctx).With("ttl", settings.FromContext(ctx).TTLAfterNotRegistered.Duration).Debugf("terminating machine since node hasn't registered within registration ttl")
logging.FromContext(ctx).With("ttl", registrationTTL).Debugf("terminating machine since node hasn't registered within registration ttl")
metrics.MachinesTerminatedCounter.With(prometheus.Labels{
metrics.ReasonLabel: "machine_registration_timeout",
metrics.ProvisionerLabel: machine.Labels[v1alpha5.ProvisionerNameLabelKey],
Expand Down
32 changes: 0 additions & 32 deletions pkg/controllers/machine/liveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/aws/karpenter-core/pkg/apis/settings"
"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/cloudprovider/fake"
"github.com/aws/karpenter-core/pkg/test"
Expand Down Expand Up @@ -97,35 +96,4 @@ var _ = Describe("Liveness", func() {
ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine)) // Reconcile again to handle termination flow
ExpectNotFound(ctx, env.Client, machine)
})
It("should not delete the Machine when the Node hasn't registered to the Machine past the registration TTL if ttlAfterNotRegistered is disabled", func() {
s := test.Settings()
s.TTLAfterNotRegistered = nil
ctx = settings.ToContext(ctx, s)
machine := test.Machine(v1alpha5.Machine{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
v1alpha5.ProvisionerNameLabelKey: provisioner.Name,
},
},
Spec: v1alpha5.MachineSpec{
Resources: v1alpha5.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("2"),
v1.ResourceMemory: resource.MustParse("50Mi"),
v1.ResourcePods: resource.MustParse("5"),
fake.ResourceGPUVendorA: resource.MustParse("1"),
},
},
},
})
ExpectApplied(ctx, env.Client, provisioner, machine)
ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine))
machine = ExpectExists(ctx, env.Client, machine)

// If the node hasn't registered in the liveness timeframe, then we deprovision the Machine
fakeClock.Step(time.Minute * 20)
ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine))
ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine)) // Reconcile again to handle termination flow
ExpectExists(ctx, env.Client, machine)
})
})
11 changes: 3 additions & 8 deletions pkg/test/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package test

import (
"fmt"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

Expand All @@ -38,13 +37,9 @@ func Settings(overrides ...settings.Settings) *settings.Settings {
if options.BatchIdleDuration == nil {
options.BatchIdleDuration = &metav1.Duration{}
}
if options.TTLAfterNotRegistered == nil {
options.TTLAfterNotRegistered = &metav1.Duration{Duration: time.Minute * 15}
}
return &settings.Settings{
BatchMaxDuration: options.BatchMaxDuration,
BatchIdleDuration: options.BatchIdleDuration,
TTLAfterNotRegistered: options.TTLAfterNotRegistered,
DriftEnabled: options.DriftEnabled,
BatchMaxDuration: options.BatchMaxDuration,
BatchIdleDuration: options.BatchIdleDuration,
DriftEnabled: options.DriftEnabled,
}
}