diff --git a/charts/karpenter-core/values.yaml b/charts/karpenter-core/values.yaml index c912f5aa19..c4a24dce36 100644 --- a/charts/karpenter-core/values.yaml +++ b/charts/karpenter-core/values.yaml @@ -138,11 +138,5 @@ settings: # Setting driftEnabled to true enables the drift deprovisioner to watch for drift between currently deployed nodes # and the desired state of nodes set in provisioners and node templates driftEnabled: false - # -- ttlAfterNotRegistered is in ALPHA and is planned to be removed in the future, pending design for handling nodes failing launch. - # The maximum length of time to wait for the machine to register to the cluster before terminating the machine. - # Generally, the default of 15m should be sufficient but raising or lowering this may be necessary depending - # on how slowly or quickly nodes join the cluster. - # Setting this value to an empty string disables the ttlAfterNotRegistered deprovisioning - ttlAfterNotRegistered: 15m diff --git a/pkg/apis/settings/settings.go b/pkg/apis/settings/settings.go index 806f696994..ecc2da8823 100644 --- a/pkg/apis/settings/settings.go +++ b/pkg/apis/settings/settings.go @@ -30,17 +30,15 @@ type settingsKeyType struct{} var ContextKey = settingsKeyType{} var defaultSettings = &Settings{ - BatchMaxDuration: &metav1.Duration{Duration: time.Second * 10}, - BatchIdleDuration: &metav1.Duration{Duration: time.Second * 1}, - TTLAfterNotRegistered: &metav1.Duration{Duration: time.Minute * 15}, - DriftEnabled: false, + BatchMaxDuration: &metav1.Duration{Duration: time.Second * 10}, + BatchIdleDuration: &metav1.Duration{Duration: time.Second * 1}, + DriftEnabled: false, } // +k8s:deepcopy-gen=true type Settings struct { - BatchMaxDuration *metav1.Duration - BatchIdleDuration *metav1.Duration - TTLAfterNotRegistered *metav1.Duration + BatchMaxDuration *metav1.Duration + BatchIdleDuration *metav1.Duration // This feature flag is temporary and will be removed in the near future. DriftEnabled bool } @@ -56,7 +54,6 @@ func (*Settings) Inject(ctx context.Context, cm *v1.ConfigMap) (context.Context, if err := configmap.Parse(cm.Data, AsMetaDuration("batchMaxDuration", &s.BatchMaxDuration), AsMetaDuration("batchIdleDuration", &s.BatchIdleDuration), - AsMetaDuration("ttlAfterNotRegistered", &s.TTLAfterNotRegistered), configmap.AsBool("featureGates.driftEnabled", &s.DriftEnabled), ); err != nil { return ctx, fmt.Errorf("parsing settings, %w", err) @@ -78,9 +75,6 @@ func (in *Settings) Validate() (err error) { } else if in.BatchIdleDuration.Duration <= 0 { err = multierr.Append(err, fmt.Errorf("batchIdleDuration cannot be negative")) } - if in.TTLAfterNotRegistered != nil && in.TTLAfterNotRegistered.Duration <= 0 { - err = multierr.Append(err, fmt.Errorf("ttlAfterNotRegistered cannot be negative")) - } return err } diff --git a/pkg/apis/settings/suite_test.go b/pkg/apis/settings/suite_test.go index 90db24bbd0..4ac4659bae 100644 --- a/pkg/apis/settings/suite_test.go +++ b/pkg/apis/settings/suite_test.go @@ -46,7 +46,6 @@ var _ = Describe("Validation", func() { Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 10)) Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second)) Expect(s.DriftEnabled).To(BeFalse()) - Expect(s.TTLAfterNotRegistered.Duration).To(Equal(time.Minute * 15)) }) It("should succeed to set custom values", func() { cm := &v1.ConfigMap{ @@ -54,7 +53,6 @@ var _ = Describe("Validation", func() { "batchMaxDuration": "30s", "batchIdleDuration": "5s", "featureGates.driftEnabled": "true", - "ttlAfterNotRegistered": "30m", }, } ctx, err := (&settings.Settings{}).Inject(ctx, cm) @@ -63,24 +61,6 @@ var _ = Describe("Validation", func() { Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 30)) Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second * 5)) Expect(s.DriftEnabled).To(BeTrue()) - Expect(s.TTLAfterNotRegistered.Duration).To(Equal(time.Minute * 30)) - }) - It("should succeed to disable ttlAfterNotRegistered", func() { - cm := &v1.ConfigMap{ - Data: map[string]string{ - "batchMaxDuration": "30s", - "batchIdleDuration": "5s", - "featureGates.driftEnabled": "true", - "ttlAfterNotRegistered": "", - }, - } - ctx, err := (&settings.Settings{}).Inject(ctx, cm) - Expect(err).ToNot(HaveOccurred()) - s := settings.FromContext(ctx) - Expect(s.BatchMaxDuration.Duration).To(Equal(time.Second * 30)) - Expect(s.BatchIdleDuration.Duration).To(Equal(time.Second * 5)) - Expect(s.DriftEnabled).To(BeTrue()) - Expect(s.TTLAfterNotRegistered).To(BeNil()) }) It("should fail validation when batchMaxDuration is negative", func() { cm := &v1.ConfigMap{ @@ -127,13 +107,4 @@ var _ = Describe("Validation", func() { _, err := (&settings.Settings{}).Inject(ctx, cm) Expect(err).To(HaveOccurred()) }) - It("should fail validation when ttlAfterNotRegistered is negative", func() { - cm := &v1.ConfigMap{ - Data: map[string]string{ - "ttlAfterNotRegistered": "-10s", - }, - } - _, err := (&settings.Settings{}).Inject(ctx, cm) - Expect(err).To(HaveOccurred()) - }) }) diff --git a/pkg/apis/settings/zz_generated.deepcopy.go b/pkg/apis/settings/zz_generated.deepcopy.go index f14fd5cad1..d73c49f129 100644 --- a/pkg/apis/settings/zz_generated.deepcopy.go +++ b/pkg/apis/settings/zz_generated.deepcopy.go @@ -36,11 +36,6 @@ func (in *Settings) DeepCopyInto(out *Settings) { *out = new(v1.Duration) **out = **in } - if in.TTLAfterNotRegistered != nil { - in, out := &in.TTLAfterNotRegistered, &out.TTLAfterNotRegistered - *out = new(v1.Duration) - **out = **in - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Settings. diff --git a/pkg/controllers/machine/liveness.go b/pkg/controllers/machine/liveness.go index 59a387d3a9..fbf9c9999c 100644 --- a/pkg/controllers/machine/liveness.go +++ b/pkg/controllers/machine/liveness.go @@ -25,7 +25,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/aws/karpenter-core/pkg/apis/settings" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/aws/karpenter-core/pkg/metrics" "github.com/aws/karpenter-core/pkg/utils/result" @@ -40,6 +39,10 @@ type Liveness struct { // If we don't succeed within this time, then we should delete and try again through some other mechanism const launchTTL = time.Minute * 2 +// registrationTTL is a heuristic time that we expect the node to register within +// If we don't see the node within this time, then we should delete the machine and try again +const registrationTTL = time.Minute * 15 + func (l *Liveness) Reconcile(ctx context.Context, machine *v1alpha5.Machine) (reconcile.Result, error) { creationRes, creationErr := l.launchTTL(ctx, machine) registrationRes, registrationErr := l.registrationTTL(ctx, machine) @@ -57,7 +60,7 @@ func (l *Liveness) launchTTL(ctx context.Context, machine *v1alpha5.Machine) (re if err := l.kubeClient.Delete(ctx, machine); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } - logging.FromContext(ctx).With("ttl", time.Minute).Debugf("terminating machine since node hasn't created within creation ttl") + logging.FromContext(ctx).With("ttl", launchTTL).Debugf("terminating machine since node hasn't created within creation ttl") metrics.MachinesTerminatedCounter.With(prometheus.Labels{ metrics.ReasonLabel: "machine_creation_timeout", metrics.ProvisionerLabel: machine.Labels[v1alpha5.ProvisionerNameLabelKey], @@ -66,20 +69,17 @@ func (l *Liveness) launchTTL(ctx context.Context, machine *v1alpha5.Machine) (re } func (l *Liveness) registrationTTL(ctx context.Context, machine *v1alpha5.Machine) (reconcile.Result, error) { - if settings.FromContext(ctx).TTLAfterNotRegistered == nil { - return reconcile.Result{}, nil - } if machine.StatusConditions().GetCondition(v1alpha5.MachineRegistered).IsTrue() { return reconcile.Result{}, nil } - if machine.CreationTimestamp.IsZero() || l.clock.Since(machine.CreationTimestamp.Time) < settings.FromContext(ctx).TTLAfterNotRegistered.Duration { - return reconcile.Result{RequeueAfter: settings.FromContext(ctx).TTLAfterNotRegistered.Duration - l.clock.Since(machine.CreationTimestamp.Time)}, nil + if machine.CreationTimestamp.IsZero() || l.clock.Since(machine.CreationTimestamp.Time) < registrationTTL { + return reconcile.Result{RequeueAfter: registrationTTL - l.clock.Since(machine.CreationTimestamp.Time)}, nil } // Delete the machine if we believe the machine won't register since we haven't seen the node if err := l.kubeClient.Delete(ctx, machine); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } - logging.FromContext(ctx).With("ttl", settings.FromContext(ctx).TTLAfterNotRegistered.Duration).Debugf("terminating machine since node hasn't registered within registration ttl") + logging.FromContext(ctx).With("ttl", registrationTTL).Debugf("terminating machine since node hasn't registered within registration ttl") metrics.MachinesTerminatedCounter.With(prometheus.Labels{ metrics.ReasonLabel: "machine_registration_timeout", metrics.ProvisionerLabel: machine.Labels[v1alpha5.ProvisionerNameLabelKey], diff --git a/pkg/controllers/machine/liveness_test.go b/pkg/controllers/machine/liveness_test.go index 6a1c41a81b..d367e8ba2f 100644 --- a/pkg/controllers/machine/liveness_test.go +++ b/pkg/controllers/machine/liveness_test.go @@ -22,7 +22,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" - "github.com/aws/karpenter-core/pkg/apis/settings" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/aws/karpenter-core/pkg/cloudprovider/fake" "github.com/aws/karpenter-core/pkg/test" @@ -97,35 +96,4 @@ var _ = Describe("Liveness", func() { ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine)) // Reconcile again to handle termination flow ExpectNotFound(ctx, env.Client, machine) }) - It("should not delete the Machine when the Node hasn't registered to the Machine past the registration TTL if ttlAfterNotRegistered is disabled", func() { - s := test.Settings() - s.TTLAfterNotRegistered = nil - ctx = settings.ToContext(ctx, s) - machine := test.Machine(v1alpha5.Machine{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - v1alpha5.ProvisionerNameLabelKey: provisioner.Name, - }, - }, - Spec: v1alpha5.MachineSpec{ - Resources: v1alpha5.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse("2"), - v1.ResourceMemory: resource.MustParse("50Mi"), - v1.ResourcePods: resource.MustParse("5"), - fake.ResourceGPUVendorA: resource.MustParse("1"), - }, - }, - }, - }) - ExpectApplied(ctx, env.Client, provisioner, machine) - ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine)) - machine = ExpectExists(ctx, env.Client, machine) - - // If the node hasn't registered in the liveness timeframe, then we deprovision the Machine - fakeClock.Step(time.Minute * 20) - ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine)) - ExpectReconcileSucceeded(ctx, machineController, client.ObjectKeyFromObject(machine)) // Reconcile again to handle termination flow - ExpectExists(ctx, env.Client, machine) - }) }) diff --git a/pkg/test/settings.go b/pkg/test/settings.go index 94101645cc..21f7696e79 100644 --- a/pkg/test/settings.go +++ b/pkg/test/settings.go @@ -16,7 +16,6 @@ package test import ( "fmt" - "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -38,13 +37,9 @@ func Settings(overrides ...settings.Settings) *settings.Settings { if options.BatchIdleDuration == nil { options.BatchIdleDuration = &metav1.Duration{} } - if options.TTLAfterNotRegistered == nil { - options.TTLAfterNotRegistered = &metav1.Duration{Duration: time.Minute * 15} - } return &settings.Settings{ - BatchMaxDuration: options.BatchMaxDuration, - BatchIdleDuration: options.BatchIdleDuration, - TTLAfterNotRegistered: options.TTLAfterNotRegistered, - DriftEnabled: options.DriftEnabled, + BatchMaxDuration: options.BatchMaxDuration, + BatchIdleDuration: options.BatchIdleDuration, + DriftEnabled: options.DriftEnabled, } }