chore: add logging if deprovisioners take longer than expected (#288)
tzneal committed Apr 11, 2023
1 parent 913d0a2 commit 0a15776
Showing 2 changed files with 28 additions and 0 deletions.
27 changes: 27 additions & 0 deletions pkg/controllers/deprovisioning/controller.go
@@ -53,6 +53,8 @@ type Controller struct {
	clock          clock.Clock
	cloudProvider  cloudprovider.CloudProvider
	deprovisioners []Deprovisioner
	mu             sync.Mutex
	lastRun        map[string]time.Time
}

// pollingPeriod that we inspect cluster to look for opportunities to deprovision
@@ -81,6 +83,7 @@ func NewController(clk clock.Clock, kubeClient client.Client, provisioner *provi
		provisioner:   provisioner,
		recorder:      recorder,
		cloudProvider: cp,
		lastRun:       map[string]time.Time{},
		deprovisioners: []Deprovisioner{
			// Expire any machines that must be deleted, allowing their pods to potentially land on currently
			NewExpiration(clk, kubeClient, cluster, provisioner, recorder),
@@ -107,15 +110,22 @@ func (c *Controller) Builder(_ context.Context, m manager.Manager) controller.Bu
}

func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
	// this won't catch if the reconcile loop hangs forever but it will catch other issues
	c.logAbnormalRuns(ctx)
	defer c.logAbnormalRuns(ctx)
	c.recordRun("deprovisioning-loop")

	// We need to ensure that our internal cluster state mechanism is synced before we proceed
	// with making any scheduling decision off of our state nodes. Otherwise, we have the potential to make
	// a scheduling decision based on a smaller subset of nodes in our cluster state than actually exist.
	if !c.cluster.Synced(ctx) {
		logging.FromContext(ctx).Debugf("waiting on cluster sync")
		return reconcile.Result{RequeueAfter: time.Second}, nil
	}
	// Attempt different deprovisioning methods. We'll only let one method perform an action
	isConsolidated := c.cluster.Consolidated()
	for _, d := range c.deprovisioners {
		c.recordRun(fmt.Sprintf("%T", d))
		success, err := c.deprovision(ctx, d)
		if err != nil {
			return reconcile.Result{}, fmt.Errorf("deprovisioning via %q, %w", d, err)
@@ -313,3 +323,20 @@ func (c *Controller) setNodesUnschedulable(ctx context.Context, isUnschedulable
	}
	return multiErr
}

func (c *Controller) recordRun(s string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.lastRun[s] = c.clock.Now()
}

func (c *Controller) logAbnormalRuns(ctx context.Context) {
	const AbnormalTimeLimit = 15 * time.Minute
	c.mu.Lock()
	defer c.mu.Unlock()
	for name, runTime := range c.lastRun {
		if timeSince := c.clock.Since(runTime); timeSince > AbnormalTimeLimit {
			logging.FromContext(ctx).Debugf("abnormal time between runs of %s = %s", name, timeSince)
		}
	}
}
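
The two helpers above act as a simple watchdog: recordRun stamps the controller's clock for each named step, and logAbnormalRuns reports any step whose gap since its last stamp exceeds a limit. As a rough illustration of that bookkeeping outside the controller, here is a minimal, self-contained sketch using only the standard library; the runTracker type, the millisecond-scale limit, and the printed message are illustrative stand-ins, not part of this commit.

package main

import (
	"fmt"
	"sync"
	"time"
)

// runTracker is a hypothetical, trimmed-down version of the controller's lastRun
// bookkeeping: record when each named step last ran, and report any step whose
// gap since that run exceeds a limit.
type runTracker struct {
	mu      sync.Mutex
	lastRun map[string]time.Time
}

func newRunTracker() *runTracker {
	return &runTracker{lastRun: map[string]time.Time{}}
}

func (r *runTracker) recordRun(name string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.lastRun[name] = time.Now()
}

func (r *runTracker) logAbnormalRuns(limit time.Duration) {
	r.mu.Lock()
	defer r.mu.Unlock()
	for name, runTime := range r.lastRun {
		if since := time.Since(runTime); since > limit {
			fmt.Printf("abnormal time between runs of %s = %s\n", name, since)
		}
	}
}

func main() {
	tracker := newRunTracker()
	tracker.recordRun("deprovisioning-loop")
	time.Sleep(20 * time.Millisecond)
	// With a 10ms limit, the sleep above makes the gap "abnormal" and gets reported.
	tracker.logAbnormalRuns(10 * time.Millisecond)
}

In the controller itself the limit is 15 minutes and the check runs at the start and end of every reconcile pass, so a deprovisioner that has not run recently shows up in the debug logs.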
1 change: 1 addition & 0 deletions pkg/controllers/provisioning/provisioner.go
@@ -119,6 +119,7 @@ func (p *Provisioner) Reconcile(ctx context.Context, _ reconcile.Request) (resul
	// with making any scheduling decision off of our state nodes. Otherwise, we have the potential to make
	// a scheduling decision based on a smaller subset of nodes in our cluster state than actually exist.
	if !p.cluster.Synced(ctx) {
		logging.FromContext(ctx).Debugf("waiting on cluster sync")
		return reconcile.Result{RequeueAfter: time.Second}, nil
	}
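
Both Reconcile methods now follow the same guard: if the in-memory cluster state has not yet caught up, emit a debug line and requeue after a second rather than acting on a partial view of the nodes. A generic sketch of that shape is below, under assumed names (syncChecker, waitingReconciler, and debugf are illustrative, not from the repository); only the controller-runtime reconcile types are real.

package controllers

import (
	"context"
	"time"

	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

// syncChecker is a hypothetical interface standing in for the cluster-state
// tracker that both controllers consult before acting.
type syncChecker interface {
	Synced(ctx context.Context) bool
}

// waitingReconciler sketches the shared pattern: if internal state has not
// caught up with the cluster yet, say so at debug level and try again shortly.
type waitingReconciler struct {
	cluster syncChecker
	debugf  func(format string, args ...interface{}) // stand-in for logging.FromContext(ctx).Debugf
}

func (r *waitingReconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
	if !r.cluster.Synced(ctx) {
		r.debugf("waiting on cluster sync")
		return reconcile.Result{RequeueAfter: time.Second}, nil
	}
	// ... proceed with decisions that depend on a complete node view ...
	return reconcile.Result{}, nil
}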
