Skip to content

Commit

Permalink
Power off nodes upon deletion
Browse files Browse the repository at this point in the history
We introduce a new step in the state machine where the node goes through
a power-off stage before it's deleted. We attempt to power it off 3
times before giving up and proceeding with the deletion.
  • Loading branch information
honza committed Jun 9, 2023
1 parent aa8db79 commit 6f65d8e
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 15 deletions.
4 changes: 4 additions & 0 deletions apis/metal3.io/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,10 @@ const (
// learn about the hardware components available there
StateInspecting ProvisioningState = "inspecting"

// StatePoweringOffBeforeDelete means we are in the process of
// powering off the node before it's deleted.
StatePoweringOffBeforeDelete ProvisioningState = "powering off before delete"

// StateDeleting means we are in the process of cleaning up the host
// ready for deletion
StateDeleting ProvisioningState = "deleting"
Expand Down
25 changes: 25 additions & 0 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,31 @@ func setErrorMessage(host *metal3api.BareMetalHost, errType metal3api.ErrorType,
host.Status.ErrorCount++
}

// actionPowerOffBeforeDeleting asks the provisioner to hard power off the
// host as the step that precedes deletion.
//
// It returns:
//   - actionError when the PowerOff call itself returns an error;
//   - the result of recordActionFailure (with PowerManagementError) when the
//     provisioner result carries an error message;
//   - actionContinue with the provisioner's requeue delay while the result is
//     dirty, wrapped in actionUpdate when clearError returns true;
//   - actionComplete otherwise.
func (r *BareMetalHostReconciler) actionPowerOffBeforeDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult {
	info.log.Info("host ready to be powered off")

	// The second PowerOff argument is true when the error type currently
	// recorded on the host is a power management error.
	lastErrorWasPower := info.host.Status.ErrorType == metal3api.PowerManagementError
	powerOff, err := prov.PowerOff(metal3api.RebootModeHard, lastErrorWasPower)

	switch {
	case err != nil:
		return actionError{errors.Wrap(err, "failed to power off before deleting node")}
	case powerOff.ErrorMessage != "":
		return recordActionFailure(info, metal3api.PowerManagementError, powerOff.ErrorMessage)
	case !powerOff.Dirty:
		return actionComplete{}
	}

	// The provisioner result is dirty: come back after the requested delay,
	// saving the host first if clearError changed it.
	requeue := actionContinue{powerOff.RequeueAfter}
	if clearError(info.host) {
		return actionUpdate{requeue}
	}
	return requeue
}

// Manage deletion of the host
func (r *BareMetalHostReconciler) actionDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult {
info.log.Info(
Expand Down
66 changes: 51 additions & 15 deletions controllers/metal3.io/host_state_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,20 @@ type stateHandler func(*reconcileInfo) actionResult

// handlers returns the dispatch table that maps each provisioning state to
// the state machine method that handles it. StateAvailable and StateReady
// share the same handler (handleAvailable).
func (hsm *hostStateMachine) handlers() map[metal3api.ProvisioningState]stateHandler {
return map[metal3api.ProvisioningState]stateHandler{
metal3api.StateNone: hsm.handleNone,
metal3api.StateUnmanaged: hsm.handleUnmanaged,
metal3api.StateRegistering: hsm.handleRegistering,
metal3api.StateInspecting: hsm.handleInspecting,
metal3api.StateExternallyProvisioned: hsm.handleExternallyProvisioned,
metal3api.StateMatchProfile: hsm.handleMatchProfile, // Backward compatibility, remove eventually
metal3api.StatePreparing: hsm.handlePreparing,
metal3api.StateAvailable: hsm.handleAvailable,
metal3api.StateReady: hsm.handleAvailable,
metal3api.StateProvisioning: hsm.handleProvisioning,
metal3api.StateProvisioned: hsm.handleProvisioned,
metal3api.StateDeprovisioning: hsm.handleDeprovisioning,
metal3api.StateDeleting: hsm.handleDeleting,
metal3api.StateNone: hsm.handleNone,
metal3api.StateUnmanaged: hsm.handleUnmanaged,
metal3api.StateRegistering: hsm.handleRegistering,
metal3api.StateInspecting: hsm.handleInspecting,
metal3api.StateExternallyProvisioned: hsm.handleExternallyProvisioned,
metal3api.StateMatchProfile: hsm.handleMatchProfile, // Backward compatibility, remove eventually
metal3api.StatePreparing: hsm.handlePreparing,
metal3api.StateAvailable: hsm.handleAvailable,
metal3api.StateReady: hsm.handleAvailable,
metal3api.StateProvisioning: hsm.handleProvisioning,
metal3api.StateProvisioned: hsm.handleProvisioned,
metal3api.StateDeprovisioning: hsm.handleDeprovisioning,
metal3api.StatePoweringOffBeforeDelete: hsm.handlePoweringOffBeforeDelete,
metal3api.StateDeleting: hsm.handleDeleting,
}
}

Expand Down Expand Up @@ -223,14 +224,15 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool {

switch hsm.NextState {
default:
hsm.NextState = metal3api.StateDeleting
hsm.NextState = metal3api.StatePoweringOffBeforeDelete
case metal3api.StateProvisioning, metal3api.StateProvisioned:
if hsm.Host.OperationalStatus() == metal3api.OperationalStatusDetached {
if delayDeleteForDetachedHost(hsm.Host) {
log.Info("Delaying detached host deletion")
deleteDelayedForDetached.Inc()
return false
}
// We cannot power off a detached host. Skip to delete.
hsm.NextState = metal3api.StateDeleting
} else {
hsm.NextState = metal3api.StateDeprovisioning
Expand All @@ -241,6 +243,9 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool {
case metal3api.StateDeleting:
// Already in deleting state. Allow state machine to run.
return false
case metal3api.StatePoweringOffBeforeDelete:
// Already in powering off state. Allow state machine to run.
return false
}
return true
}
Expand Down Expand Up @@ -322,7 +327,7 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio
case metal3api.StateMatchProfile:
// Backward compatibility, remove eventually
return
case metal3api.StateDeleting:
case metal3api.StateDeleting, metal3api.StatePoweringOffBeforeDelete:
// In the deleting state the whole idea is to de-register the host
return
case metal3api.StateRegistering:
Expand Down Expand Up @@ -561,6 +566,37 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes
return actResult
}

// handlePoweringOffBeforeDelete runs the power-off step that precedes
// deletion and chooses the next state from its outcome.
//
//   - actionComplete: the host moves to StateDeleting, its error count is
//     reset and it is recorded as powered off.
//   - actionFailed: the failure is returned as is until maxPowerOffAttempts
//     failures have been counted; after that, power off is abandoned and the
//     host moves to StateDeleting anyway.
//   - actionError that needs registration while no credentials are
//     available: power off is skipped and the host moves to StateDeleting.
//
// Whenever power off is abandoned or skipped, the deleteWithoutPowerOff
// counter is incremented through a post-save callback and actionComplete is
// returned. Every other result is returned unchanged.
func (hsm *hostStateMachine) handlePoweringOffBeforeDelete(info *reconcileInfo) actionResult {
	// Number of failed power-off attempts tolerated before giving up and
	// deleting the host regardless.
	const maxPowerOffAttempts = 3

	actResult := hsm.Reconciler.actionPowerOffBeforeDeleting(hsm.Provisioner, info)
	skipToDelete := func() actionResult {
		hsm.NextState = metal3api.StateDeleting
		info.postSaveCallbacks = append(info.postSaveCallbacks, deleteWithoutPowerOff.Inc)
		return actionComplete{}
	}

	switch r := actResult.(type) {
	case actionComplete:
		hsm.NextState = metal3api.StateDeleting
		hsm.Host.Status.ErrorCount = 0
		hsm.Host.Status.PoweredOn = false
	case actionFailed:
		// If the provisioner keeps failing to power off the host and
		// deletion has been requested, continue to delete.
		//
		// NOTE(review): this assumes ErrorCount already includes the
		// failure that was just recorded — confirm against
		// recordActionFailure. Under that assumption ">=" gives up after
		// exactly maxPowerOffAttempts failed attempts, as the log message
		// states; the previous "> 3" needed a fourth failure.
		if hsm.Host.Status.ErrorCount >= maxPowerOffAttempts {
			info.log.Info("Giving up on host power off after 3 attempts.")
			return skipToDelete()
		}
	case actionError:
		if r.NeedsRegistration() && !hsm.haveCreds {
			// If the host is not registered as a node in Ironic and we
			// lack the credentials to power it off, just continue to
			// delete.
			return skipToDelete()
		}
	}
	return actResult
}

// handleDeleting is the state handler registered for StateDeleting. It
// delegates to the reconciler's actionDeleting with the state machine's
// provisioner and returns that result unchanged.
func (hsm *hostStateMachine) handleDeleting(info *reconcileInfo) actionResult {
	result := hsm.Reconciler.actionDeleting(hsm.Provisioner, info)
	return result
}
5 changes: 5 additions & 0 deletions controllers/metal3.io/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ var deleteWithoutDeprov = prometheus.NewCounter(prometheus.CounterOpts{
Help: "Number of times a host is deleted despite deprovisioning failing",
})

// deleteWithoutPowerOff counts the hosts that were deleted even though
// powering them off failed.
var deleteWithoutPowerOff = prometheus.NewCounter(
	prometheus.CounterOpts{
		Name: "metal3_delete_without_powering_off_total",
		Help: "Number of times a host is deleted despite powering off failing",
	},
)

var provisionerNotReady = prometheus.NewCounter(prometheus.CounterOpts{
Name: "metal3_provisioner_not_ready_total",
Help: "Number of times a host is not provision ready",
Expand Down

0 comments on commit 6f65d8e

Please sign in to comment.