Skip to content

Commit

Permalink
Power off nodes upon deletion
Browse files Browse the repository at this point in the history
We introduce a new step in the state machine where the node goes through
a power off stage before it's deleted. We attempt to power it off 3
times before giving up and proceeding to the delete.
  • Loading branch information
honza committed Mar 29, 2023
1 parent 3cddef3 commit 1630c13
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 19 deletions.
4 changes: 4 additions & 0 deletions apis/metal3.io/v1alpha1/baremetalhost_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,10 @@ const (
// learn about the hardware components available there
StateInspecting ProvisioningState = "inspecting"

// StatePoweringOffBeforeDelete means we are in the process of powering
// off the host before it is deleted
StatePoweringOffBeforeDelete ProvisioningState = "powering off before delete"

// StateDeleting means we are in the process of cleaning up the host
// ready for deletion
StateDeleting ProvisioningState = "deleting"
Expand Down
25 changes: 25 additions & 0 deletions controllers/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,31 @@ func setErrorMessage(host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1.
host.Status.ErrorCount++
}

// actionPowerOffBeforeDeleting asks the provisioner to hard power off the
// host as the step that precedes deletion.
//
// The result is one of:
//   - actionError wrapping the error returned by the provisioner call;
//   - the value of recordActionFailure (with PowerManagementError) when the
//     provisioner result carries an error message;
//   - actionContinue with the provisioner's RequeueAfter while the
//     provisioner result is still dirty, wrapped in actionUpdate when
//     clearError reports true for the host;
//   - actionComplete otherwise.
func (r *BareMetalHostReconciler) actionPowerOffBeforeDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult {
	info.log.Info("host ready to be powered off")

	// The second PowerOff argument is true when the host's recorded error
	// type is already a power management error.
	// NOTE(review): presumably this makes the provisioner retry more
	// forcefully after an earlier failure — confirm against the
	// Provisioner interface.
	hadPowerError := info.host.Status.ErrorType == metal3v1alpha1.PowerManagementError
	powerOff, err := prov.PowerOff(metal3v1alpha1.RebootModeHard, hadPowerError)

	switch {
	case err != nil:
		return actionError{errors.Wrap(err, "failed to power off before deleting node")}
	case powerOff.ErrorMessage != "":
		return recordActionFailure(info, metal3v1alpha1.PowerManagementError, powerOff.ErrorMessage)
	case !powerOff.Dirty:
		return actionComplete{}
	}

	// The provisioner is still working on the request: requeue, and ask for
	// the host to be saved as well if clearing the error reported true.
	requeue := actionContinue{powerOff.RequeueAfter}
	if !clearError(info.host) {
		return requeue
	}
	return actionUpdate{requeue}
}

// Manage deletion of the host
func (r *BareMetalHostReconciler) actionDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult {
info.log.Info(
Expand Down
67 changes: 51 additions & 16 deletions controllers/metal3.io/host_state_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,20 @@ type stateHandler func(*reconcileInfo) actionResult

// handlers returns the dispatch table that maps each provisioning state to
// the state machine method responsible for handling it. StateAvailable and
// StateReady share handleAvailable.
func (hsm *hostStateMachine) handlers() map[metal3v1alpha1.ProvisioningState]stateHandler {
return map[metal3v1alpha1.ProvisioningState]stateHandler{
metal3v1alpha1.StateNone: hsm.handleNone,
metal3v1alpha1.StateUnmanaged: hsm.handleUnmanaged,
metal3v1alpha1.StateRegistering: hsm.handleRegistering,
metal3v1alpha1.StateInspecting: hsm.handleInspecting,
metal3v1alpha1.StateExternallyProvisioned: hsm.handleExternallyProvisioned,
metal3v1alpha1.StateMatchProfile: hsm.handleMatchProfile, // Backward compatibility, remove eventually
metal3v1alpha1.StatePreparing: hsm.handlePreparing,
metal3v1alpha1.StateAvailable: hsm.handleAvailable,
metal3v1alpha1.StateReady: hsm.handleAvailable,
metal3v1alpha1.StateProvisioning: hsm.handleProvisioning,
metal3v1alpha1.StateProvisioned: hsm.handleProvisioned,
metal3v1alpha1.StateDeprovisioning: hsm.handleDeprovisioning,
metal3v1alpha1.StateDeleting: hsm.handleDeleting,
metal3v1alpha1.StateNone: hsm.handleNone,
metal3v1alpha1.StateUnmanaged: hsm.handleUnmanaged,
metal3v1alpha1.StateRegistering: hsm.handleRegistering,
metal3v1alpha1.StateInspecting: hsm.handleInspecting,
metal3v1alpha1.StateExternallyProvisioned: hsm.handleExternallyProvisioned,
metal3v1alpha1.StateMatchProfile: hsm.handleMatchProfile, // Backward compatibility, remove eventually
metal3v1alpha1.StatePreparing: hsm.handlePreparing,
metal3v1alpha1.StateAvailable: hsm.handleAvailable,
metal3v1alpha1.StateReady: hsm.handleAvailable,
metal3v1alpha1.StateProvisioning: hsm.handleProvisioning,
metal3v1alpha1.StateProvisioned: hsm.handleProvisioned,
metal3v1alpha1.StateDeprovisioning: hsm.handleDeprovisioning,
metal3v1alpha1.StatePoweringOffBeforeDelete: hsm.handlePoweringOffBeforeDelete,
metal3v1alpha1.StateDeleting: hsm.handleDeleting,
}
}

Expand Down Expand Up @@ -223,15 +224,15 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool {

switch hsm.NextState {
default:
hsm.NextState = metal3v1alpha1.StateDeleting
hsm.NextState = metal3v1alpha1.StatePoweringOffBeforeDelete
case metal3v1alpha1.StateProvisioning, metal3v1alpha1.StateProvisioned:
if hsm.Host.OperationalStatus() == metal3v1alpha1.OperationalStatusDetached {
if delayDeleteForDetachedHost(hsm.Host) {
log.Info("Delaying detached host deletion")
deleteDelayedForDetached.Inc()
return false
}
hsm.NextState = metal3v1alpha1.StateDeleting
hsm.NextState = metal3v1alpha1.StatePoweringOffBeforeDelete
} else {
hsm.NextState = metal3v1alpha1.StateDeprovisioning
}
Expand All @@ -241,6 +242,9 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool {
case metal3v1alpha1.StateDeleting:
// Already in deleting state. Allow state machine to run.
return false
case metal3v1alpha1.StatePoweringOffBeforeDelete:
// Already in powering off state. Allow state machine to run.
return false
}
return true
}
Expand Down Expand Up @@ -322,7 +326,7 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio
case metal3v1alpha1.StateMatchProfile:
// Backward compatibility, remove eventually
return
case metal3v1alpha1.StateDeleting:
case metal3v1alpha1.StateDeleting, metal3v1alpha1.StatePoweringOffBeforeDelete:
// In the deleting state the whole idea is to de-register the host
return
case metal3v1alpha1.StateRegistering:
Expand Down Expand Up @@ -561,6 +565,37 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes
return actResult
}

// handlePoweringOffBeforeDelete runs the power-off-before-delete action and
// decides whether the host may move on to StateDeleting.
//
// On success the next state becomes StateDeleting, the error count is reset
// and the host is recorded as powered off. When powering off cannot be
// achieved (too many failures, or the host needs registration and no
// credentials are available) the handler still advances to StateDeleting and
// schedules an increment of the deleteWithoutPowerOff counter as a post-save
// callback. In every other case the action's own result is returned as is.
func (hsm *hostStateMachine) handlePoweringOffBeforeDelete(info *reconcileInfo) actionResult {
	outcome := hsm.Reconciler.actionPowerOffBeforeDeleting(hsm.Provisioner, info)

	// proceedWithoutPowerOff advances to deletion without a confirmed power
	// off and queues the metric increment to run after the host is saved.
	proceedWithoutPowerOff := func() actionResult {
		info.postSaveCallbacks = append(info.postSaveCallbacks, deleteWithoutPowerOff.Inc)
		hsm.NextState = metal3v1alpha1.StateDeleting
		return actionComplete{}
	}

	if _, done := outcome.(actionComplete); done {
		hsm.Host.Status.PoweredOn = false
		hsm.Host.Status.ErrorCount = 0
		hsm.NextState = metal3v1alpha1.StateDeleting
		return outcome
	}

	if _, failed := outcome.(actionFailed); failed {
		// Powering off keeps failing while deletion has been requested:
		// stop retrying and continue to delete.
		// NOTE(review): the threshold is "> 3" while the log message says
		// "3 attempts" — confirm how ErrorCount is incremented so the two
		// agree.
		if hsm.Host.Status.ErrorCount > 3 {
			info.log.Info("Giving up on host power off after 3 attempts.")
			return proceedWithoutPowerOff()
		}
		return outcome
	}

	// If the host is not registered as a node in Ironic and we lack the
	// credentials to power it off, just continue to delete.
	if actErr, isErr := outcome.(actionError); isErr && actErr.NeedsRegistration() && !hsm.haveCreds {
		return proceedWithoutPowerOff()
	}

	return outcome
}

// handleDeleting delegates to the reconciler's deletion action for the host
// and returns whatever that action reports.
func (hsm *hostStateMachine) handleDeleting(info *reconcileInfo) actionResult {
	deletion := hsm.Reconciler.actionDeleting(hsm.Provisioner, info)
	return deletion
}
6 changes: 3 additions & 3 deletions controllers/metal3.io/host_state_machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ func TestDetach(t *testing.T) {
ExpectedDirty: true,
ExpectedOperationalStatus: metal3v1alpha1.OperationalStatusDetached,
// Should move to PoweringOffBeforeDelete without any Deprovisioning
ExpectedState: metal3v1alpha1.StateDeleting,
ExpectedState: metal3v1alpha1.StatePoweringOffBeforeDelete,
},
{
Scenario: "ExternallyProvisionedHost",
Expand Down Expand Up @@ -1099,7 +1099,7 @@ func TestDeleteWaitsForDetach(t *testing.T) {
setDeletion().
setDetached("{\"deleteAction\": \"delete\"}").
build(),
ExpectedState: metal3v1alpha1.StateDeleting,
ExpectedState: metal3v1alpha1.StatePoweringOffBeforeDelete,
ExpectedOperationalStatus: metal3v1alpha1.OperationalStatusDetached,
},
{
Expand All @@ -1109,7 +1109,7 @@ func TestDeleteWaitsForDetach(t *testing.T) {
setDeletion().
setDetached("true").
build(),
ExpectedState: metal3v1alpha1.StateDeleting,
ExpectedState: metal3v1alpha1.StatePoweringOffBeforeDelete,
ExpectedOperationalStatus: metal3v1alpha1.OperationalStatusDetached,
},
{
Expand Down
5 changes: 5 additions & 0 deletions controllers/metal3.io/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ var deleteWithoutDeprov = prometheus.NewCounter(prometheus.CounterOpts{
Help: "Number of times a host is deleted despite deprovisioning failing",
})

// deleteWithoutPowerOff counts the hosts whose deletion went ahead even
// though powering them off did not succeed.
var deleteWithoutPowerOff = prometheus.NewCounter(prometheus.CounterOpts{
Name: "metal3_delete_without_powering_off_total",
Help: "Number of times a host is deleted despite powering off failing",
})

var provisionerNotReady = prometheus.NewCounter(prometheus.CounterOpts{
Name: "metal3_provisioner_not_ready_total",
Help: "Number of times a host is not provision ready",
Expand Down

0 comments on commit 1630c13

Please sign in to comment.