From 1630c13247e049eb4354a31192472a59c743b9f6 Mon Sep 17 00:00:00 2001 From: Honza Pokorny Date: Fri, 17 Mar 2023 14:55:54 -0300 Subject: [PATCH] Power off nodes upon deletion We introduce a new step in the state machine where the node goes through a power off stage before it's deleted. We attempt to power it off 3 times before giving up and proceeding with the deletion. --- .../metal3.io/v1alpha1/baremetalhost_types.go | 4 ++ .../metal3.io/baremetalhost_controller.go | 25 +++++++ controllers/metal3.io/host_state_machine.go | 67 ++++++++++++++----- .../metal3.io/host_state_machine_test.go | 6 +- controllers/metal3.io/metrics.go | 5 ++ 5 files changed, 88 insertions(+), 19 deletions(-) diff --git a/apis/metal3.io/v1alpha1/baremetalhost_types.go b/apis/metal3.io/v1alpha1/baremetalhost_types.go index e398755464..ad55f0d1ac 100644 --- a/apis/metal3.io/v1alpha1/baremetalhost_types.go +++ b/apis/metal3.io/v1alpha1/baremetalhost_types.go @@ -207,6 +207,10 @@ const ( // learn about the hardware components available there StateInspecting ProvisioningState = "inspecting" + // StatePoweringOffBeforeDelete means we are in the process of powering + // off the host before it is deleted + StatePoweringOffBeforeDelete ProvisioningState = "powering off before delete" + // StateDeleting means we are in the process of cleaning up the host // ready for deletion StateDeleting ProvisioningState = "deleting" diff --git a/controllers/metal3.io/baremetalhost_controller.go b/controllers/metal3.io/baremetalhost_controller.go index 0bad06780f..b8664b7c3d 100644 --- a/controllers/metal3.io/baremetalhost_controller.go +++ b/controllers/metal3.io/baremetalhost_controller.go @@ -481,6 +481,31 @@ func setErrorMessage(host *metal3v1alpha1.BareMetalHost, errType metal3v1alpha1. 
host.Status.ErrorCount++ } +func (r *BareMetalHostReconciler) actionPowerOffBeforeDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { + info.log.Info("host ready to be powered off") + provResult, err := prov.PowerOff( + metal3v1alpha1.RebootModeHard, + info.host.Status.ErrorType == metal3v1alpha1.PowerManagementError) + + if err != nil { + return actionError{errors.Wrap(err, "failed to power off before deleting node")} + } + + if provResult.ErrorMessage != "" { + return recordActionFailure(info, metal3v1alpha1.PowerManagementError, provResult.ErrorMessage) + } + + if provResult.Dirty { + result := actionContinue{provResult.RequeueAfter} + if clearError(info.host) { + return actionUpdate{result} + } + return result + } + + return actionComplete{} +} + // Manage deletion of the host func (r *BareMetalHostReconciler) actionDeleting(prov provisioner.Provisioner, info *reconcileInfo) actionResult { info.log.Info( diff --git a/controllers/metal3.io/host_state_machine.go b/controllers/metal3.io/host_state_machine.go index 518c017e0a..9176c407e5 100644 --- a/controllers/metal3.io/host_state_machine.go +++ b/controllers/metal3.io/host_state_machine.go @@ -41,19 +41,20 @@ type stateHandler func(*reconcileInfo) actionResult func (hsm *hostStateMachine) handlers() map[metal3v1alpha1.ProvisioningState]stateHandler { return map[metal3v1alpha1.ProvisioningState]stateHandler{ - metal3v1alpha1.StateNone: hsm.handleNone, - metal3v1alpha1.StateUnmanaged: hsm.handleUnmanaged, - metal3v1alpha1.StateRegistering: hsm.handleRegistering, - metal3v1alpha1.StateInspecting: hsm.handleInspecting, - metal3v1alpha1.StateExternallyProvisioned: hsm.handleExternallyProvisioned, - metal3v1alpha1.StateMatchProfile: hsm.handleMatchProfile, // Backward compatibility, remove eventually - metal3v1alpha1.StatePreparing: hsm.handlePreparing, - metal3v1alpha1.StateAvailable: hsm.handleAvailable, - metal3v1alpha1.StateReady: hsm.handleAvailable, - metal3v1alpha1.StateProvisioning: 
hsm.handleProvisioning, - metal3v1alpha1.StateProvisioned: hsm.handleProvisioned, - metal3v1alpha1.StateDeprovisioning: hsm.handleDeprovisioning, - metal3v1alpha1.StateDeleting: hsm.handleDeleting, + metal3v1alpha1.StateNone: hsm.handleNone, + metal3v1alpha1.StateUnmanaged: hsm.handleUnmanaged, + metal3v1alpha1.StateRegistering: hsm.handleRegistering, + metal3v1alpha1.StateInspecting: hsm.handleInspecting, + metal3v1alpha1.StateExternallyProvisioned: hsm.handleExternallyProvisioned, + metal3v1alpha1.StateMatchProfile: hsm.handleMatchProfile, // Backward compatibility, remove eventually + metal3v1alpha1.StatePreparing: hsm.handlePreparing, + metal3v1alpha1.StateAvailable: hsm.handleAvailable, + metal3v1alpha1.StateReady: hsm.handleAvailable, + metal3v1alpha1.StateProvisioning: hsm.handleProvisioning, + metal3v1alpha1.StateProvisioned: hsm.handleProvisioned, + metal3v1alpha1.StateDeprovisioning: hsm.handleDeprovisioning, + metal3v1alpha1.StatePoweringOffBeforeDelete: hsm.handlePoweringOffBeforeDelete, + metal3v1alpha1.StateDeleting: hsm.handleDeleting, } } @@ -223,7 +224,7 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool { switch hsm.NextState { default: - hsm.NextState = metal3v1alpha1.StateDeleting + hsm.NextState = metal3v1alpha1.StatePoweringOffBeforeDelete case metal3v1alpha1.StateProvisioning, metal3v1alpha1.StateProvisioned: if hsm.Host.OperationalStatus() == metal3v1alpha1.OperationalStatusDetached { if delayDeleteForDetachedHost(hsm.Host) { @@ -231,7 +232,7 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool { deleteDelayedForDetached.Inc() return false } - hsm.NextState = metal3v1alpha1.StateDeleting + hsm.NextState = metal3v1alpha1.StatePoweringOffBeforeDelete } else { hsm.NextState = metal3v1alpha1.StateDeprovisioning } @@ -241,6 +242,9 @@ func (hsm *hostStateMachine) checkInitiateDelete(log logr.Logger) bool { case metal3v1alpha1.StateDeleting: // Already in deleting state. Allow state machine to run. 
return false + case metal3v1alpha1.StatePoweringOffBeforeDelete: + // Already in powering off state. Allow state machine to run. + return false } return true } @@ -322,7 +326,7 @@ func (hsm *hostStateMachine) ensureRegistered(info *reconcileInfo) (result actio case metal3v1alpha1.StateMatchProfile: // Backward compatibility, remove eventually return - case metal3v1alpha1.StateDeleting: + case metal3v1alpha1.StateDeleting, metal3v1alpha1.StatePoweringOffBeforeDelete: // In the deleting state the whole idea is to de-register the host return case metal3v1alpha1.StateRegistering: @@ -561,6 +565,37 @@ func (hsm *hostStateMachine) handleDeprovisioning(info *reconcileInfo) actionRes return actResult } +func (hsm *hostStateMachine) handlePoweringOffBeforeDelete(info *reconcileInfo) actionResult { + actResult := hsm.Reconciler.actionPowerOffBeforeDeleting(hsm.Provisioner, info) + skipToDelete := func() actionResult { + hsm.NextState = metal3v1alpha1.StateDeleting + info.postSaveCallbacks = append(info.postSaveCallbacks, deleteWithoutPowerOff.Inc) + return actionComplete{} + } + + switch r := actResult.(type) { + case actionComplete: + hsm.NextState = metal3v1alpha1.StateDeleting + hsm.Host.Status.ErrorCount = 0 + hsm.Host.Status.PoweredOn = false + case actionFailed: + // If powering off keeps failing and deletion has been + // requested, give up and continue to delete. + if hsm.Host.Status.ErrorCount > 3 { + info.log.Info("Giving up on host power off after 3 attempts.") + return skipToDelete() + } + case actionError: + if r.NeedsRegistration() && !hsm.haveCreds { + // If the host is not registered as a node in Ironic and we + // lack the credentials to power it off, just continue to + // delete. 
+ return skipToDelete() + } + } + return actResult +} + func (hsm *hostStateMachine) handleDeleting(info *reconcileInfo) actionResult { return hsm.Reconciler.actionDeleting(hsm.Provisioner, info) } diff --git a/controllers/metal3.io/host_state_machine_test.go b/controllers/metal3.io/host_state_machine_test.go index 9a700a2166..9c788d35a7 100644 --- a/controllers/metal3.io/host_state_machine_test.go +++ b/controllers/metal3.io/host_state_machine_test.go @@ -259,7 +259,7 @@ func TestDetach(t *testing.T) { ExpectedDirty: true, ExpectedOperationalStatus: metal3v1alpha1.OperationalStatusDetached, // Should move to Deleting without any Deprovisioning - ExpectedState: metal3v1alpha1.StateDeleting, + ExpectedState: metal3v1alpha1.StatePoweringOffBeforeDelete, }, { Scenario: "ExternallyProvisionedHost", @@ -1099,7 +1099,7 @@ func TestDeleteWaitsForDetach(t *testing.T) { setDeletion(). setDetached("{\"deleteAction\": \"delete\"}"). build(), - ExpectedState: metal3v1alpha1.StateDeleting, + ExpectedState: metal3v1alpha1.StatePoweringOffBeforeDelete, ExpectedOperationalStatus: metal3v1alpha1.OperationalStatusDetached, }, { @@ -1109,7 +1109,7 @@ func TestDeleteWaitsForDetach(t *testing.T) { setDeletion(). setDetached("true"). 
build(), - ExpectedState: metal3v1alpha1.StateDeleting, + ExpectedState: metal3v1alpha1.StatePoweringOffBeforeDelete, ExpectedOperationalStatus: metal3v1alpha1.OperationalStatusDetached, }, { diff --git a/controllers/metal3.io/metrics.go b/controllers/metal3.io/metrics.go index 66611d7f04..fdff8bb723 100644 --- a/controllers/metal3.io/metrics.go +++ b/controllers/metal3.io/metrics.go @@ -118,6 +118,11 @@ var deleteWithoutDeprov = prometheus.NewCounter(prometheus.CounterOpts{ Help: "Number of times a host is deleted despite deprovisioning failing", }) +var deleteWithoutPowerOff = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "metal3_delete_without_powering_off_total", + Help: "Number of times a host is deleted despite powering off failing", +}) + var provisionerNotReady = prometheus.NewCounter(prometheus.CounterOpts{ Name: "metal3_provisioner_not_ready_total", Help: "Number of times a host is not provision ready",