From acc3ffd6db6fce78b98bac387580b8dc0048485a Mon Sep 17 00:00:00 2001
From: janiskemper
Date: Fri, 20 Oct 2023 12:39:38 +0200
Subject: [PATCH] :sparkles: autodetect permanent-error

Add a state to verify that a device is shut down. Also handle the case
where a device reloads forever: set a label on such a device marking it
as unhealthy, so that it will not be used.
---
 api/v1alpha1/conditions_const.go               |  25 ++
 api/v1alpha1/hivelocitymachine_types.go        |   8 +-
 ...e.cluster.x-k8s.io_hivelocitymachines.yaml  |   3 +
 controllers/hivelocitycluster_controller.go    |   4 +-
 controllers/hivelocitymachine_controller.go    |   1 +
 .../hivelocitymachine_controller_test.go       |   1 +
 go.mod                                         |   2 +-
 hack/output-for-watch.sh                       |  16 +-
 hack/tail-controller-logs.sh                   |   2 +-
 pkg/services/hivelocity/client/hvclient.go     |  41 +-
 .../hivelocity/client/mock/mock_client.go      |  35 ++
 pkg/services/hivelocity/device/device.go       | 374 +++++++++++++++---
 pkg/services/hivelocity/device/device_test.go  |   2 +-
 .../hivelocity/device/state_machine.go         |  27 +-
 pkg/services/hivelocity/hvtag/hvtag.go         |  44 ++-
 pkg/services/hivelocity/hvtag/hvtag_test.go    |  36 ++
 .../claim-devices-or-fail.go                   |  40 +-
 17 files changed, 534 insertions(+), 127 deletions(-)

diff --git a/api/v1alpha1/conditions_const.go b/api/v1alpha1/conditions_const.go
index 70742174e..6caa558f7 100644
--- a/api/v1alpha1/conditions_const.go
+++ b/api/v1alpha1/conditions_const.go
@@ -23,11 +23,36 @@ import (
 const (
 	// DeviceReadyCondition reports on current status of the device. Ready indicates the device is in a Running state.
 	DeviceReadyCondition clusterv1.ConditionType = "DeviceReady"
+
 	// DeviceNotFoundReason (Severity=Error) documents a HivelocityMachine controller detecting
 	// the underlying device cannot be found anymore.
 	DeviceNotFoundReason = "DeviceNotFound"
+
 	// DeviceTagsInvalidReason documents a HivelocityMachine controller detecting invalid device tags.
 	DeviceTagsInvalidReason = "DeviceTagsInvalid"
+
+	// DeviceReloadingTooLongReason indicates that the device has been reloading for too long.
+	// The controller sets a corresponding tag, so that the machine can get reset by an operator.
+	DeviceReloadingTooLongReason = "DeviceReloadingTooLong"
+)
+
+const (
+	// DeviceProvisioningSucceededCondition reports on whether the device has been successfully provisioned.
+	DeviceProvisioningSucceededCondition clusterv1.ConditionType = "DeviceProvisioningSucceeded"
+
+	// DeviceReloadingReason documents that the device is reloading.
+	DeviceReloadingReason = "DeviceReloading"
+
+	// DeviceShutdownCalledReason documents that a shutdown of the device has been triggered.
+	DeviceShutdownCalledReason = "DeviceShutdownCalled"
+
+	// DeviceShutDownReason documents that the device is shut down.
+	DeviceShutDownReason = "DeviceShutDown"
+)
+
+const (
+	// DeviceDeProvisioningSucceededCondition reports on whether the device has been successfully deprovisioned.
+	DeviceDeProvisioningSucceededCondition clusterv1.ConditionType = "DeviceDeProvisioningSucceeded"
 )
 
 const (
diff --git a/api/v1alpha1/hivelocitymachine_types.go b/api/v1alpha1/hivelocitymachine_types.go
index 4a8f27004..bb55d6ede 100644
--- a/api/v1alpha1/hivelocitymachine_types.go
+++ b/api/v1alpha1/hivelocitymachine_types.go
@@ -38,6 +38,7 @@ const (
 	// FailureMessageDeviceNotFound indicates that the associated device could not be found.
 	FailureMessageDeviceNotFound = "device not found"
+
 	// FailureMessageDeviceTagsInvalid indicates that the associated device has invalid tags.
 	// This is probably due to a user changing device tags on their own.
 	FailureMessageDeviceTagsInvalid = "device tags invalid"
 
@@ -65,6 +66,9 @@ const (
 	// StateVerifyAssociate .
 	StateVerifyAssociate ProvisioningState = "verify-associate"
 
+	// StateVerifyShutdown .
+	StateVerifyShutdown ProvisioningState = "verify-shutdown"
+
 	// StateProvisionDevice .
 	StateProvisionDevice ProvisioningState = "provision-device"
 
@@ -112,7 +116,8 @@ type ControllerGeneratedStatus struct {
 // HivelocityDeviceType defines the Hivelocity device type.
 // +kubebuilder:validation:Enum=pool;hvCustom;hvControlPlane;hvWorker;e2eControlPlane;e2eWorker
-type HivelocityDeviceType string
+// hvlabel:foo=bar
+type HivelocityDeviceType string // TODO: this should not be an enum. Rename to HVLabel, and make a label selector.
 
 // HivelocityMachineStatus defines the observed state of HivelocityMachine.
 type HivelocityMachineStatus struct {
@@ -158,6 +163,7 @@ type HivelocityMachineStatus struct {
 // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.ready",description="Machine ready status"
 // +kubebuilder:printcolumn:name="ProviderID",type="string",JSONPath=".spec.providerID",description="ProviderID of machine object"
 // +kubebuilder:printcolumn:name="Machine",type="string",JSONPath=".metadata.ownerReferences[?(@.kind==\"Machine\")].name",description="Machine object which owns this HivelocityMachine"
+// +kubebuilder:printcolumn:name="Prov.State",type="string",JSONPath=".spec.status.provisioningState"
 // +kubebuilder:printcolumn:name="Reason",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].reason"
 // +kubebuilder:printcolumn:name="Message",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].message"
 // +k8s:defaulter-gen=true
diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hivelocitymachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hivelocitymachines.yaml
index a8a36fa8c..cbe634acc 100644
--- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hivelocitymachines.yaml
+++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hivelocitymachines.yaml
@@ -47,6 +47,9 @@ spec:
       jsonPath: .metadata.ownerReferences[?(@.kind=="Machine")].name
       name: Machine
       type: string
+    - jsonPath: .spec.status.provisioningState
+      name: Prov.State
+      type: string
     - jsonPath: .status.conditions[?(@.type=='Ready')].reason
       name: Reason
       type: string
diff --git a/controllers/hivelocitycluster_controller.go b/controllers/hivelocitycluster_controller.go
index 9be3a929d..d263564c9 100644
--- a/controllers/hivelocitycluster_controller.go
+++ b/controllers/hivelocitycluster_controller.go
@@ -196,9 +196,9 @@ func (r *HivelocityClusterReconciler) reconcileNormal(ctx context.Context, clust
 	if machineType == "" {
 		return ctrl.Result{}, fmt.Errorf("Spec.Template.Spec.Type of HivelocityMachineTemplate %q is empty", name)
 	}
-	hvDevice, err := device.GetFirstDevice(ctx, clusterScope.HVClient, machineType, hvCluster, "")
+	hvDevice, err := device.GetFirstFreeDevice(ctx, clusterScope.HVClient, machineType, hvCluster, "")
 	if err != nil {
-		return ctrl.Result{}, fmt.Errorf("device.GetFirstDevice() failed: %w", err)
+		return ctrl.Result{}, fmt.Errorf("device.GetFirstFreeDevice() failed: %w", err)
 	}
 
 	log.Info(fmt.Sprintf("Setting hvCluster.Spec.ControlPlaneEndpoint.Host to %q (machineType=%s).", hvDevice.PrimaryIp, machineType))
diff --git a/controllers/hivelocitymachine_controller.go b/controllers/hivelocitymachine_controller.go
index b4b2f720e..485a9304e 100644
---
a/controllers/hivelocitymachine_controller.go +++ b/controllers/hivelocitymachine_controller.go @@ -80,6 +80,7 @@ func (r *HivelocityMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re // Fetch the Machine. machine, err := util.GetOwnerMachine(ctx, r.Client, hivelocityMachine.ObjectMeta) if err != nil { + log.Error(err, "GetOwnerMachine failed") return ctrl.Result{}, err } if machine == nil { diff --git a/controllers/hivelocitymachine_controller_test.go b/controllers/hivelocitymachine_controller_test.go index 148fc6f3a..1d51a65bd 100644 --- a/controllers/hivelocitymachine_controller_test.go +++ b/controllers/hivelocitymachine_controller_test.go @@ -201,6 +201,7 @@ var _ = Describe("HivelocityMachineReconciler", func() { Eventually(func() bool { if err := testEnv.Get(ctx, machineKey, hvMachine); err != nil { + testEnv.GetLogger().Info("machine resource does not exist yet") return false } if hvMachine.Spec.ProviderID == nil { diff --git a/go.mod b/go.mod index bd5bdf2f0..e9fae00b2 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,6 @@ go 1.20 //replace sigs.k8s.io/cluster-api/test/framework => sigs.k8s.io/cluster-api/test v1.4.2 require ( - github.com/antihax/optional v1.0.0 github.com/blang/semver/v4 v4.0.0 github.com/go-logr/logr v1.2.4 github.com/go-logr/zapr v1.2.3 @@ -44,6 +43,7 @@ require ( github.com/Masterminds/sprig/v3 v3.2.3 // indirect github.com/Microsoft/go-winio v0.5.0 // indirect github.com/alessio/shellescape v1.4.1 // indirect + github.com/antihax/optional v1.0.0 // indirect github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 // indirect github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect github.com/beorn7/perks v1.0.1 // indirect diff --git a/hack/output-for-watch.sh b/hack/output-for-watch.sh index 3e451be48..1812611e1 100755 --- a/hack/output-for-watch.sh +++ b/hack/output-for-watch.sh @@ -34,11 +34,10 @@ kubectl get machines -A print_heading hivelocitymachine -kubectl get hivelocitymachine -A "-o=custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name,Cluster:.metadata.labels.cluster\.x-k8s\.io/cluster-name,Type:.spec.type,State:.status.powerState,Ready:.status.ready,ProviderID:.spec.providerID,Machine:.metadata.ownerReferences[?(@.kind==\"Machine\")].name,IP:.status.addresses[?(@.type==\"InternalIP\")].address" - +kubectl get hivelocitymachine -A print_heading events -kubectl get events -A -o=wide --sort-by=.lastTimestamp | grep -vP 'LeaderElection' | tail -8 +kubectl get events -A -o=wide --sort-by=.lastTimestamp | grep -vP 'LeaderElection|CSRApproved' | tail -8 print_heading conditions @@ -51,6 +50,15 @@ print_heading logs echo +capi_error="$(kubectl logs -n capi-system --since=5m deployments/capi-controller-manager | \ + grep -iP 'error|\be\d\d\d\d\b' | \ + grep -vP 'ignoring DaemonSet-managed Pods|TLS handshake error from' | \ + tail -7)" +if [ -n "$capi_error" ]; then + print_heading capi controller errors + echo "$capi_error" +fi + ip=$(kubectl get cluster -A -o=jsonpath='{.items[*].spec.controlPlaneEndpoint.host}' | head -1) if [ -z "$ip" ]; then echo "❌ Could not get IP of control-plane" @@ -93,7 +101,7 @@ else echo "👌 number of nodes in wl-cluster is equal to number of machines in mgt-cluster" fi -not_approved=$(KUBECONFIG=$kubeconfig_wl kubectl get csr --no-headers | grep -v Approved) +not_approved=$(KUBECONFIG=$kubeconfig_wl kubectl get csr --no-headers --sort-by='.metadata.creationTimestamp' | grep -v Approved | tail -8 ) if [ -n "$not_approved" ]; then echo "❌ (CSRs)certificate signing requests which are not approved" 
echo "$not_approved" diff --git a/hack/tail-controller-logs.sh b/hack/tail-controller-logs.sh index fc584b80a..d5cc5ea41 100755 --- a/hack/tail-controller-logs.sh +++ b/hack/tail-controller-logs.sh @@ -17,7 +17,7 @@ pod=$(kubectl -n capi-hivelocity-system get pods | grep caphv-controller-manager | cut -d' ' -f1) if [ -z "$pod" ]; then - echo "failed to find caphv-controller-manager pod" + echo "❌ failed to find caphv-controller-manager pod" exit 1 fi diff --git a/pkg/services/hivelocity/client/hvclient.go b/pkg/services/hivelocity/client/hvclient.go index e2074fb42..c8ebb2664 100644 --- a/pkg/services/hivelocity/client/hvclient.go +++ b/pkg/services/hivelocity/client/hvclient.go @@ -27,9 +27,7 @@ import ( "regexp" "runtime/debug" "strings" - "time" - "github.com/antihax/optional" "github.com/go-logr/logr" "github.com/hivelocity/cluster-api-provider-hivelocity/pkg/utils" caphvversion "github.com/hivelocity/cluster-api-provider-hivelocity/pkg/version" @@ -47,9 +45,9 @@ const PowerStatusOn = "ON" // Client collects all methods used by the controller in the Hivelocity API. type Client interface { PowerOnDevice(ctx context.Context, deviceID int32) error + ShutdownDevice(ctx context.Context, deviceID int32) error ProvisionDevice(ctx context.Context, deviceID int32, opts hv.BareMetalDeviceUpdate) (hv.BareMetalDevice, error) ListDevices(context.Context) ([]hv.BareMetalDevice, error) - ShutdownDevice(ctx context.Context, deviceID int32) error ListImages(ctx context.Context, productID int32) ([]string, error) ListSSHKeys(context.Context) ([]hv.SshKeyResponse, error) @@ -58,6 +56,8 @@ type Client interface { // SetDeviceTags sets the tags to the given list. SetDeviceTags(ctx context.Context, deviceID int32, tags []string) error + + GetDeviceDump(ctx context.Context, deviceID int32) (hv.DeviceDump, error) } // Factory is the interface for creating new Client objects. @@ -155,42 +155,20 @@ func (c *realClient) SetDeviceTags(ctx context.Context, deviceID int32, tags []s return checkRateLimit(err) } -func (c *realClient) PowerOnDevice(_ context.Context, _ int32) error { - return nil // todo +func (c *realClient) PowerOnDevice(ctx context.Context, deviceID int32) error { + _, _, err := c.client.DeviceApi.PostPowerResource(ctx, deviceID, "boot", nil) //nolint:bodyclose // Close() gets done in client + return err } func (c *realClient) ProvisionDevice(ctx context.Context, deviceID int32, opts hv.BareMetalDeviceUpdate) (hv.BareMetalDevice, error) { log := log.FromContext(ctx) var swaggerErr hv.GenericSwaggerError - power, _, err := c.client.DeviceApi.GetPowerResource(ctx, deviceID, nil) //nolint:bodyclose // Close() gets done in client - if errors.As(err, &swaggerErr) { - body := string(swaggerErr.Body()) - log.Info("ProvisionDevice() failed (GetPowerResource)", "DeviceID", deviceID, "body", body) - } - - if power.PowerStatus == PowerStatusOn { - // First we need to send "shutdown". 
-	// https://developers.hivelocity.net/reference/post_power_resource
-	_, _, err := c.client.DeviceApi.PostPowerResource(ctx, deviceID, "shutdown", nil) //nolint:bodyclose // Close() gets done in client
-	if errors.As(err, &swaggerErr) {
-		body := string(swaggerErr.Body())
-		log.Info("ProvisionDevice() failed (PostPowerResource)", "DeviceID", deviceID, "body", body)
-	}
-	log.Info("ProvisionDevice() called PostPowerResource shutdown", "DeviceID", deviceID)
-	time.Sleep(30 * time.Second)
-	}
 
 	log.Info("calling ProvisionDevice()", "DeviceID", deviceID, "hostname", opts.Hostname, "OsName", opts.OsName, "script", utils.FirstN(opts.Script, 50), "ForceReload", opts.ForceReload)
 
 	// https://developers.hivelocity.net/reference/put_bare_metal_device_id_resource
-	localVars := hv.BareMetalDevicesApiPutBareMetalDeviceIdResourceOpts{
-		SkipPowerCheck: optional.NewBool(true),
-	}
-
-	device, _, err := c.client.BareMetalDevicesApi.PutBareMetalDeviceIdResource(ctx, deviceID, opts, &localVars) //nolint:bodyclose // Close() gets done in client
+	device, _, err := c.client.BareMetalDevicesApi.PutBareMetalDeviceIdResource(ctx, deviceID, opts, nil) //nolint:bodyclose // Close() gets done in client
 	if errors.As(err, &swaggerErr) {
 		body := string(swaggerErr.Body())
 		log.Info("ProvisionDevice() failed (PutBareMetalDeviceIdResource)", "DeviceID", deviceID, "body", body)
@@ -259,3 +237,8 @@ func checkRateLimit(err error) error {
 	}
 	return err
 }
+
+func (c *realClient) GetDeviceDump(ctx context.Context, deviceID int32) (hv.DeviceDump, error) {
+	dump, _, err := c.client.DeviceApi.GetDeviceIdResource(ctx, deviceID, nil) //nolint:bodyclose // Close() gets done in client
+	return dump, err
+}
diff --git a/pkg/services/hivelocity/client/mock/mock_client.go b/pkg/services/hivelocity/client/mock/mock_client.go
index 91dea0d60..20a294503 100644
--- a/pkg/services/hivelocity/client/mock/mock_client.go
+++ b/pkg/services/hivelocity/client/mock/mock_client.go
@@ -176,6 +176,7 @@ func (c *mockedHVClient) ProvisionDevice(_ context.Context, deviceID int32, opts
 		return hv.BareMetalDevice{}, fmt.Errorf("[ProvisionDevice] deviceID %d unknown", deviceID)
 	}
 	device.Tags = opts.Tags
+	device.PowerStatus = hvclient.PowerStatusOn
 	c.store.idMap[deviceID] = device
 	return device, nil
 }
@@ -230,3 +231,37 @@ func (c *mockedHVClient) GetDevice(_ context.Context, deviceID int32) (hv.BareMe
 	}
 	return device, nil
 }
+
+func (c *mockedHVClient) GetDeviceDump(ctx context.Context, deviceID int32) (hv.DeviceDump, error) {
+	device, err := c.GetDevice(ctx, deviceID)
+	if err != nil {
+		return hv.DeviceDump{}, err
+	}
+	return hv.DeviceDump{
+		DeviceId:           deviceID,
+		Name:               "",
+		Status:             "",
+		DeviceType:         "",
+		DeviceTypeGroup:    "",
+		PowerStatus:        device.PowerStatus,
+		HasCancellation:    false,
+		IsManaged:          false,
+		IsReload:           false,
+		MonitorsUp:         0,
+		MonitorsTotal:      0,
+		ManagedAlertsTotal: 0,
+		Ports:              []interface{}{},
+		Hostname:           "",
+		IpmiEnabled:        false,
+		DisplayedTags:      []interface{}{},
+		Tags:               []string{},
+		Location:           nil,
+		NetworkAutomation:  nil,
+		PrimaryIp:          "",
+		IpmiAddress:        nil,
+		ServiceMonitors:    []string{},
+		BillingInfo:        nil,
+		ServicePlan:        0,
+		LastInvoiceId:      0,
+		SelfProvisioning:   false,
+		Metadata:           nil,
+		SpsStatus:          "",
+	}, nil
+}
diff --git a/pkg/services/hivelocity/device/device.go b/pkg/services/hivelocity/device/device.go
index 8ba74d24e..6c8d1c974 100644
--- a/pkg/services/hivelocity/device/device.go
+++ b/pkg/services/hivelocity/device/device.go
@@ -19,9 +19,11 @@ package device
 
 import (
 	"context"
+	_ "embed"
 	"errors"
 	"fmt"
 	"sort"
+	"strings"
 	"time"
 
 	infrav1
"github.com/hivelocity/cluster-api-provider-hivelocity/api/v1alpha1" @@ -105,13 +107,27 @@ func (s *Service) actionAssociateDevice(ctx context.Context) actionResult { log := s.scope.Logger.WithValues("function", "actionAssociateDevice") log.V(1).Info("Started function") - device, err := GetFirstDevice(ctx, s.scope.HVClient, s.scope.HivelocityMachine.Spec.Type, + device, err := GetFirstFreeDevice(ctx, s.scope.HVClient, s.scope.HivelocityMachine.Spec.Type, s.scope.HivelocityCluster, s.scope.Name()) if err != nil { s.handleRateLimitExceeded(err, "ListDevices") return actionError{err: fmt.Errorf("failed to find available device: %w", err)} } + if device == nil { + conditions.MarkFalse( + s.scope.HivelocityMachine, + infrav1.DeviceAssociateSucceededCondition, + infrav1.NoAvailableDeviceReason, + clusterv1.ConditionSeverityWarning, + "no available device found", + ) + record.Warnf(s.scope.HivelocityMachine, "NoFreeDeviceFound", "Failed to find free device") + return actionContinue{delay: time.Minute} + } + + conditions.Delete(s.scope.HivelocityMachine, infrav1.DeviceAssociateSucceededCondition) + // associate this device with the machine object by setting tags device.Tags = append(device.Tags, s.scope.HivelocityCluster.DeviceTagOwned().ToString(), @@ -132,12 +148,12 @@ func (s *Service) actionAssociateDevice(ctx context.Context) actionResult { return actionComplete{} } -// GetFirstDevice finds the first free matching device. The parameter machineName is optional. -func GetFirstDevice(ctx context.Context, hvclient hvclient.Client, machineType infrav1.HivelocityDeviceType, hvCluster *infrav1.HivelocityCluster, - machineName string) (*hv.BareMetalDevice, error) { +// GetFirstFreeDevice finds the first free matching device. The parameter machineName is optional. 
+func GetFirstFreeDevice(ctx context.Context, hvclient hvclient.Client, machineType infrav1.HivelocityDeviceType, + hvCluster *infrav1.HivelocityCluster, machineName string, +) (*hv.BareMetalDevice, error) { // list all devices devices, err := hvclient.ListDevices(ctx) - if err != nil { return nil, err } @@ -152,23 +168,11 @@ func GetFirstDevice(ctx context.Context, hvclient hvclient.Client, machineType i return devices[i].DeviceId < devices[j].DeviceId }) - hvDevices := findAvailableDeviceFromList(devices, machineType, hvCluster.Name, machineName) - if hvDevices == nil { - conditions.MarkFalse( - hvCluster, - infrav1.DeviceAssociateSucceededCondition, - infrav1.NoAvailableDeviceReason, - clusterv1.ConditionSeverityWarning, - "no associated device found", - ) - return nil, nil - } - - conditions.MarkTrue(hvCluster, infrav1.DeviceAssociateSucceededCondition) - return hvDevices, nil + device := findAvailableDeviceFromList(devices, machineType, hvCluster.Name) + return device, nil } -func findAvailableDeviceFromList(devices []hv.BareMetalDevice, deviceType infrav1.HivelocityDeviceType, clusterName, machineName string) *hv.BareMetalDevice { +func findAvailableDeviceFromList(devices []hv.BareMetalDevice, deviceType infrav1.HivelocityDeviceType, clusterName string) *hv.BareMetalDevice { for _, device := range devices { // Skip if caphv-permanent-error exists @@ -184,10 +188,6 @@ func findAvailableDeviceFromList(devices []hv.BareMetalDevice, deviceType infrav // unexpected error - continue continue } - if machineName != "" && machineTag.Value == machineName { - // associated to this machine - return - return &device - } if machineTag.Value != "" { // associated to other machine - continue continue @@ -195,7 +195,7 @@ func findAvailableDeviceFromList(devices []hv.BareMetalDevice, deviceType infrav deviceTypeTag, err := hvtag.DeviceTypeTagFromList(device.Tags) if err != nil { - // TODO: What if no device type has been set? Is the device available or not? + // No device type has been set, don't use it. continue } @@ -259,6 +259,9 @@ func (s *Service) actionVerifyAssociate(ctx context.Context) actionResult { if clusterTagErr == nil && machineTagErr == nil { log.V(1).Info("Completed function") + record.Eventf(s.scope.HivelocityMachine, "SuccessfulAssociateDevice", "Device %d was associated with cluster %q", deviceID, + s.scope.HivelocityCluster.Name) + conditions.MarkTrue(s.scope.HivelocityMachine, infrav1.DeviceAssociateSucceededCondition) return actionComplete{} } @@ -286,6 +289,167 @@ func hasTimedOut(lastUpdated *metav1.Time, timeout time.Duration) bool { return lastUpdated.Add(timeout).Before(now.Time) } +// actionVerifyShutdown makes sure that the device is shut down. 
+func (s *Service) actionVerifyShutdown(ctx context.Context) actionResult {
+	deviceID, err := s.scope.HivelocityMachine.DeviceIDFromProviderID()
+	if err != nil {
+		return actionError{err: fmt.Errorf("[actionVerifyShutdown] DeviceIDFromProviderID failed: %w", err)}
+	}
+
+	isReloading, isPoweredOn, err := s.getPowerAndReloadingState(ctx, deviceID)
+	if err != nil {
+		return actionError{err: fmt.Errorf("[actionVerifyShutdown] getPowerAndReloadingState failed: %w", err)}
+	}
+
+	// if the device is powered off and not reloading, then we are done and can start provisioning
+	if !isPoweredOn && !isReloading {
+		conditions.MarkFalse(
+			s.scope.HivelocityMachine,
+			infrav1.DeviceProvisioningSucceededCondition,
+			infrav1.DeviceShutDownReason,
+			clusterv1.ConditionSeverityInfo,
+			"device is shut down and will be provisioned",
+		)
+		return actionComplete{}
+	}
+
+	provisionCondition := conditions.Get(s.scope.HivelocityMachine, infrav1.DeviceProvisioningSucceededCondition)
+
+	// handle reloading state
+	if isReloading {
+		if s.isReloadingTooLong(provisionCondition, isPoweredOn) {
+			return s.setReloadingTooLongTag(ctx, deviceID, provisionCondition.LastTransitionTime)
+		}
+
+		conditions.MarkFalse(
+			s.scope.HivelocityMachine,
+			infrav1.DeviceProvisioningSucceededCondition,
+			infrav1.DeviceReloadingReason,
+			clusterv1.ConditionSeverityWarning,
+			fmt.Sprintf("device %d is reloading", deviceID),
+		)
+		return actionContinue{delay: 1 * time.Minute}
+	}
+
+	// handle powered-on state
+
+	// if shutdown has already been called in the past two minutes, do not call it again and wait
+	if provisionCondition != nil && provisionCondition.Reason == infrav1.DeviceShutdownCalledReason && !hasTimedOut(&provisionCondition.LastTransitionTime, 2*time.Minute) {
+		return actionContinue{delay: 30 * time.Second}
+	}
+
+	// remove the condition to reset the timer - we set the condition again anyway
+	conditions.Delete(s.scope.HivelocityMachine, infrav1.DeviceProvisioningSucceededCondition)
+
+	err = s.scope.HVClient.ShutdownDevice(ctx, deviceID)
+	if err != nil {
+		return actionError{err: fmt.Errorf("[actionVerifyShutdown] ShutdownDevice failed: %w", err)}
+	}
+
+	record.Eventf(s.scope.HivelocityMachine, "SuccessfulShutdownDevice", "Called ShutdownDevice API for %d", deviceID)
+
+	conditions.MarkFalse(
+		s.scope.HivelocityMachine,
+		infrav1.DeviceProvisioningSucceededCondition,
+		infrav1.DeviceShutdownCalledReason,
+		clusterv1.ConditionSeverityInfo,
+		"device shutdown has been triggered",
+	)
+	return actionContinue{delay: 30 * time.Second}
+}
+
+func (s *Service) isReloadingTooLong(condition *clusterv1.Condition, isPoweredOn bool) bool {
+	if condition == nil {
+		return false
+	}
+	if condition.Reason != infrav1.DeviceReloadingReason {
+		return false
+	}
+	timeout := 5 * time.Minute
+	if isPoweredOn {
+		// the device is "reloading" during provisioning, which can take longer.
+		timeout = 25 * time.Minute
+	}
+	return hasTimedOut(&condition.LastTransitionTime, timeout)
+}
+
+// setReloadingTooLongTag sets a permanent-error tag on the device and goes back
+// to the associate state (via actionGoBack), so that another device gets associated.
+// This method is used for provisioning and deprovisioning.
+func (s *Service) setReloadingTooLongTag(ctx context.Context, deviceID int32, lastTransitionTime metav1.Time) actionResult {
+	device, err := s.scope.HVClient.GetDevice(ctx, deviceID)
+	if err != nil {
+		s.handleRateLimitExceeded(err, "GetDevice")
+		if errors.Is(err, hvclient.ErrDeviceNotFound) {
+			msg := fmt.Sprintf("Hivelocity device %d not found", deviceID)
+			conditions.MarkFalse(
+				s.scope.HivelocityMachine,
+				infrav1.DeviceReadyCondition,
+				infrav1.DeviceNotFoundReason,
+				clusterv1.ConditionSeverityError,
+				msg,
+			)
+			record.Warnf(s.scope.HivelocityMachine, "DeviceNotFound", msg)
+			s.scope.HivelocityMachine.SetFailure(capierrors.UpdateMachineError, infrav1.FailureMessageDeviceNotFound)
+			return actionComplete{}
+		}
+		return actionError{err: fmt.Errorf("failed to get associated device: %w", err)}
+	}
+
+	tags := hvtag.RemoveEphemeralTags(device.Tags)
+	_, err = hvtag.PermanentErrorTagFromList(tags)
+	if errors.Is(err, hvtag.ErrDeviceTagNotFound) {
+		tags = append(tags, fmt.Sprintf("%s=reloading-since-%s",
+			hvtag.DeviceTagKeyPermanentError,
+			lastTransitionTime.Format(time.RFC3339)))
+	} else if err != nil {
+		return actionError{err: fmt.Errorf("[setReloadingTooLongTag] PermanentErrorTagFromList failed: %w", err)}
+	}
+
+	err = s.scope.HVClient.SetDeviceTags(ctx, device.DeviceId, tags)
+	if err != nil {
+		return actionError{err: fmt.Errorf("[setReloadingTooLongTag] SetDeviceTags failed: %w", err)}
+	}
+
+	msg := fmt.Sprintf("device %d reloading too long. Tag %q was set. Trying next device.", device.DeviceId,
+		hvtag.DeviceTagKeyPermanentError)
+
+	conditions.MarkFalse(
+		s.scope.HivelocityMachine,
+		infrav1.DeviceReadyCondition,
+		infrav1.DeviceReloadingTooLongReason,
+		clusterv1.ConditionSeverityError,
+		msg,
+	)
+	record.Warnf(s.scope.HivelocityMachine, "DeviceReloadingTooLong", msg)
+	return actionGoBack{nextState: infrav1.StateAssociateDevice}
+}
+
+func (s *Service) getPowerAndReloadingState(ctx context.Context, deviceID int32) (
+	isReloading bool, isPoweredOn bool, err error) {
+	dump, err := s.scope.HVClient.GetDeviceDump(ctx, deviceID)
+	if err != nil {
+		return false, false, fmt.Errorf("[getPowerAndReloadingState] GetDeviceDump failed: %d %w", deviceID, err)
+	}
+	power, ok := dump.PowerStatus.(string)
+	if !ok {
+		return false, false, fmt.Errorf("[getPowerAndReloadingState] dump.PowerStatus is not a string: %d %+v", deviceID, dump.PowerStatus)
+	}
+	switch power {
+	case "ON":
+		isPoweredOn = true
+	case "OFF":
+		isPoweredOn = false
+	default:
+		return false, false, fmt.Errorf("[getPowerAndReloadingState] dump.PowerStatus unknown: %d %s", deviceID, power)
+	}
+	return dump.IsReload, isPoweredOn, nil
+}
+
 // actionProvisionDevice provisions the device.
 func (s *Service) actionProvisionDevice(ctx context.Context) actionResult {
 	log := s.scope.Logger.WithValues("function", "actionProvisionDevice")
@@ -308,6 +472,21 @@ func (s *Service) actionProvisionDevice(ctx context.Context) actionResult {
 		return actionError{err: fmt.Errorf("failed to get device: %w", err)}
 	}
 
+	isReloading, isPoweredOn, err := s.getPowerAndReloadingState(ctx, deviceID)
+	if err != nil {
+		return actionError{err: fmt.Errorf("[actionProvisionDevice] getPowerAndReloadingState failed: %w", err)}
+	}
+	if isReloading {
+		msg := fmt.Sprintf("unexpected: device %d is reloading.", deviceID)
+		record.Warnf(s.scope.HivelocityMachine, "ProvisionDeviceIsReloading", msg)
+		return actionError{err: errors.New(msg)}
+	}
+	if isPoweredOn {
+		msg := fmt.Sprintf("unexpected: device %d is powered on.", deviceID)
+		record.Warnf(s.scope.HivelocityMachine, "ProvisionDeviceIsPoweredOn", msg)
+		return actionError{err: errors.New(msg)}
+	}
+
 	userData, err := s.scope.GetRawBootstrapData(ctx)
 	if err != nil {
 		record.Warnf(s.scope.HivelocityMachine, "FailedGetBootstrapData", err.Error())
@@ -354,14 +533,16 @@
 	// Provision the device
 	if _, err := s.scope.HVClient.ProvisionDevice(ctx, deviceID, opts); err != nil {
-		// TODO: Handle error that machine is not shut down
 		s.handleRateLimitExceeded(err, "ProvisionDevice")
 		record.Warnf(s.scope.HivelocityMachine, "FailedProvisionDevice", "Failed to provision device %d: %s", deviceID, err)
 		return actionContinue{delay: 30 * time.Second}
-		// actionError{err: fmt.Errorf("failed to provision device %d: %s", deviceID, err)}
 	}
 
-	record.Eventf(s.scope.HivelocityMachine, "SuccessfulProvisionDevice", "Successfully provisioned device: %d", deviceID)
+	record.Eventf(s.scope.HivelocityMachine, "SuccessfulStartedProvisionDevice", "Successfully started ProvisionDevice: %d", deviceID)
+
+	conditions.MarkTrue(
+		s.scope.HivelocityMachine,
+		infrav1.DeviceProvisioningSucceededCondition)
 
 	log.V(1).Info("Completed function")
 	return actionComplete{}
@@ -427,25 +608,41 @@ func (s *Service) actionDeviceProvisioned(ctx context.Context) actionResult {
 		return actionComplete{}
 	}
 
+	isReloading, _, err := s.getPowerAndReloadingState(ctx, deviceID)
+	if err != nil {
+		return actionError{err: fmt.Errorf("[actionDeviceProvisioned] getPowerAndReloadingState failed: %w", err)}
+	}
+	if isReloading {
+		conditions.MarkFalse(
+			s.scope.HivelocityMachine,
+			infrav1.DeviceProvisioningSucceededCondition,
+			infrav1.DeviceReloadingReason,
+			clusterv1.ConditionSeverityWarning,
+			fmt.Sprintf("Provisioned device %d is reloading", deviceID),
+		)
+		return actionContinue{delay: 15 * time.Second}
+	}
+	conditions.MarkTrue(
+		s.scope.HivelocityMachine,
+		infrav1.DeviceProvisioningSucceededCondition)
+
 	// update machine object with infos from device
 	conditions.MarkTrue(s.scope.HivelocityMachine, infrav1.DeviceReadyCondition)
 	s.scope.HivelocityMachine.SetMachineStatus(device)
 	if device.PowerStatus == hvclient.PowerStatusOff {
 		conditions.MarkFalse(s.scope.HivelocityMachine, infrav1.HivelocityMachineReadyCondition, infrav1.DevicePowerOffReason, clusterv1.ConditionSeverityError, "the device is in power off state")
 		s.scope.HivelocityMachine.Status.Ready = false
-		log.Info("Power is off?",
-			"DeviceId", device.DeviceId,
-			"PowerStatus", device.PowerStatus)
-	} else {
-		conditions.MarkTrue(s.scope.HivelocityMachine, infrav1.HivelocityMachineReadyCondition)
-		s.scope.HivelocityMachine.Status.Ready = true
+		return actionContinue{delay: 20 * time.Second}
 	}
+	conditions.MarkTrue(s.scope.HivelocityMachine, infrav1.HivelocityMachineReadyCondition)
+	s.scope.HivelocityMachine.Status.Ready = true
 
 	log.V(1).Info("Completed function", "DeviceId", device.DeviceId, "PowerStatus", device.PowerStatus, "script", utils.FirstN(device.Script, 50))
+	// This is the final state, if the machine is provisioned and everything is fine.
 	return actionComplete{}
 }
 
@@ -469,7 +666,7 @@ func (s *Service) verifyAssociatedDevice(device *hv.BareMetalDevice) error {
 }
 
 // actionDeleteDeviceDeProvision re-provisions a device to remove it from cluster.
-func (s *Service) actionDeleteDeviceDeProvision(ctx context.Context) actionResult {
+func (s *Service) actionDeleteDeviceDeProvision(ctx context.Context) (ar actionResult) {
 	log := s.scope.Logger.WithValues("function", "actionDeleteDeviceDeProvision")
 	log.V(1).Info("Started function")
 
@@ -490,28 +687,92 @@ func (s *Service) actionDeleteDeviceDeProvision(ctx context.Context) actionResult {
 		return actionError{err: fmt.Errorf("failed to get device: %w", err)}
 	}
 
-	newTags, _ := s.scope.HivelocityCluster.DeviceTag().RemoveFromList(device.Tags)
-	newTags, _ = s.scope.HivelocityMachine.DeviceTag().RemoveFromList(newTags)
-	newTags, _ = s.scope.DeviceTagMachineType().RemoveFromList(newTags)
+	isReloading, isPoweredOn, err := s.getPowerAndReloadingState(ctx, deviceID)
+	if err != nil {
+		return actionError{err: fmt.Errorf("[actionDeleteDeviceDeProvision] getPowerAndReloadingState failed: %w", err)}
+	}
+
+	if !isReloading && !isPoweredOn {
+		return s.actionDeleteDeviceDeProvisionPowerIsOff(ctx, device)
+	}
+
+	deprovisionCondition := conditions.Get(s.scope.HivelocityMachine, infrav1.DeviceDeProvisioningSucceededCondition)
+
+	// handle reloading state
+	if isReloading {
+		if s.isReloadingTooLong(deprovisionCondition, isPoweredOn) {
+			return s.setReloadingTooLongTag(ctx, deviceID, deprovisionCondition.LastTransitionTime)
+		}
+
+		conditions.MarkFalse(
+			s.scope.HivelocityMachine,
+			infrav1.DeviceDeProvisioningSucceededCondition,
+			infrav1.DeviceReloadingReason,
+			clusterv1.ConditionSeverityWarning,
+			fmt.Sprintf("device %d is reloading", deviceID),
+		)
+		return actionContinue{delay: 1 * time.Minute}
+	}
+
+	if !strings.Contains(device.Script, "cloud-init") {
+		// This is the dummy OS
+		return actionComplete{}
+	}
+
+	// handle powered-on state
+
+	// if shutdown has already been called in the past two minutes, do not call it again and wait
+	if deprovisionCondition != nil && deprovisionCondition.Reason == infrav1.DeviceShutdownCalledReason && !hasTimedOut(&deprovisionCondition.LastTransitionTime, 2*time.Minute) {
+		return actionContinue{delay: 30 * time.Second}
+	}
+
+	// remove the condition to reset the timer - we set the condition again anyway
+	conditions.Delete(s.scope.HivelocityMachine, infrav1.DeviceDeProvisioningSucceededCondition)
+
+	err = s.scope.HVClient.ShutdownDevice(ctx, deviceID)
+	if err != nil {
+		return actionError{err: fmt.Errorf("[actionDeleteDeviceDeProvision] ShutdownDevice failed: %w", err)}
+	}
+
+	record.Eventf(s.scope.HivelocityMachine, "SuccessfulShutdownDevice", "Called ShutdownDevice API for %d", deviceID)
+
+	conditions.MarkFalse(
+		s.scope.HivelocityMachine,
+		infrav1.DeviceDeProvisioningSucceededCondition,
+		infrav1.DeviceShutdownCalledReason,
+		clusterv1.ConditionSeverityInfo,
+		"device shutdown has been triggered",
+	)
+	return actionContinue{delay: 30 * time.Second}
+}
+
+func (s *Service) actionDeleteDeviceDeProvisionPowerIsOff(ctx context.Context, device hv.BareMetalDevice) actionResult {
+
log := s.scope.Logger.WithValues("function", "actionDeleteDeviceDeProvisionPowerIsOff") + deviceID := device.DeviceId opts := hv.BareMetalDeviceUpdate{ Hostname: s.scope.Name() + "-deleted.example.com", - Tags: newTags, OsName: defaultImageName, ForceReload: true, + Script: "", + Tags: device.Tags, } // Deprovision the device with default image. if _, err := s.scope.HVClient.ProvisionDevice(ctx, deviceID, opts); err != nil { // TODO: Handle error that machine is not shut down s.handleRateLimitExceeded(err, "ProvisionDevice") - record.Warnf(s.scope.HivelocityMachine, "FailedDeProvisionDevice", "Failed to de-provision device %d: %s", deviceID, err) + record.Warnf(s.scope.HivelocityMachine, "FailedCallProvisionToDeprovision", "Failed to call provision to deprovision device %d: %s", deviceID, err) return actionError{err: fmt.Errorf("failed to de-provision device %d: %s", deviceID, err)} } - record.Eventf(s.scope.HivelocityMachine, "SuccessfulDeProvisionDevice", "Successfully de-provision device %d", deviceID) + msg := fmt.Sprintf("Successfully called provision to deprovision %d with %s", + deviceID, opts.OsName) + record.Eventf(s.scope.HivelocityMachine, "SuccessfulCallProvisionToDeprovision", msg) log.V(1).Info("Completed function") - return actionComplete{} + return actionContinue{ + delay: time.Minute, + } } // actionDeleteDeviceDissociate ensures that the device has no tags of machine. @@ -519,6 +780,10 @@ func (s *Service) actionDeleteDeviceDissociate(ctx context.Context) actionResult log := s.scope.Logger.WithValues("function", "actionDeleteDeviceDissociate") log.V(1).Info("Started function") + if s.scope.HivelocityMachine.Spec.ProviderID == nil || *(s.scope.HivelocityMachine.Spec.ProviderID) == "" { + log.V(1).Info("No ProviderID, no need to dissociate device: actionComplete") + return actionComplete{} + } deviceID, err := s.scope.HivelocityMachine.DeviceIDFromProviderID() if err != nil { return actionError{err: fmt.Errorf("failed to get deviceID from providerID: %w", err)} @@ -529,13 +794,34 @@ func (s *Service) actionDeleteDeviceDissociate(ctx context.Context) actionResult s.handleRateLimitExceeded(err, "GetDevice") if errors.Is(err, hvclient.ErrDeviceNotFound) { // Nothing to do if device is not found - s.scope.Info("Unable to locate Hivelocity device by ID or tags") - record.Warnf(s.scope.HivelocityMachine, "NoDeviceFound", "Unable to find matching Hivelocity device for %s", s.scope.Name()) + msg := fmt.Sprintf("[actionDeleteDeviceDissociate] Unable to find matching Hivelocity device %d", deviceID) + s.scope.Info(msg) + record.Warnf(s.scope.HivelocityMachine, "NoDeviceFound", msg) return actionComplete{} } return actionError{err: fmt.Errorf("failed to get device: %w", err)} } + if device.PowerStatus != hvclient.PowerStatusOff { + err = s.scope.HVClient.ShutdownDevice(ctx, deviceID) + if err != nil { + s.handleRateLimitExceeded(err, "ShutdownDevice") + return actionError{err: fmt.Errorf("[actionDeleteDeviceDissociate] failed to shutdown device: %w", err)} + } + conditions.MarkFalse( + s.scope.HivelocityMachine, + infrav1.DeviceDeProvisioningSucceededCondition, + infrav1.DeviceShutdownCalledReason, + clusterv1.ConditionSeverityInfo, + fmt.Sprintf("shut down for device %d was called", deviceID), + ) + return actionContinue{delay: 30 * time.Second} + } + + conditions.MarkTrue( + s.scope.HivelocityMachine, + infrav1.DeviceDeProvisioningSucceededCondition) + newTags, updated1 := s.scope.HivelocityCluster.DeviceTag().RemoveFromList(device.Tags) newTags, updated2 := 
s.scope.HivelocityMachine.DeviceTag().RemoveFromList(newTags) newTags, updated3 := s.scope.DeviceTagMachineType().RemoveFromList(newTags) diff --git a/pkg/services/hivelocity/device/device_test.go b/pkg/services/hivelocity/device/device_test.go index 138f597cd..7292a0f96 100644 --- a/pkg/services/hivelocity/device/device_test.go +++ b/pkg/services/hivelocity/device/device_test.go @@ -34,7 +34,7 @@ func Test_findAvailableDeviceFromList(t *testing.T) { mockclient.NoTagsDevice, mockclient.FreeDevice, } - device := findAvailableDeviceFromList(devices, "fooDeviceType", "my-cluster", "my-machine") + device := findAvailableDeviceFromList(devices, "fooDeviceType", "my-cluster") require.Nil(t, device) } diff --git a/pkg/services/hivelocity/device/state_machine.go b/pkg/services/hivelocity/device/state_machine.go index 416b7722b..9e27f41fa 100644 --- a/pkg/services/hivelocity/device/state_machine.go +++ b/pkg/services/hivelocity/device/state_machine.go @@ -53,6 +53,7 @@ func (sm *stateMachine) handlers() map[infrav1.ProvisioningState]stateHandler { return map[infrav1.ProvisioningState]stateHandler{ infrav1.StateAssociateDevice: sm.handleAssociateDevice, infrav1.StateVerifyAssociate: sm.handleVerifyAssociate, + infrav1.StateVerifyShutdown: sm.handleVerifyShutdown, infrav1.StateProvisionDevice: sm.handleProvisionDevice, infrav1.StateDeviceProvisioned: sm.handleDeviceProvisioned, infrav1.StateDeleteDeviceDeProvision: sm.handleDeleteDeviceDeProvision, @@ -87,7 +88,7 @@ func (sm *stateMachine) ReconcileState(ctx context.Context) (actionRes actionRes } record.Warnf(sm.hvMachine, "NoHandlerFoundForState", "no handler found for state %q", initialState.ProvisioningState) - return actionError{fmt.Errorf("no handler found for state %q", initialState.ProvisioningState)} + return actionError{err: fmt.Errorf("no handler found for state %q", initialState.ProvisioningState)} } func (sm *stateMachine) handleAssociateDevice(ctx context.Context) actionResult { @@ -100,6 +101,20 @@ func (sm *stateMachine) handleAssociateDevice(ctx context.Context) actionResult func (sm *stateMachine) handleVerifyAssociate(ctx context.Context) actionResult { actResult := sm.reconciler.actionVerifyAssociate(ctx) + if _, ok := actResult.(actionComplete); ok { + sm.nextState = infrav1.StateVerifyShutdown + } + + // check whether we need to associate the machine to another device + actionGoBack, ok := actResult.(actionGoBack) + if ok { + sm.nextState = actionGoBack.nextState + } + return actResult +} + +func (sm *stateMachine) handleVerifyShutdown(ctx context.Context) actionResult { + actResult := sm.reconciler.actionVerifyShutdown(ctx) if _, ok := actResult.(actionComplete); ok { sm.nextState = infrav1.StateProvisionDevice } @@ -127,18 +142,16 @@ func (sm *stateMachine) handleProvisionDevice(ctx context.Context) actionResult } func (sm *stateMachine) handleDeviceProvisioned(ctx context.Context) actionResult { - actResult := sm.reconciler.actionDeviceProvisioned(ctx) - if _, ok := actResult.(actionComplete); ok { - sm.nextState = infrav1.StateDeviceProvisioned - } - return actResult + // This is the last state. 
+	return sm.reconciler.actionDeviceProvisioned(ctx)
 }
 
 func (sm *stateMachine) handleDeleteDeviceDeProvision(ctx context.Context) actionResult {
 	actResult := sm.reconciler.actionDeleteDeviceDeProvision(ctx)
 	if _, ok := actResult.(actionComplete); ok {
-		sm.nextState = infrav1.StateDeleteDevice
+		sm.nextState = infrav1.StateDeleteDeviceDissociate
 	}
+
 	return actResult
 }
diff --git a/pkg/services/hivelocity/hvtag/hvtag.go b/pkg/services/hivelocity/hvtag/hvtag.go
index d8f05d3dd..c62747934 100644
--- a/pkg/services/hivelocity/hvtag/hvtag.go
+++ b/pkg/services/hivelocity/hvtag/hvtag.go
@@ -42,6 +42,8 @@ const (
 
 	// DeviceTagKeyPermanentError is the key for machines which need a manual reset by a Hivelocity admin.
 	DeviceTagKeyPermanentError DeviceTagKey = "caphv-permanent-error"
+
+	// Attention: If you add a new DeviceTagKey, then extend the method IsValid()!
 )
 
 // Prefix returns the prefix based on this DeviceTagKey used in Hivelocity tag strings.
@@ -69,7 +71,8 @@ func (key DeviceTagKey) IsValid() bool {
 	return key == DeviceTagKeyMachine ||
 		key == DeviceTagKeyCluster ||
 		key == DeviceTagKeyDeviceType ||
-		key == DeviceTagKeyMachineType
+		key == DeviceTagKeyMachineType ||
+		key == DeviceTagKeyPermanentError
 }
 
 // DeviceTag defines the object that represents a key-value pair that is stored as tag of Hivelocity devices.
@@ -79,6 +82,8 @@ type DeviceTag struct {
 }
 
 // DeviceTagFromList takes the tag of a HV device and returns a DeviceTag or an error if it is invalid.
+// It returns ErrDeviceTagNotFound if no tag with the given key exists, and
+// ErrMultipleDeviceTagsFound if the key exists twice.
 func DeviceTagFromList(key DeviceTagKey, tagList []string) (DeviceTag, error) {
 	var found bool
 	var deviceTag DeviceTag
@@ -182,3 +187,40 @@ func (deviceTag DeviceTag) RemoveFromList(tagList []string) (newTagList []string
 	}
 	return newTagList, updated
 }
+
+// isEphemeralTag reports whether a tag should get removed
+// when a machine leaves a cluster.
+// tag: Something like "caphv-cluster-name=mycluster"
+func isEphemeralTag(tag string) bool {
+	// ignore tags that are not set by this controller
+	if !strings.HasPrefix(tag, "caphv-") {
+		return false
+	}
+
+	// ignore tags that are only allowed to be changed or removed by the user
+	for _, keepPrefix := range []string{
+		string(DeviceTagKeyPermanentError),
+		string(DeviceTagKeyDeviceType),
+	} {
+		if strings.HasPrefix(tag, keepPrefix+"=") {
+			return false
+		}
+	}
+
+	return true
+}
+
+// RemoveEphemeralTags removes ephemeral tags. Tags which are not
+// from caphv will remain in the list.
+// It creates a new slice of tags.
+// Usually called like this: newTags := RemoveEphemeralTags(device.Tags).
+func RemoveEphemeralTags(tags []string) []string {
+	newTags := make([]string, 0, len(tags))
+	for _, tag := range tags {
+		if isEphemeralTag(tag) {
+			continue
+		}
+		newTags = append(newTags, tag)
+	}
+	return newTags
+}
diff --git a/pkg/services/hivelocity/hvtag/hvtag_test.go b/pkg/services/hivelocity/hvtag/hvtag_test.go
index 564571f26..07386cb39 100644
--- a/pkg/services/hivelocity/hvtag/hvtag_test.go
+++ b/pkg/services/hivelocity/hvtag/hvtag_test.go
@@ -340,3 +340,39 @@ var _ = Describe("Test DeviceTag.RemoveFromList", func() {
 		}),
 	)
 })
+
+var _ = Describe("RemoveEphemeralTags", func() {
+	It("removes ephemeral tags, but keeps permanent tags", func() {
+		newTags := RemoveEphemeralTags([]string{
+			// non-ephemeral (keep)
+			DeviceTagKeyDeviceType.Prefix() + "my-device-type",
+			DeviceTagKeyPermanentError.Prefix() + "my-permanent-error",
+
+			// remove these:
+			DeviceTagKeyCluster.Prefix() + "my-cluster",
+			DeviceTagKeyMachine.Prefix() + "my-machine",
+			"some-other-tag",
+		})
+		Expect(newTags).To(Equal([]string{
+			"caphv-device-type=my-device-type",
+			"caphv-permanent-error=my-permanent-error",
+			"some-other-tag"}))
+	})
+})
+
+var _ = Describe("PermanentErrorTagFromList", func() {
+	It("returns the permanent-error tag from the list", func() {
+		tag, err := PermanentErrorTagFromList([]string{
+			DeviceTagKeyDeviceType.Prefix() + "my-device-type",
+			DeviceTagKeyPermanentError.Prefix() + "my-permanent-error",
+			DeviceTagKeyCluster.Prefix() + "my-cluster",
+			DeviceTagKeyMachine.Prefix() + "my-machine",
+			"some-other-tag",
+		})
+		Expect(err).To(BeNil())
+		Expect(tag).To(Equal(DeviceTag{
+			Key:   "caphv-permanent-error",
+			Value: "my-permanent-error",
+		}))
+	})
+})
diff --git a/test/claim-devices-or-fail/claim-devices-or-fail.go b/test/claim-devices-or-fail/claim-devices-or-fail.go
index 0bcfc4417..e875b9218 100644
--- a/test/claim-devices-or-fail/claim-devices-or-fail.go
+++ b/test/claim-devices-or-fail/claim-devices-or-fail.go
@@ -83,7 +83,10 @@ func releaseOldMachines(ctx context.Context, apiClient *hv.APIClient, deviceType
 	fmt.Printf("resetting labels of all devices which have %s=%s. Found %d devices\n", hvtag.DeviceTagKeyDeviceType, deviceType, len(devicesWithTag))
 
 	for _, device := range devicesWithTag {
-		err := resetTags(ctx, device, apiClient)
+		fmt.Printf(" resetting labels of device %d\n", device.DeviceId)
+		newTags := hvtag.RemoveEphemeralTags(device.Tags)
+		_, _, err := apiClient.DeviceApi.PutDeviceTagIdResource(ctx, device.DeviceId, hv.DeviceTag{
+			Tags: newTags}, nil)
 		if err != nil {
 			return err
 		}
@@ -91,38 +94,3 @@
 	}
 	return nil
 }
-
-// resetTags: Remove our labels, but keep DeviceTagKeyPermanentError and DeviceTagKeyDeviceType.
-// And keep other labels.
-func resetTags(ctx context.Context, device hv.BareMetalDevice, apiClient *hv.APIClient) error { - fmt.Printf(" resetting labels of device %d\n", device.DeviceId) - var newTags []string - for _, tag := range device.Tags { - if removeTag(tag) { - continue - } - newTags = append(newTags, tag) - } - _, _, err := apiClient.DeviceApi.PutDeviceTagIdResource(ctx, device.DeviceId, hv.DeviceTag{ - Tags: newTags}, nil) - if err != nil { - return err - } - return nil -} - -// tag: Something like caphv-cluster-name=hv-guettli -func removeTag(tag string) bool { - if !strings.HasPrefix(tag, "caphv-") { - return false - } - for _, keepPrefix := range []string{ - string(hvtag.DeviceTagKeyPermanentError), - string(hvtag.DeviceTagKeyDeviceType), - } { - if strings.HasPrefix(tag, keepPrefix) { - return false - } - } - return true -}
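
Note on the timing rules in this patch: actionVerifyShutdown combines several timeouts (the 5- and 25-minute reload limits from isReloadingTooLong, the 2-minute back-off after calling ShutdownDevice, and the 30-second/1-minute requeue delays). The following is a minimal, self-contained Go sketch of that decision logic only; powerState, decision and decide are illustrative names and are not code from this patch:

package main

import (
	"fmt"
	"time"
)

// powerState condenses what getPowerAndReloadingState derives from the device dump.
type powerState struct {
	isReloading bool
	isPoweredOn bool
}

type decision string

const (
	proceedToProvision decision = "proceed-to-provision" // actionComplete -> StateProvisionDevice
	waitWhileReloading decision = "wait-while-reloading" // actionContinue, requeue after 1 minute
	markPermanentError decision = "mark-permanent-error" // setReloadingTooLongTag -> StateAssociateDevice
	waitAfterShutdown  decision = "wait-after-shutdown"  // shutdown called < 2 minutes ago, requeue after 30s
	callShutdown       decision = "call-shutdown"        // ShutdownDevice, then requeue after 30s
)

// decide mirrors the branching of actionVerifyShutdown. Zero-value times mean
// "condition not set yet" (no reload or shutdown call observed so far).
func decide(state powerState, reloadingSince, shutdownCalledAt, now time.Time) decision {
	if !state.isPoweredOn && !state.isReloading {
		return proceedToProvision // device is shut down and will be provisioned
	}
	if state.isReloading {
		timeout := 5 * time.Minute
		if state.isPoweredOn {
			timeout = 25 * time.Minute // "reloading" during provisioning may take longer
		}
		if !reloadingSince.IsZero() && now.Sub(reloadingSince) > timeout {
			return markPermanentError
		}
		return waitWhileReloading
	}
	if !shutdownCalledAt.IsZero() && now.Sub(shutdownCalledAt) < 2*time.Minute {
		return waitAfterShutdown
	}
	return callShutdown
}

func main() {
	now := time.Now()
	// A device that has been reloading for 30 minutes while powered on gets flagged.
	fmt.Println(decide(powerState{isReloading: true, isPoweredOn: true},
		now.Add(-30*time.Minute), time.Time{}, now))
}

A device that reloads longer than the limit is handed to setReloadingTooLongTag, which tags it with caphv-permanent-error and sends the state machine back to StateAssociateDevice to pick another device.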
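
The tag written by setReloadingTooLongTag has the shape caphv-permanent-error=reloading-since-<RFC3339 timestamp>. Below is a hypothetical helper, assuming only that format, which an operator tool could use to read the flag back out of a device's tag list; reloadingSince is illustrative and not part of this patch:

package main

import (
	"fmt"
	"strings"
	"time"
)

// permanentErrorPrefix is hvtag.DeviceTagKeyPermanentError plus "=".
const permanentErrorPrefix = "caphv-permanent-error="

// reloadingSince reports when a device was flagged as reloading too long,
// based on the tag format written by setReloadingTooLongTag.
func reloadingSince(tags []string) (time.Time, bool) {
	for _, tag := range tags {
		value, found := strings.CutPrefix(tag, permanentErrorPrefix)
		if !found {
			continue
		}
		ts, found := strings.CutPrefix(value, "reloading-since-")
		if !found {
			continue
		}
		t, err := time.Parse(time.RFC3339, ts)
		if err != nil {
			continue
		}
		return t, true
	}
	return time.Time{}, false
}

func main() {
	tags := []string{
		"caphv-device-type=hvWorker",
		"caphv-permanent-error=reloading-since-2023-10-20T12:39:38+02:00",
	}
	if t, ok := reloadingSince(tags); ok {
		fmt.Println("device flagged as permanently unhealthy since", t)
	}
}

The controller never removes this tag itself: per the doc comment on DeviceTagKeyPermanentError, such machines need a manual reset by a Hivelocity admin, and RemoveEphemeralTags deliberately keeps the tag in place.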