Skip to content

Commit

Permalink
Merge pull request #112 from hivelocity/verify-shutdown
Browse files Browse the repository at this point in the history
✨ Auto-detect permanent-error
  • Loading branch information
guettli committed Nov 9, 2023
2 parents baf39f9 + acc3ffd commit 46b07b9
Show file tree
Hide file tree
Showing 17 changed files with 534 additions and 127 deletions.
25 changes: 25 additions & 0 deletions api/v1alpha1/conditions_const.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,36 @@ import (
const (
	// DeviceReadyCondition reports on current status of the device. Ready indicates the device is in a Running state.
	DeviceReadyCondition clusterv1.ConditionType = "DeviceReady"

	// DeviceNotFoundReason (Severity=Error) documents a HivelocityMachine controller detecting
	// that the underlying device cannot be found anymore.
	DeviceNotFoundReason = "DeviceNotFound"

	// DeviceTagsInvalidReason documents a HivelocityMachine controller detecting invalid device tags.
	DeviceTagsInvalidReason = "DeviceTagsInvalid"

	// DeviceReloadingTooLongReason indicates that the device has been reloading for too long.
	// The controller sets a corresponding tag, so that the machine can get reset by an operator.
	// Like the other reason values in this block, the value carries no "Reason" suffix.
	DeviceReloadingTooLongReason = "DeviceReloadingTooLong"
)

const (
// DeviceProvisioningSucceededCondition reports on whether the device has been successfully provisioned.
DeviceProvisioningSucceededCondition clusterv1.ConditionType = "DeviceProvisioningSucceeded"

// DeviceReloadingReason documents that the device is reloading.
DeviceReloadingReason = "DeviceReloading"

// DeviceShutdownCalledReason documents that a shutdown of the device has been attempted.
DeviceShutdownCalledReason = "DeviceShutdownCalled"

// DeviceShutDownReason documents that the device is shut down.
DeviceShutDownReason = "DeviceShutDown"
)

const (
// DeviceDeProvisioningSucceededCondition reports whether the device has been deprovisioned successfully.
DeviceDeProvisioningSucceededCondition clusterv1.ConditionType = "DeviceDeProvisioningSucceeded"
)

const (
Expand Down
8 changes: 7 additions & 1 deletion api/v1alpha1/hivelocitymachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ const (
const (
// FailureMessageDeviceNotFound indicates that the associated device could not be found.
FailureMessageDeviceNotFound = "device not found"

// FailureMessageDeviceTagsInvalid indicates that the associated device has invalid tags.
// This is probably due to a user changing device tags on his own.
FailureMessageDeviceTagsInvalid = "device tags invalid"
Expand Down Expand Up @@ -65,6 +66,9 @@ const (
// StateVerifyAssociate .
StateVerifyAssociate ProvisioningState = "verify-associate"

// StateVerifyShutdown .
StateVerifyShutdown ProvisioningState = "verify-shutdown"

// StateProvisionDevice .
StateProvisionDevice ProvisioningState = "provision-device"

Expand Down Expand Up @@ -112,7 +116,8 @@ type ControllerGeneratedStatus struct {

// HivelocityDeviceType defines the Hivelocity device type.
// +kubebuilder:validation:Enum=pool;hvCustom;hvControlPlane;hvWorker;e2eControlPlane;e2eWorker
type HivelocityDeviceType string
// hvlabel:foo=bar
type HivelocityDeviceType string // TODO: this should not be an enum. Rename to HVLabel, and make a label selector.

// HivelocityMachineStatus defines the observed state of HivelocityMachine.
type HivelocityMachineStatus struct {
Expand Down Expand Up @@ -158,6 +163,7 @@ type HivelocityMachineStatus struct {
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.ready",description="Machine ready status"
// +kubebuilder:printcolumn:name="ProviderID",type="string",JSONPath=".spec.providerID",description="ProviderID of machine object"
// +kubebuilder:printcolumn:name="Machine",type="string",JSONPath=".metadata.ownerReferences[?(@.kind==\"Machine\")].name",description="Machine object which owns with this HivelocityMachine"
// +kubebuilder:printcolumn:name="Prov.State",type="string",JSONPath=".spec.status.provisioningState"
// +kubebuilder:printcolumn:name="Reason",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].reason"
// +kubebuilder:printcolumn:name="Message",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].message"
// +k8s:defaulter-gen=true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ spec:
jsonPath: .metadata.ownerReferences[?(@.kind=="Machine")].name
name: Machine
type: string
- jsonPath: .spec.status.provisioningState
name: Prov.State
type: string
- jsonPath: .status.conditions[?(@.type=='Ready')].reason
name: Reason
type: string
Expand Down
4 changes: 2 additions & 2 deletions controllers/hivelocitycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ func (r *HivelocityClusterReconciler) reconcileNormal(ctx context.Context, clust
if machineType == "" {
return ctrl.Result{}, fmt.Errorf("Spec.Template.Spec.Type of HivelocityMachineTemplate %q is empty", name)
}
hvDevice, err := device.GetFirstDevice(ctx, clusterScope.HVClient, machineType, hvCluster, "")
hvDevice, err := device.GetFirstFreeDevice(ctx, clusterScope.HVClient, machineType, hvCluster, "")
if err != nil {
return ctrl.Result{}, fmt.Errorf("device.GetFirstDevice() failed: %w", err)
return ctrl.Result{}, fmt.Errorf("device.GetFirstFreeDevice() failed: %w", err)
}
log.Info(fmt.Sprintf("Setting hvCluster.Spec.ControlPlaneEndpoint.Host to %q (machineType=%s).",
hvDevice.PrimaryIp, machineType))
Expand Down
1 change: 1 addition & 0 deletions controllers/hivelocitymachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ func (r *HivelocityMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re
// Fetch the Machine.
machine, err := util.GetOwnerMachine(ctx, r.Client, hivelocityMachine.ObjectMeta)
if err != nil {
log.Error(err, "GetOwnerMachine failed")
return ctrl.Result{}, err
}
if machine == nil {
Expand Down
1 change: 1 addition & 0 deletions controllers/hivelocitymachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ var _ = Describe("HivelocityMachineReconciler", func() {

Eventually(func() bool {
if err := testEnv.Get(ctx, machineKey, hvMachine); err != nil {
testEnv.GetLogger().Info("machine resource does not exist yet")
return false
}
if hvMachine.Spec.ProviderID == nil {
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ go 1.20
//replace sigs.k8s.io/cluster-api/test/framework => sigs.k8s.io/cluster-api/test v1.4.2

require (
github.com/antihax/optional v1.0.0
github.com/blang/semver/v4 v4.0.0
github.com/go-logr/logr v1.2.4
github.com/go-logr/zapr v1.2.3
Expand Down Expand Up @@ -44,6 +43,7 @@ require (
github.com/Masterminds/sprig/v3 v3.2.3 // indirect
github.com/Microsoft/go-winio v0.5.0 // indirect
github.com/alessio/shellescape v1.4.1 // indirect
github.com/antihax/optional v1.0.0 // indirect
github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 // indirect
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect
github.com/beorn7/perks v1.0.1 // indirect
Expand Down
16 changes: 12 additions & 4 deletions hack/output-for-watch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,10 @@ kubectl get machines -A

print_heading hivelocitymachine

kubectl get hivelocitymachine -A "-o=custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name,Cluster:.metadata.labels.cluster\.x-k8s\.io/cluster-name,Type:.spec.type,State:.status.powerState,Ready:.status.ready,ProviderID:.spec.providerID,Machine:.metadata.ownerReferences[?(@.kind==\"Machine\")].name,IP:.status.addresses[?(@.type==\"InternalIP\")].address"

kubectl get hivelocitymachine -A
print_heading events

kubectl get events -A -o=wide --sort-by=.lastTimestamp | grep -vP 'LeaderElection' | tail -8
kubectl get events -A -o=wide --sort-by=.lastTimestamp | grep -vP 'LeaderElection|CSRApproved' | tail -8

print_heading conditions

Expand All @@ -51,6 +50,15 @@ print_heading logs

echo

# Summarize recent problems of the CAPI controller manager:
# - read the logs of the last 5 minutes,
# - keep lines containing "error" (case-insensitive) or a token like "E1234"
#   (presumably klog error-level prefixes - confirm),
# - drop two known-noisy messages (DaemonSet-managed Pods, TLS handshake errors),
# - show at most the last 7 matching lines.
# The heading is only printed when there is something to show.
capi_error="$(kubectl logs -n capi-system --since=5m deployments/capi-controller-manager | \
grep -iP 'error|\be\d\d\d\d\b' | \
grep -vP 'ignoring DaemonSet-managed Pods|TLS handshake error from' | \
tail -7)"
if [ -n "$capi_error" ]; then
print_heading capi controller errors
echo "$capi_error"
fi

ip=$(kubectl get cluster -A -o=jsonpath='{.items[*].spec.controlPlaneEndpoint.host}' | head -1)
if [ -z "$ip" ]; then
echo "❌ Could not get IP of control-plane"
Expand Down Expand Up @@ -93,7 +101,7 @@ else
echo "👌 number of nodes in wl-cluster is equal to number of machines in mgt-cluster"
fi

not_approved=$(KUBECONFIG=$kubeconfig_wl kubectl get csr --no-headers | grep -v Approved)
not_approved=$(KUBECONFIG=$kubeconfig_wl kubectl get csr --no-headers --sort-by='.metadata.creationTimestamp' | grep -v Approved | tail -8 )
if [ -n "$not_approved" ]; then
echo "❌ (CSRs)certificate signing requests which are not approved"
echo "$not_approved"
Expand Down
2 changes: 1 addition & 1 deletion hack/tail-controller-logs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
pod=$(kubectl -n capi-hivelocity-system get pods | grep caphv-controller-manager | cut -d' ' -f1)

if [ -z "$pod" ]; then
echo "failed to find caphv-controller-manager pod"
echo "failed to find caphv-controller-manager pod"
exit 1
fi

Expand Down
41 changes: 12 additions & 29 deletions pkg/services/hivelocity/client/hvclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ import (
"regexp"
"runtime/debug"
"strings"
"time"

"github.com/antihax/optional"
"github.com/go-logr/logr"
"github.com/hivelocity/cluster-api-provider-hivelocity/pkg/utils"
caphvversion "github.com/hivelocity/cluster-api-provider-hivelocity/pkg/version"
Expand All @@ -47,9 +45,9 @@ const PowerStatusOn = "ON"
// Client collects all methods used by the controller in the Hivelocity API.
type Client interface {
PowerOnDevice(ctx context.Context, deviceID int32) error
ShutdownDevice(ctx context.Context, deviceID int32) error
ProvisionDevice(ctx context.Context, deviceID int32, opts hv.BareMetalDeviceUpdate) (hv.BareMetalDevice, error)
ListDevices(context.Context) ([]hv.BareMetalDevice, error)
ShutdownDevice(ctx context.Context, deviceID int32) error
ListImages(ctx context.Context, productID int32) ([]string, error)
ListSSHKeys(context.Context) ([]hv.SshKeyResponse, error)

Expand All @@ -58,6 +56,8 @@ type Client interface {

// SetDeviceTags sets the tags to the given list.
SetDeviceTags(ctx context.Context, deviceID int32, tags []string) error

GetDeviceDump(ctx context.Context, deviceID int32) (hv.DeviceDump, error)
}

// Factory is the interface for creating new Client objects.
Expand Down Expand Up @@ -155,42 +155,20 @@ func (c *realClient) SetDeviceTags(ctx context.Context, deviceID int32, tags []s
return checkRateLimit(err)
}

func (c *realClient) PowerOnDevice(_ context.Context, _ int32) error {
return nil // todo
func (c *realClient) PowerOnDevice(ctx context.Context, deviceID int32) error {
_, _, err := c.client.DeviceApi.PostPowerResource(ctx, deviceID, "boot", nil) //nolint:bodyclose // Close() gets done in client
return err
}

func (c *realClient) ProvisionDevice(ctx context.Context, deviceID int32, opts hv.BareMetalDeviceUpdate) (hv.BareMetalDevice, error) {
log := log.FromContext(ctx)
var swaggerErr hv.GenericSwaggerError

power, _, err := c.client.DeviceApi.GetPowerResource(ctx, deviceID, nil) //nolint:bodyclose // Close() gets done in client
if errors.As(err, &swaggerErr) {
body := string(swaggerErr.Body())
log.Info("ProvisionDevice() failed (GetPowerResource)", "DeviceID", deviceID, "body", body)
}

if power.PowerStatus == PowerStatusOn {
// First we need to send "shutdown".
// https://developers.hivelocity.net/reference/post_power_resource
_, _, err := c.client.DeviceApi.PostPowerResource(ctx, deviceID, "shutdown", nil) //nolint:bodyclose // Close() gets done in client
if errors.As(err, &swaggerErr) {
body := string(swaggerErr.Body())
log.Info("ProvisionDevice() failed (PostPowerResource)", "DeviceID", deviceID, "body", body)
}
log.Info("ProvisionDevice() called PostPowerResource shutdown", "DeviceID", deviceID)
time.Sleep(30 * time.Second)
}

log.Info("calling ProvisionDevice()", "DeviceID", deviceID, "hostname", opts.Hostname, "OsName", opts.OsName,
"script", utils.FirstN(opts.Script, 50),
"ForceReload", opts.ForceReload)

// https://developers.hivelocity.net/reference/put_bare_metal_device_id_resource
localVars := hv.BareMetalDevicesApiPutBareMetalDeviceIdResourceOpts{
SkipPowerCheck: optional.NewBool(true),
}

device, _, err := c.client.BareMetalDevicesApi.PutBareMetalDeviceIdResource(ctx, deviceID, opts, &localVars) //nolint:bodyclose // Close() gets done in client
device, _, err := c.client.BareMetalDevicesApi.PutBareMetalDeviceIdResource(ctx, deviceID, opts, nil) //nolint:bodyclose // Close() gets done in client
if errors.As(err, &swaggerErr) {
body := string(swaggerErr.Body())
log.Info("ProvisionDevice() failed (PutBareMetalDeviceIdResource)", "DeviceID", deviceID, "body", body)
Expand Down Expand Up @@ -259,3 +237,8 @@ func checkRateLimit(err error) error {
}
return err
}

// GetDeviceDump fetches the device dump for the given device ID via
// DeviceApi.GetDeviceIdResource.
//
// The returned error is passed through checkRateLimit, the same way
// SetDeviceTags handles its error, so both API calls report errors consistently.
// NOTE(review): checkRateLimit presumably maps rate-limit responses of the API
// to a dedicated error - confirm against its implementation.
func (c *realClient) GetDeviceDump(ctx context.Context, deviceID int32) (hv.DeviceDump, error) {
	dump, _, err := c.client.DeviceApi.GetDeviceIdResource(ctx, deviceID, nil) //nolint:bodyclose // Close() gets done in client
	return dump, checkRateLimit(err)
}
35 changes: 35 additions & 0 deletions pkg/services/hivelocity/client/mock/mock_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ func (c *mockedHVClient) ProvisionDevice(_ context.Context, deviceID int32, opts
return hv.BareMetalDevice{}, fmt.Errorf("[ProvisionDevice] deviceID %d unknown", deviceID)
}
device.Tags = opts.Tags
device.PowerStatus = hvclient.PowerStatusOn
c.store.idMap[deviceID] = device
return device, nil
}
Expand Down Expand Up @@ -230,3 +231,37 @@ func (c *mockedHVClient) GetDevice(_ context.Context, deviceID int32) (hv.BareMe
}
return device, nil
}

// GetDeviceDump returns a device dump for the given device ID of the mocked store.
//
// Only DeviceId and PowerStatus carry real data: PowerStatus is copied from the
// device returned by GetDevice, all other fields are set to their zero values.
// If GetDevice fails, the error is returned (wrapped) together with an empty dump,
// instead of silently reporting a dump with an empty power status.
func (c *mockedHVClient) GetDeviceDump(ctx context.Context, deviceID int32) (hv.DeviceDump, error) {
	device, err := c.GetDevice(ctx, deviceID)
	if err != nil {
		return hv.DeviceDump{}, fmt.Errorf("[GetDeviceDump] GetDevice() failed: %w", err)
	}

	// All fields are listed explicitly, even when they hold the zero value.
	return hv.DeviceDump{
		DeviceId:           deviceID,
		Name:               "",
		Status:             "",
		DeviceType:         "",
		DeviceTypeGroup:    "",
		PowerStatus:        device.PowerStatus,
		HasCancellation:    false,
		IsManaged:          false,
		IsReload:           false,
		MonitorsUp:         0,
		MonitorsTotal:      0,
		ManagedAlertsTotal: 0,
		Ports:              []interface{}{},
		Hostname:           "",
		IpmiEnabled:        false,
		DisplayedTags:      []interface{}{},
		Tags:               []string{},
		Location:           nil,
		NetworkAutomation:  nil,
		PrimaryIp:          "",
		IpmiAddress:        nil,
		ServiceMonitors:    []string{},
		BillingInfo:        nil,
		ServicePlan:        0,
		LastInvoiceId:      0,
		SelfProvisioning:   false,
		Metadata:           nil,
		SpsStatus:          "",
	}, nil
}
Loading

0 comments on commit 46b07b9

Please sign in to comment.