Skip to content

Commit

Permalink
feat: force delete member cluster (#905)
Browse files Browse the repository at this point in the history
  • Loading branch information
Arvindthiru authored Aug 23, 2024
1 parent 988f973 commit caf4233
Show file tree
Hide file tree
Showing 9 changed files with 365 additions and 395 deletions.
1 change: 1 addition & 0 deletions charts/hub-agent/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ spec:
- --max-fleet-size={{ .Values.MaxFleetSizeSupported }}
- --hub-api-qps={{ .Values.hubAPIQPS }}
- --hub-api-burst={{ .Values.hubAPIBurst }}
- --force-delete-wait-time={{ .Values.forceDeleteWaitTime }}
ports:
- name: metrics
containerPort: 8080
Expand Down
1 change: 1 addition & 0 deletions charts/hub-agent/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ enableWebhook: true
webhookServiceName: fleetwebhook
enableGuardRail: true
webhookClientConnectionType: service
forceDeleteWaitTime: 15m0s

namespace:
fleet-system
Expand Down
1 change: 1 addition & 0 deletions cmd/hubagent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ func main() {
Client: mgr.GetClient(),
NetworkingAgentsEnabled: opts.NetworkingAgentsEnabled,
MaxConcurrentReconciles: int(math.Ceil(float64(opts.MaxFleetSizeSupported) / 100)), //one member cluster reconciler routine per 100 member clusters
ForceDeleteWaitTime: opts.ForceDeleteWaitTime.Duration,
}).SetupWithManager(mgr); err != nil {
klog.ErrorS(err, "unable to create v1beta1 controller", "controller", "MemberCluster")
exitWithErrorFunc()
Expand Down
3 changes: 3 additions & 0 deletions cmd/hubagent/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ type Options struct {
EnableV1Alpha1APIs bool
// EnableV1Beta1APIs enables the agents to watch the v1beta1 CRs.
EnableV1Beta1APIs bool
// ForceDeleteWaitTime is the duration the hub agent waits before force deleting a member cluster.
ForceDeleteWaitTime metav1.Duration
}

// NewOptions builds an empty options.
Expand Down Expand Up @@ -133,6 +135,7 @@ func (o *Options) AddFlags(flags *flag.FlagSet) {
flags.IntVar(&o.MaxFleetSizeSupported, "max-fleet-size", 100, "The max number of member clusters supported in this fleet")
flags.BoolVar(&o.EnableV1Alpha1APIs, "enable-v1alpha1-apis", false, "If set, the agents will watch for the v1alpha1 APIs.")
flags.BoolVar(&o.EnableV1Beta1APIs, "enable-v1beta1-apis", true, "If set, the agents will watch for the v1beta1 APIs.")
flags.DurationVar(&o.ForceDeleteWaitTime.Duration, "force-delete-wait-time", 15*time.Minute, "The duration the hub agent waits before force deleting a member cluster.")

o.RateLimiterOpts.AddFlags(flags)
}
14 changes: 10 additions & 4 deletions pkg/controllers/membercluster/v1beta1/membercluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ type Reconciler struct {
NetworkingAgentsEnabled bool
// the max number of concurrent reconciles per controller.
MaxConcurrentReconciles int
// the wait duration before we force delete a member cluster.
ForceDeleteWaitTime time.Duration
// agents are used as hashset to query the expected agent type, so the value will be ignored.
agents map[clusterv1beta1.AgentType]bool
}
Expand Down Expand Up @@ -158,8 +160,6 @@ func (r *Reconciler) handleDelete(ctx context.Context, mc *clusterv1beta1.Member
}
// calculate the current status of the member cluster from imc status
r.syncInternalMemberClusterStatus(currentImc, mc)
// TODO: check the last heartbeat time from all agents and assume the member cluster is left if they haven't sent heartbeat
// beyond a pre-agreed threshold.
// check if the cluster is already left
mcJoinedCondition := meta.FindStatusCondition(mc.Status.Conditions, string(clusterv1beta1.ConditionTypeMemberClusterJoined))
if condition.IsConditionStatusFalse(mcJoinedCondition, mc.GetGeneration()) {
Expand All @@ -169,15 +169,21 @@ func (r *Reconciler) handleDelete(ctx context.Context, mc *clusterv1beta1.Member
}
return runtime.Result{Requeue: true}, controller.NewUpdateIgnoreConflictError(r.updateMemberClusterStatus(ctx, mc))
}
// check to see if we can force delete member cluster.
if currentImc.Spec.State == clusterv1beta1.ClusterStateLeave && time.Since(mc.DeletionTimestamp.Time) >= r.ForceDeleteWaitTime {
klog.V(2).InfoS("Force delete the member cluster, by garbage collecting owned resources", "memberCluster", mcObjRef)
return runtime.Result{Requeue: true}, r.garbageCollect(ctx, mc)
}
klog.V(2).InfoS("Need to wait for the agent to leave", "memberCluster", mcObjRef, "joinedCondition", mcJoinedCondition)
// mark the imc as left to make sure the agent is leaving the fleet
if err := r.leave(ctx, mc, currentImc); err != nil {
klog.ErrorS(err, "Failed to mark the imc as leave", "memberCluster", mcObjRef)
return runtime.Result{}, err
}
// update the mc status to track the leaving status while we wait for all the agents to leave.
// once the imc is updated, the mc controller will reconcile again.
return runtime.Result{}, controller.NewUpdateIgnoreConflictError(r.updateMemberClusterStatus(ctx, mc))
// once the imc is updated, the mc controller will reconcile again, or we reconcile to force delete
// the member cluster after the force delete wait time.
return runtime.Result{RequeueAfter: r.ForceDeleteWaitTime}, controller.NewUpdateIgnoreConflictError(r.updateMemberClusterStatus(ctx, mc))
}

func (r *Reconciler) getInternalMemberCluster(ctx context.Context, name string) (*clusterv1beta1.InternalMemberCluster, error) {
Expand Down
Loading

0 comments on commit caf4233

Please sign in to comment.