feat: update scale down status after every scale up
- move scale-down delay status to the cluster state registry
- keep scale down enabled after a scale-up when `ScaleDownDelayTypeLocal` is enabled (the after-add cooldown then applies per node group instead of cluster-wide)
- add new funcs on the cluster state registry to get and update the scale-down delay status
- use timestamps instead of booleans to track the scale-down delay status
Signed-off-by: vadasambar <surajrbanakar@gmail.com>
vadasambar committed May 8, 2023
1 parent 3b4206f commit e90529b
Showing 6 changed files with 38 additions and 12 deletions.
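
Before the per-file diffs, a minimal, self-contained Go sketch of the idea described in the commit message above, using hypothetical names (ScaleDownDelay and PerNG mirror the diff; inCooldown and the node-group IDs are made up for illustration): each node group keeps the timestamp of its last scale-up, and it counts as being in scale-down cooldown only while that timestamp plus the configured delay is still in the future.

package main

import (
	"fmt"
	"time"
)

// ScaleDownDelay stores one "last scale-up" timestamp per node group instead of a
// boolean, so the cooldown expires on its own once the configured delay has passed.
type ScaleDownDelay struct {
	PerNG map[string]time.Time
}

// inCooldown reports whether the given node group was scaled up less than delay ago.
func (s ScaleDownDelay) inCooldown(nodeGroupID string, delay time.Duration, now time.Time) bool {
	lastScaleUp, ok := s.PerNG[nodeGroupID]
	return ok && lastScaleUp.Add(delay).After(now)
}

func main() {
	delay := 10 * time.Minute
	status := ScaleDownDelay{PerNG: map[string]time.Time{}}

	// A scale-up on ng-1 records the current time for that node group only.
	status.PerNG["ng-1"] = time.Now()

	fmt.Println(status.inCooldown("ng-1", delay, time.Now())) // true: scaled up just now
	fmt.Println(status.inCooldown("ng-2", delay, time.Now())) // false: never scaled up
}

With a boolean the entry would have to be cleared explicitly once the delay passes; with a timestamp the check simply stops matching, which is why the diff switches PerNG from map[string]bool to map[string]time.Time.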
21 changes: 21 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -28,6 +28,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/api"
"k8s.io/autoscaler/cluster-autoscaler/clusterstate/utils"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
node_processors "k8s.io/autoscaler/cluster-autoscaler/processors/nodes"
"k8s.io/autoscaler/cluster-autoscaler/utils/backoff"
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
"k8s.io/autoscaler/cluster-autoscaler/utils/taints"
@@ -136,6 +137,9 @@ type ClusterStateRegistry struct {
cloudProviderNodeInstancesCache *utils.CloudProviderNodeInstancesCache
interrupt chan struct{}
maxNodeProvisionTimeProvider maxNodeProvisionTimeProvider
// scaleDownDelayStatus is for tracking nodegroups which should be
// in scaledown cooldown
scaleDownDelayStatus *node_processors.ScaleDownDelayStatus

// scaleUpFailures contains information about scale-up failures for each node group. It should be
// cleared periodically to avoid unnecessary accumulation.
@@ -168,6 +172,11 @@ func NewClusterStateRegistry(cloudProvider cloudprovider.CloudProvider, config C
interrupt: make(chan struct{}),
scaleUpFailures: make(map[string][]ScaleUpFailure),
maxNodeProvisionTimeProvider: maxNodeProvisionTimeProvider,
scaleDownDelayStatus: &node_processors.ScaleDownDelayStatus{
ScaleDownDelayAfterAdd: node_processors.ScaleDownDelay{
PerNG: make(map[string]time.Time),
},
},
}
}

@@ -193,6 +202,13 @@ func (csr *ClusterStateRegistry) RegisterOrUpdateScaleUp(nodeGroup cloudprovider
csr.registerOrUpdateScaleUpNoLock(nodeGroup, delta, currentTime)
}

// UpdateScaleDownDelayStatus updates ScaleDownDelayStatus to keep track of nodegroups which were recently scaled up
func (csr *ClusterStateRegistry) UpdateScaleDownDelayStatus(nodeGroup cloudprovider.NodeGroup, currentTime time.Time) {
csr.Lock()
defer csr.Unlock()
csr.scaleDownDelayStatus.ScaleDownDelayAfterAdd.PerNG[nodeGroup.Id()] = currentTime
}

// MaxNodeProvisionTime returns MaxNodeProvisionTime value that should be used for the given NodeGroup.
func (csr *ClusterStateRegistry) MaxNodeProvisionTime(nodeGroup cloudprovider.NodeGroup) (time.Duration, error) {
return csr.maxNodeProvisionTimeProvider.GetMaxNodeProvisionTime(nodeGroup)
@@ -1211,3 +1227,8 @@ func (csr *ClusterStateRegistry) GetScaleUpFailures() map[string][]ScaleUpFailur
}
return result
}

// GetScaleDownDelayStatus returns scale-down delay status.
func (csr *ClusterStateRegistry) GetScaleDownDelayStatus() *node_processors.ScaleDownDelayStatus {
return csr.scaleDownDelayStatus
}
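
The scale-up path writes this map while the rest of the loop reads cluster state, so UpdateScaleDownDelayStatus takes the registry lock around the map write. A rough, self-contained sketch of that pattern, where registry, recordScaleUp and the node-group IDs are hypothetical stand-ins rather than the real ClusterStateRegistry API:

package main

import (
	"fmt"
	"sync"
	"time"
)

// registry is a tiny stand-in for ClusterStateRegistry: a mutex guarding the
// per-node-group "last scale-up" timestamps.
type registry struct {
	sync.Mutex
	perNG map[string]time.Time
}

// recordScaleUp mirrors the shape of UpdateScaleDownDelayStatus: lock, write, unlock.
func (r *registry) recordScaleUp(nodeGroupID string, now time.Time) {
	r.Lock()
	defer r.Unlock()
	r.perNG[nodeGroupID] = now
}

func main() {
	r := &registry{perNG: map[string]time.Time{}}

	var wg sync.WaitGroup
	for _, ng := range []string{"ng-1", "ng-2"} {
		wg.Add(1)
		go func(id string) {
			defer wg.Done()
			r.recordScaleUp(id, time.Now()) // concurrent writes are safe behind the mutex
		}(ng)
	}
	wg.Wait()

	fmt.Println(len(r.perNG)) // 2
}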
@@ -591,6 +591,7 @@ func (o *ScaleUpOrchestrator) executeScaleUp(
o.clusterStateRegistry.RegisterFailedScaleUp(info.Group, metrics.FailedScaleUpReason(string(aerr.Type())), now)
return aerr
}
o.clusterStateRegistry.UpdateScaleDownDelayStatus(info.Group, time.Now())
o.clusterStateRegistry.RegisterOrUpdateScaleUp(
info.Group,
increase,
11 changes: 5 additions & 6 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -47,7 +47,6 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/expander"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors"
node_processors "k8s.io/autoscaler/cluster-autoscaler/processors/nodes"
"k8s.io/autoscaler/cluster-autoscaler/processors/status"
"k8s.io/autoscaler/cluster-autoscaler/simulator"
"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
@@ -92,7 +91,6 @@ type StaticAutoscaler struct {
processorCallbacks *staticAutoscalerProcessorCallbacks
initialized bool
taintConfig taints.TaintConfig
scaleDownDelayStatus *node_processors.ScaleDownDelayStatus
}

type staticAutoscalerProcessorCallbacks struct {
@@ -208,9 +206,6 @@ func NewStaticAutoscaler(
processorCallbacks: processorCallbacks,
clusterStateRegistry: clusterStateRegistry,
taintConfig: taintConfig,
scaleDownDelayStatus: &node_processors.ScaleDownDelayStatus{
ScaleDownDelayAfterAdd: node_processors.ScaleDownDelay{},
},
}
}

@@ -594,7 +589,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
} else {
var err caerrors.AutoscalerError
scaleDownCandidates, err = a.processors.ScaleDownNodeProcessor.GetScaleDownCandidates(
autoscalingContext, allNodes, a.scaleDownDelayStatus)
autoscalingContext, allNodes, a.clusterStateRegistry.GetScaleDownDelayStatus())
if err != nil {
klog.Error(err)
return err
@@ -625,6 +620,10 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
a.lastScaleDownFailTime.Add(a.ScaleDownDelayAfterFailure).After(currentTime) ||
a.lastScaleDownDeleteTime.Add(a.ScaleDownDelayAfterDelete).After(currentTime)

if !a.ScaleDownDelayTypeLocal {
scaleDownInCooldown = scaleDownInCooldown || a.lastScaleUpTime.Add(a.ScaleDownDelayAfterAdd).After(currentTime)
}

klog.V(4).Infof("Scale down status: lastScaleUpTime=%s lastScaleDownDeleteTime=%v "+
"lastScaleDownFailTime=%s scaleDownForbidden=%v scaleDownInCooldown=%v",
a.lastScaleUpTime, a.lastScaleDownDeleteTime, a.lastScaleDownFailTime,
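
The RunOnce change above keeps the cluster-wide after-add cooldown only when the per-node-group delay mode is off. A small sketch of that decision, using local variables as hypothetical stand-ins for the StaticAutoscaler fields and options:

package main

import (
	"fmt"
	"time"
)

func main() {
	now := time.Now()

	// Hypothetical stand-ins for StaticAutoscaler state and options.
	scaleDownDelayTypeLocal := false // per-node-group delay mode off
	lastScaleUpTime := now.Add(-5 * time.Minute)
	scaleDownDelayAfterAdd := 10 * time.Minute

	scaleDownInCooldown := false // assume no other cooldown condition fired

	// Only when the local (per-node-group) delay is disabled does the
	// cluster-wide after-add cooldown still contribute, as in the diff above.
	if !scaleDownDelayTypeLocal {
		scaleDownInCooldown = scaleDownInCooldown || lastScaleUpTime.Add(scaleDownDelayAfterAdd).After(now)
	}

	fmt.Println(scaleDownInCooldown) // true: last scale-up was 5m ago, delay is 10m
}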
5 changes: 3 additions & 2 deletions cluster-autoscaler/processors/nodes/types.go
@@ -17,6 +17,8 @@ limitations under the License.
package nodes

import (
"time"

apiv1 "k8s.io/api/core/v1"

"k8s.io/autoscaler/cluster-autoscaler/context"
@@ -48,6 +50,5 @@ type ScaleDownDelayStatus struct {
}

type ScaleDownDelay struct {
PerNG map[string]bool
Aggregated bool
PerNG map[string]time.Time
}
@@ -18,6 +18,7 @@ package scaledowncandidates

import (
"reflect"
"time"

apiv1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
@@ -54,10 +55,13 @@ func (p *ScaleDownCandidatesDelayProcessor) GetScaleDownCandidates(ctx *context.
continue
}

if _, ok := scaleDownDelayStatus.ScaleDownDelayAfterAdd.PerNG[nodeGroup.Id()]; ok {
klog.V(4).Infof("Skipping scale down on node group %s because it was scaled up recently", nodeGroup.Id())
lastScaledUpTime, ok := scaleDownDelayStatus.ScaleDownDelayAfterAdd.PerNG[nodeGroup.Id()]

if ok && lastScaledUpTime.Add(ctx.ScaleDownDelayAfterAdd).After(time.Now()) {
klog.V(4).Infof("Skipping scale down on node group %s because it was scaled up recently at %v", nodeGroup.Id(), lastScaledUpTime.String())
continue
}

result = append(result, node)
}
return result, nil
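
To make the candidate filtering above concrete, a rough, self-contained sketch of the same skip-if-recently-scaled-up loop, with hypothetical node and node-group names and a plain map standing in for the real node-to-node-group lookup:

package main

import (
	"fmt"
	"time"
)

func main() {
	scaleDownDelayAfterAdd := 10 * time.Minute
	now := time.Now()

	// Hypothetical cluster: which node group each node belongs to,
	// and when each node group was last scaled up.
	nodeToNodeGroup := map[string]string{"node-a": "ng-1", "node-b": "ng-2"}
	lastScaleUp := map[string]time.Time{
		"ng-1": now.Add(-2 * time.Minute),  // still inside the after-add delay
		"ng-2": now.Add(-30 * time.Minute), // delay already expired
	}

	var candidates []string
	for node, ng := range nodeToNodeGroup {
		if t, ok := lastScaleUp[ng]; ok && t.Add(scaleDownDelayAfterAdd).After(now) {
			fmt.Printf("skipping %s: node group %s scaled up recently at %v\n", node, ng, t)
			continue
		}
		candidates = append(candidates, node)
	}

	fmt.Println(candidates) // only node-b survives the filter
}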
@@ -40,8 +40,8 @@ func (p *ScaleDownCandidatesSortingProcessor) GetPodDestinationCandidates(ctx *c

// GetScaleDownCandidates returns filtered nodes and moves previous scale-down candidates to the beginning of the list.
func (p *ScaleDownCandidatesSortingProcessor) GetScaleDownCandidates(ctx *context.AutoscalingContext,
nodes []*apiv1.Node, _ *nodes.ScaleDownDelayStatus) ([]*apiv1.Node, errors.AutoscalerError) {
candidates, err := p.preFilter.GetScaleDownCandidates(ctx, nodes)
nodes []*apiv1.Node, scaleDownDelayStatus *nodes.ScaleDownDelayStatus) ([]*apiv1.Node, errors.AutoscalerError) {
candidates, err := p.preFilter.GetScaleDownCandidates(ctx, nodes, scaleDownDelayStatus)
if err != nil {
return candidates, err
}
