feat: set IgnoreDaemonSetsUtilization per nodegroup for AWS (#5672)

Changes from commits: 7941bab, 7eb0910, eff7888, e1a22da, 8a73d8e
@@ -48,6 +48,8 @@ type NodeGroupAutoscalingOptions struct {
 	MaxNodeProvisionTime time.Duration
 	// ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
 	ZeroOrMaxNodeScaling bool
+	// IgnoreDaemonSetsUtilization sets if daemonsets utilization should be considered during node scale-down
+	IgnoreDaemonSetsUtilization bool
 }

 // GCEOptions contain autoscaling options specific to GCE cloud provider.
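Since NodeGroupAutoscalingOptions is now the per-group home for this setting, here is a minimal, self-contained sketch (illustrative names only, not the PR's code) of the resolve-with-fallback pattern this enables: a node group may override the option, and callers fall back to the cluster-wide defaults when it does not.

```go
package main

import "fmt"

// NodeGroupAutoscalingOptions is trimmed to the one field relevant here.
type NodeGroupAutoscalingOptions struct {
	IgnoreDaemonSetsUtilization bool
}

// NodeGroup models the one method we need: GetOptions may return nil,
// meaning "no per-group override, use the defaults".
type NodeGroup interface {
	GetOptions(defaults NodeGroupAutoscalingOptions) (*NodeGroupAutoscalingOptions, error)
}

// resolveIgnoreDaemonSetsUtilization returns the per-group value if the
// group defines one, otherwise the cluster-wide default.
func resolveIgnoreDaemonSetsUtilization(ng NodeGroup, defaults NodeGroupAutoscalingOptions) (bool, error) {
	opts, err := ng.GetOptions(defaults)
	if err != nil {
		return false, err
	}
	if opts == nil {
		return defaults.IgnoreDaemonSetsUtilization, nil
	}
	return opts.IgnoreDaemonSetsUtilization, nil
}

// staticGroup is a toy NodeGroup with a fixed (possibly absent) override.
type staticGroup struct{ opts *NodeGroupAutoscalingOptions }

func (g staticGroup) GetOptions(_ NodeGroupAutoscalingOptions) (*NodeGroupAutoscalingOptions, error) {
	return g.opts, nil
}

func main() {
	defaults := NodeGroupAutoscalingOptions{IgnoreDaemonSetsUtilization: false}
	override := staticGroup{opts: &NodeGroupAutoscalingOptions{IgnoreDaemonSetsUtilization: true}}
	inherit := staticGroup{opts: nil}

	v1, _ := resolveIgnoreDaemonSetsUtilization(override, defaults)
	v2, _ := resolveIgnoreDaemonSetsUtilization(inherit, defaults)
	fmt.Println(v1, v2) // true false
}
```

Here a nil result from GetOptions stands for "no per-group override", mirroring how per-node-group options are commonly resolved against defaults in Cluster Autoscaler.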
@@ -117,8 +119,6 @@ type AutoscalingOptions struct {
 	GRPCExpanderCert string
 	// GRPCExpanderURL is the url of the gRPC server when using the gRPC expander
 	GRPCExpanderURL string
-	// IgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
-	IgnoreDaemonSetsUtilization bool
 	// IgnoreMirrorPodsUtilization is whether CA will ignore Mirror pods when calculating resource utilization for scaling down
 	IgnoreMirrorPodsUtilization bool
 	// MaxGracefulTerminationSec is maximum number of seconds scale down waits for pods to terminate before

Review comment: Moved this to `NodeGroupAutoscalingOptions`.
@@ -16,6 +16,8 @@ limitations under the License.

 package config

+import "time"
+
 const (
 	// DefaultMaxClusterCores is the default maximum number of cores in the cluster.
 	DefaultMaxClusterCores = 5000 * 64
@@ -32,4 +34,14 @@ const (
 	DefaultScaleDownUnreadyTimeKey = "scaledownunreadytime"
 	// DefaultMaxNodeProvisionTimeKey identifies MaxNodeProvisionTime autoscaling option
 	DefaultMaxNodeProvisionTimeKey = "maxnodeprovisiontime"
+	// DefaultIgnoreDaemonSetsUtilizationKey identifies IgnoreDaemonSetsUtilization autoscaling option
+	DefaultIgnoreDaemonSetsUtilizationKey = "ignoredaemonsetsutilization"
+	// DefaultScaleDownUnneededTime identifies ScaleDownUnneededTime autoscaling option
+	DefaultScaleDownUnneededTime = 10 * time.Minute
+	// DefaultScaleDownUnreadyTime identifies ScaleDownUnreadyTime autoscaling option
+	DefaultScaleDownUnreadyTime = 20 * time.Minute
+	// DefaultScaleDownUtilizationThreshold identifies ScaleDownUtilizationThreshold autoscaling option
+	DefaultScaleDownUtilizationThreshold = 0.5
+	// DefaultScaleDownGpuUtilizationThreshold identifies ScaleDownGpuUtilizationThreshold autoscaling option
+	DefaultScaleDownGpuUtilizationThreshold = 0.5
 )

Review thread on `DefaultIgnoreDaemonSetsUtilizationKey`:

Reviewer: This is only used in the AWS provider.

Reviewer: +1

Author: @x13n as of now, yes, but this is a generic option like the other option keys defined here. I am fine with moving it to the AWS provider since no other cloud provider is using it now, but say the Azure provider wants to support this key tomorrow: they'd have to define their own constant with the same name (or, maybe worse, a different name). So far it seems like we have kept such keys common across providers. If we want to keep them separate, I think that would be going against what we have been doing so far. It can be argued that every cloud provider should have the freedom to define this key as they like, but that opens up the possibility of different cloud providers naming this key differently. I am not sure that's a good idea. Any feedback on this is welcome. If both of you think it's better to move this to the AWS provider, I am happy to move the key there.
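For AWS, the lowercase key above would let a per-ASG value arrive via tags. A rough sketch, assuming a plain map of tag suffixes to values (the real provider's tag discovery and prefix handling are omitted): override the cluster-wide default only when the tag is present and parses as a boolean.

```go
package main

import (
	"fmt"
	"strconv"
)

// NodeGroupAutoscalingOptions mirrors the subset of the config struct used here.
type NodeGroupAutoscalingOptions struct {
	IgnoreDaemonSetsUtilization bool
}

const ignoreDaemonSetsUtilizationKey = "ignoredaemonsetsutilization"

// optionsFromTags overrides defaults with any per-group values found in tags.
// Malformed or missing values fall back to the defaults untouched.
func optionsFromTags(tags map[string]string, defaults NodeGroupAutoscalingOptions) NodeGroupAutoscalingOptions {
	opts := defaults // start from the cluster-wide defaults
	if raw, ok := tags[ignoreDaemonSetsUtilizationKey]; ok {
		if v, err := strconv.ParseBool(raw); err == nil {
			opts.IgnoreDaemonSetsUtilization = v
		}
	}
	return opts
}

func main() {
	defaults := NodeGroupAutoscalingOptions{IgnoreDaemonSetsUtilization: false}
	perGroup := optionsFromTags(map[string]string{ignoreDaemonSetsUtilizationKey: "true"}, defaults)
	fmt.Println(perGroup.IgnoreDaemonSetsUtilization) // true: the ASG tag wins over the default
}
```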
@@ -52,10 +52,18 @@ type Actuator struct {
 	// This is a larger change to the code structure which impacts some existing actuator unit tests
 	// as well as Cluster Autoscaler implementations that may override ScaleDownSetProcessor
 	budgetProcessor *budgets.ScaleDownBudgetProcessor
+	configGetter    actuatorNodeGroupConfigGetter
 }

+// actuatorNodeGroupConfigGetter is an interface to limit the functions that can be used
+// from NodeGroupConfigProcessor interface
+type actuatorNodeGroupConfigGetter interface {
+	// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
+	GetIgnoreDaemonSetsUtilization(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (bool, error)
+}
+
 // NewActuator returns a new instance of Actuator.
-func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions simulator.NodeDeleteOptions) *Actuator {
+func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions simulator.NodeDeleteOptions, configGetter actuatorNodeGroupConfigGetter) *Actuator {
 	ndb := NewNodeDeletionBatcher(ctx, csr, ndt, ctx.NodeDeletionBatcherInterval)
 	return &Actuator{
 		ctx: ctx,

Review comment: This interface is to limit the functions that can be used from `NodeGroupConfigProcessor`.
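A side benefit of the narrow interface: unit tests can satisfy it with a tiny fake instead of a full NodeGroupConfigProcessor. A hypothetical stub, with stand-in types for *context.AutoscalingContext and cloudprovider.NodeGroup (names chosen for illustration, not taken from the PR):

```go
package main

import "fmt"

// Stand-ins for *context.AutoscalingContext and cloudprovider.NodeGroup,
// just enough to mirror the interface shape.
type AutoscalingContext struct{}

type NodeGroup interface{ Id() string }

// actuatorNodeGroupConfigGetter mirrors the narrow interface from the diff.
type actuatorNodeGroupConfigGetter interface {
	GetIgnoreDaemonSetsUtilization(ctx *AutoscalingContext, ng NodeGroup) (bool, error)
}

// fakeConfigGetter returns canned per-group answers; handy in unit tests,
// where wiring up a real config processor would be overkill.
type fakeConfigGetter struct {
	byGroup map[string]bool
	def     bool
}

func (f fakeConfigGetter) GetIgnoreDaemonSetsUtilization(_ *AutoscalingContext, ng NodeGroup) (bool, error) {
	if v, ok := f.byGroup[ng.Id()]; ok {
		return v, nil
	}
	return f.def, nil
}

type group string

func (g group) Id() string { return string(g) }

func main() {
	var getter actuatorNodeGroupConfigGetter = fakeConfigGetter{
		byGroup: map[string]bool{"asg-a": true},
		def:     false,
	}
	a, _ := getter.GetIgnoreDaemonSetsUtilization(&AutoscalingContext{}, group("asg-a"))
	b, _ := getter.GetIgnoreDaemonSetsUtilization(&AutoscalingContext{}, group("asg-b"))
	fmt.Println(a, b) // true false
}
```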
@@ -64,6 +72,7 @@ func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterState
 		nodeDeletionScheduler: NewGroupDeletionScheduler(ctx, ndt, ndb, NewDefaultEvictor(deleteOptions, ndt)),
 		budgetProcessor:       budgets.NewScaleDownBudgetProcessor(ctx),
 		deleteOptions:         deleteOptions,
+		configGetter:          configGetter,
 	}
 }
@@ -263,8 +272,14 @@ func (a *Actuator) scaleDownNodeToReport(node *apiv1.Node, drain bool) (*status.
 	if err != nil {
 		return nil, err
 	}

+	ignoreDaemonSetsUtilization, err := a.configGetter.GetIgnoreDaemonSetsUtilization(a.ctx, nodeGroup)
+	if err != nil {
+		return nil, err
+	}
+
 	gpuConfig := a.ctx.CloudProvider.GetNodeGpuConfig(node)
-	utilInfo, err := utilization.Calculate(nodeInfo, a.ctx.IgnoreDaemonSetsUtilization, a.ctx.IgnoreMirrorPodsUtilization, gpuConfig, time.Now())
+	utilInfo, err := utilization.Calculate(nodeInfo, ignoreDaemonSetsUtilization, a.ctx.IgnoreMirrorPodsUtilization, gpuConfig, time.Now())
 	if err != nil {
 		return nil, err
 	}
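To see what the switch from a.ctx.IgnoreDaemonSetsUtilization to a per-group value changes at this call site, a toy calculation with made-up requests (not from the PR): whether a node falls below the scale-down utilization threshold can hinge on counting DaemonSet pods.

```go
package main

import "fmt"

func main() {
	const (
		allocatableCPU = 4.0 // cores the node offers to pods (made-up)
		workloadCPU    = 1.2 // requests from regular pods (made-up)
		daemonSetCPU   = 1.0 // requests from DaemonSet pods (made-up)
		threshold      = 0.5 // matches DefaultScaleDownUtilizationThreshold above
	)

	withDS := (workloadCPU + daemonSetCPU) / allocatableCPU // 0.55
	withoutDS := workloadCPU / allocatableCPU               // 0.30

	// Counting DaemonSets yields 0.55, above the threshold, so the node looks
	// busy; ignoring them yields 0.30, marking it as a scale-down candidate.
	fmt.Printf("with DaemonSets: %.2f (scale-down candidate: %v)\n", withDS, withDS < threshold)
	fmt.Printf("ignoring DaemonSets: %.2f (scale-down candidate: %v)\n", withoutDS, withoutDS < threshold)
}
```

With the per-node-group setting, two node groups in the same cluster can reach opposite conclusions for identical workloads, which is the point of the PR.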
Review comment: `ScaleDownUnreadyTime` here is a wrong key (for testing).