diff --git a/pkg/autoscaling/estimator/percentile.go b/pkg/autoscaling/estimator/percentile.go index dd6e98378..c31914002 100644 --- a/pkg/autoscaling/estimator/percentile.go +++ b/pkg/autoscaling/estimator/percentile.go @@ -92,7 +92,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi predictErrs = append(predictErrs, err) } - if len(tsList) > 1 && len(tsList[0].Samples) > 1 { + if len(tsList) > 0 && len(tsList[0].Samples) > 0 { cpuValue := int64(tsList[0].Samples[0].Value * 1000) recommendResource[corev1.ResourceCPU] = *resource.NewMilliQuantity(cpuValue, resource.DecimalSI) } else { @@ -104,7 +104,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi predictErrs = append(predictErrs, err) } - if len(tsList) > 1 && len(tsList[0].Samples) > 1 { + if len(tsList) > 0 && len(tsList[0].Samples) > 0 { memValue := int64(tsList[0].Samples[0].Value) recommendResource[corev1.ResourceMemory] = *resource.NewQuantity(memValue, resource.BinarySI) } else { diff --git a/pkg/controller/evpa/container_policy.go b/pkg/controller/evpa/container_policy.go index 61ad2bc1a..638073e99 100644 --- a/pkg/controller/evpa/container_policy.go +++ b/pkg/controller/evpa/container_policy.go @@ -28,16 +28,19 @@ func (c *EffectiveVPAController) ReconcileContainerPolicies(evpa *autoscalingapi recommendation = evpa.Status.Recommendation rankedEstimators := RankEstimators(resourceEstimators) - for _, containerPolicy := range evpa.Spec.ResourcePolicy.ContainerPolicies { - // container scaling is disabled - if (containerPolicy.ScaleUpPolicy.ScaleMode != nil && *containerPolicy.ScaleUpPolicy.ScaleMode == vpatypes.ContainerScalingModeOff) || - (containerPolicy.ScaleDownPolicy.ScaleMode != nil && *containerPolicy.ScaleDownPolicy.ScaleMode == vpatypes.ContainerScalingModeOff) { - continue + needReconciledContainers := make(map[string]autoscalingapi.ContainerResourcePolicy) + for _, container := range podTemplate.Spec.Containers { + for _, containerPolicy := range evpa.Spec.ResourcePolicy.ContainerPolicies { + if containerPolicy.ContainerName == "*" || containerPolicy.ContainerName == container.Name { + out := containerPolicy.DeepCopy() + out.ContainerName = container.Name + needReconciledContainers[container.Name] = *out + } } - + } + for container, containerPolicy := range needReconciledContainers { // get current resource by pod template - // todo: support "*" - resourceRequirement, found := utils.GetResourceByPodTemplate(podTemplate, containerPolicy.ContainerName) + resourceRequirement, found := utils.GetResourceByPodTemplate(podTemplate, container) if !found { klog.Warningf("ContainerName %s not found", containerPolicy.ContainerName) continue @@ -45,6 +48,8 @@ func (c *EffectiveVPAController) ReconcileContainerPolicies(evpa *autoscalingapi // loop estimator and get final estimated resource for container recommendResourceContainer, currentStatus := GetEstimatedResourceForContainer(evpa, containerPolicy, resourceRequirement, rankedEstimators, currentEstimatorStatus) + // record the recommended resource each time to do estimating. so we can get more observability + recordResourceRecommendation(evpa, containerPolicy, recommendResourceContainer) currentEstimatorStatus = currentStatus if IsResourceListEmpty(recommendResourceContainer) { klog.V(4).Infof("Container %s recommend resource is empty, skip scaling. ", containerPolicy.ContainerName) diff --git a/pkg/controller/evpa/effective_vpa_controller.go b/pkg/controller/evpa/effective_vpa_controller.go index d8e59c802..786fc4b5d 100644 --- a/pkg/controller/evpa/effective_vpa_controller.go +++ b/pkg/controller/evpa/effective_vpa_controller.go @@ -92,7 +92,7 @@ func (c *EffectiveVPAController) Reconcile(ctx context.Context, req ctrl.Request c.UpdateStatus(ctx, evpa, newStatus) return ctrl.Result{}, err } - c.Recorder.Event(evpa, v1.EventTypeNormal, "RemoveFinalizers", err.Error()) + c.Recorder.Event(evpa, v1.EventTypeNormal, "RemoveFinalizers", "") } else if !utils.ContainsString(evpa.Finalizers, known.AutoscalingFinalizer) { evpa.Finalizers = append(evpa.Finalizers, known.AutoscalingFinalizer) err = c.Client.Update(ctx, evpa) @@ -155,15 +155,38 @@ func (c *EffectiveVPAController) SetupWithManager(mgr ctrl.Manager) error { Complete(c) } -func recordMetric(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, status *autoscalingapi.EffectiveVerticalPodAutoscalerStatus, podTemplate *v1.PodTemplateSpec) { - labels := map[string]string{ - "resourceName": fmt.Sprintf("%s/%s", evpa.Namespace, evpa.Spec.TargetRef.Name), +func recordResourceRecommendation(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, containerPolicy autoscalingapi.ContainerResourcePolicy, resourceList v1.ResourceList) { + for resourceName, resource := range resourceList { + labels := map[string]string{ + "apiversion": evpa.Spec.TargetRef.APIVersion, + "owner_kind": evpa.Spec.TargetRef.Kind, + "namespace": evpa.Namespace, + "owner_name": evpa.Spec.TargetRef.Name, + "container": containerPolicy.ContainerName, + "resource": resourceName.String(), + } + switch resourceName { + case v1.ResourceCPU: + metrics.EVPAResourceRecommendation.With(labels).Set(float64(resource.MilliValue()) / 1000.) + case v1.ResourceMemory: + metrics.EVPAResourceRecommendation.With(labels).Set(float64(resource.Value())) + } } +} + +func recordMetric(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, status *autoscalingapi.EffectiveVerticalPodAutoscalerStatus, podTemplate *v1.PodTemplateSpec) { if status.Recommendation == nil { return } for _, container := range status.Recommendation.ContainerRecommendations { + labels := map[string]string{ + "apiversion": evpa.Spec.TargetRef.APIVersion, + "owner_kind": evpa.Spec.TargetRef.Kind, + "namespace": evpa.Namespace, + "owner_name": evpa.Spec.TargetRef.Name, + "container": container.ContainerName, + } resourceRequirement, found := utils.GetResourceByPodTemplate(podTemplate, container.ContainerName) if !found { klog.Warningf("ContainerName %s not found", container.ContainerName) @@ -172,30 +195,32 @@ func recordMetric(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, status *a recommendCpu := container.Target[v1.ResourceCPU] currentCpu := resourceRequirement.Requests[v1.ResourceCPU] + labels["resource"] = v1.ResourceCPU.String() if currentCpu.Cmp(recommendCpu) > 0 { // scale down currCopy := currentCpu.DeepCopy() currCopy.Sub(recommendCpu) - metrics.EVPACpuScaleDownMilliCores.With(labels).Set(float64(currCopy.MilliValue())) + metrics.EVPACpuScaleDown.With(labels).Set(float64(currCopy.MilliValue()) / 1000.) } else if currentCpu.Cmp(recommendCpu) < 0 { // scale up recommendCopy := recommendCpu.DeepCopy() recommendCopy.Sub(currentCpu) - metrics.EVPACpuScaleUpMilliCores.With(labels).Set(float64(recommendCopy.MilliValue())) + metrics.EVPACpuScaleUp.With(labels).Set(float64(recommendCopy.MilliValue()) / 1000.) } recommendMem := container.Target[v1.ResourceMemory] currentMem := resourceRequirement.Requests[v1.ResourceMemory] + labels["resource"] = v1.ResourceMemory.String() if currentMem.Cmp(recommendMem) > 0 { // scale down currCopy := currentMem.DeepCopy() currCopy.Sub(recommendMem) - metrics.EVPAMemoryScaleDownMB.With(labels).Set(float64(currCopy.Value() / 1024 / 1024)) + metrics.EVPAMemoryScaleDown.With(labels).Set(float64(currCopy.Value())) } else if currentMem.Cmp(recommendMem) < 0 { // scale up recommendCopy := recommendMem.DeepCopy() recommendCopy.Sub(currentMem) - metrics.EVPAMemoryScaleUpMB.With(labels).Set(float64(recommendCopy.Value() / 1024 / 1024)) + metrics.EVPAMemoryScaleUp.With(labels).Set(float64(recommendCopy.Value())) } } } diff --git a/pkg/metrics/autoscaling.go b/pkg/metrics/autoscaling.go index c9e5e5d5b..dd9362d1b 100644 --- a/pkg/metrics/autoscaling.go +++ b/pkg/metrics/autoscaling.go @@ -45,55 +45,56 @@ var ( "container", }, ) - EVPACpuScaleUpMilliCores = prometheus.NewGaugeVec( + EVPACpuScaleUp = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "crane", Subsystem: "autoscaling", - Name: "effective_vpa_cpu_scale_up_millicores", + Name: "effective_vpa_cpu_scale_up", Help: "The cpu scale up for Effective VPA", }, - []string{ - "resourceName", - }, + []string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"}, ) - EVPACpuScaleDownMilliCores = prometheus.NewGaugeVec( + EVPACpuScaleDown = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "crane", Subsystem: "autoscaling", - Name: "effective_vpa_cpu_scale_down_millicores", + Name: "effective_vpa_cpu_scale_down", Help: "The cpu scale down for Effective VPA", }, - []string{ - "resourceName", - }, + []string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"}, ) - EVPAMemoryScaleUpMB = prometheus.NewGaugeVec( + EVPAMemoryScaleUp = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "crane", Subsystem: "autoscaling", - Name: "effective_vpa_memory_scale_up_mb", + Name: "effective_vpa_memory_scale_up", Help: "The memory scale up for Effective VPA", }, - []string{ - "resourceName", - }, + []string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"}, ) - EVPAMemoryScaleDownMB = prometheus.NewGaugeVec( + EVPAMemoryScaleDown = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "crane", Subsystem: "autoscaling", - Name: "effective_vpa_memory_scale_down_mb", + Name: "effective_vpa_memory_scale_down", Help: "The memory scale down for Effective VPA", }, - []string{ - "resourceName", + []string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"}, + ) + EVPAResourceRecommendation = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "crane", + Subsystem: "autoscaling", + Name: "effective_vpa_resource_recommendation", + Help: "The resource recommendation for Effective VPA", }, + []string{"apiversion", "owner_kind", "namespace", "owner_name", "container", "resource"}, ) ) func init() { // Register custom metrics with the global prometheus registry - metrics.Registry.MustRegister(HPAReplicas, EHPAReplicas, OOMCount, HPAScaleCount, EVPACpuScaleUpMilliCores, EVPACpuScaleDownMilliCores, EVPAMemoryScaleDownMB, EVPAMemoryScaleUpMB) + metrics.Registry.MustRegister(HPAReplicas, EHPAReplicas, OOMCount, HPAScaleCount, EVPACpuScaleUp, EVPACpuScaleDown, EVPAMemoryScaleDown, EVPAMemoryScaleUp, EVPAResourceRecommendation) } diff --git a/pkg/prediction/percentile/prediction.go b/pkg/prediction/percentile/prediction.go index b665d7bb5..91b3cb225 100644 --- a/pkg/prediction/percentile/prediction.go +++ b/pkg/prediction/percentile/prediction.go @@ -282,10 +282,12 @@ func (p *percentilePrediction) Run(stopCh <-chan struct{}) { QueryExpr := qc.MetricNamer.BuildUniqueKey() if _, ok := p.queryRoutines.Load(QueryExpr); ok { + klog.V(6).InfoS("Prediction percentile routine %v already registered.", "queryExpr", QueryExpr, "caller", qc.Caller) continue } if _, ok := p.stopChMap.Load(QueryExpr); ok { + klog.V(6).InfoS("Prediction percentile routine %v already stopped.", "queryExpr", QueryExpr, "caller", qc.Caller) continue } @@ -466,6 +468,11 @@ func (p *percentilePrediction) addSamples(namer metricnaming.MetricNamer) { if signal == nil { return } + // maybe we can use other aggregated way to deal with the container instances of the same container in different pods of the same workload, + // aggregated by reducing all the samples to just one p99 or avg value and so on. + // saw that when the workload is daemonset, container in different node has very different resource usage. this is unexpected in production, maybe the daemonset in different nodes has different loads. + // NOTE: now it there are N instance of the workload, there are N samples in latest, then the aggregationWindowLength is N times growth to accumulate data fastly. + // it is not a time dimension, but we use N samples of different container instances of the workload to represent the N intervals samples for _, ts := range latestTimeSeriesList { if len(ts.Samples) < 1 { klog.V(4).InfoS("Sample not found.", "key", key)