support init mode for metric model #278
Changes to the percentile resource estimator:

@@ -19,7 +19,7 @@ import (
 	"github.com/gocrane/crane/pkg/utils"
 )

-const callerFormat = "EVPACaller-%s-%s-%s"
+const callerFormat = "EVPACaller-%s-%s"

 type PercentileResourceEstimator struct {
 	Predictor prediction.Interface

@@ -29,7 +29,9 @@ type PercentileResourceEstimator struct {
 func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, config map[string]string, containerName string, currRes *corev1.ResourceRequirements) (corev1.ResourceList, error) {
 	recommendResource := corev1.ResourceList{}

+	caller := fmt.Sprintf(callerFormat, klog.KObj(evpa), string(evpa.UID))
 	cpuMetricNamer := &metricnaming.GeneralMetricNamer{
+		CallerName: caller,
 		Metric: &metricquery.Metric{
 			Type:       metricquery.ContainerMetricType,
 			MetricName: corev1.ResourceCPU.String(),
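With this change the caller key is computed once per EVPA instead of once per container and resource. For illustration only (the namespace, name, and UID below are made up), the new two-part key renders like this:

package main

import "fmt"

// Minimal sketch of the new caller key. klog.KObj(evpa) prints the object
// as "namespace/name"; the values here are hypothetical.
const callerFormat = "EVPACaller-%s-%s"

func main() {
	namespacedName := "default/nginx-evpa" // what klog.KObj(evpa) would render
	uid := "f6e5d4c3-b2a1-4958-8675-309a1b2c3d4e"
	fmt.Printf(callerFormat+"\n", namespacedName, uid)
	// Output: EVPACaller-default/nginx-evpa-f6e5d4c3-b2a1-4958-8675-309a1b2c3d4e
}

Because containerName and the resource name no longer appear in the key, every query registered by one EVPA shares a single caller, which is what lets the code below pass the same caller for both the CPU and memory queries.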
@@ -43,7 +45,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi
 	}

 	cpuConfig := getCpuConfig(config)
-	tsList, err := utils.QueryPredictedValues(e.Predictor, fmt.Sprintf(callerFormat, string(evpa.UID), containerName, corev1.ResourceCPU), cpuConfig, cpuMetricNamer)
+	tsList, err := utils.QueryPredictedValues(e.Predictor, caller, cpuConfig, cpuMetricNamer)
 	if err != nil {
 		return nil, err
 	}

@@ -56,6 +58,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi
 	recommendResource[corev1.ResourceCPU] = *resource.NewMilliQuantity(cpuValue, resource.DecimalSI)

 	memoryMetricNamer := &metricnaming.GeneralMetricNamer{
+		CallerName: caller,
 		Metric: &metricquery.Metric{
 			Type:       metricquery.ContainerMetricType,
 			MetricName: corev1.ResourceMemory.String(),

@@ -69,7 +72,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi
 	}

 	memConfig := getMemConfig(config)
-	tsList, err = utils.QueryPredictedValues(e.Predictor, fmt.Sprintf(callerFormat, string(evpa.UID), containerName, corev1.ResourceMemory), memConfig, memoryMetricNamer)
+	tsList, err = utils.QueryPredictedValues(e.Predictor, caller, memConfig, memoryMetricNamer)
 	if err != nil {
 		return nil, err
 	}

@@ -86,7 +89,9 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi

 func (e *PercentileResourceEstimator) DeleteEstimation(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler) {
 	for _, containerPolicy := range evpa.Spec.ResourcePolicy.ContainerPolicies {
+		caller := fmt.Sprintf(callerFormat, klog.KObj(evpa), string(evpa.UID))
 		cpuMetricNamer := &metricnaming.GeneralMetricNamer{
+			CallerName: caller,
 			Metric: &metricquery.Metric{
 				Type:       metricquery.ContainerMetricType,
 				MetricName: corev1.ResourceCPU.String(),

@@ -98,12 +103,12 @@ func (e *PercentileResourceEstimator) DeleteEstimation(evpa *autoscalingapi.Effe
 				},
 			},
 		}
-		err := e.Predictor.DeleteQuery(cpuMetricNamer, fmt.Sprintf(callerFormat, string(evpa.UID), containerPolicy.ContainerName, corev1.ResourceCPU))
+		err := e.Predictor.DeleteQuery(cpuMetricNamer, caller)
 		if err != nil {
 			klog.ErrorS(err, "Failed to delete query.", "queryExpr", cpuMetricNamer.BuildUniqueKey())
 		}

 		memoryMetricNamer := &metricnaming.GeneralMetricNamer{
+			CallerName: caller,
 			Metric: &metricquery.Metric{
 				Type:       metricquery.ContainerMetricType,
 				MetricName: corev1.ResourceMemory.String(),

@@ -115,7 +120,7 @@ func (e *PercentileResourceEstimator) DeleteEstimation(evpa *autoscalingapi.Effe
 				},
 			},
 		}
-		err = e.Predictor.DeleteQuery(memoryMetricNamer, fmt.Sprintf(callerFormat, string(evpa.UID), containerPolicy.ContainerName, corev1.ResourceMemory))
+		err = e.Predictor.DeleteQuery(memoryMetricNamer, caller)
 		if err != nil {
 			klog.ErrorS(err, "Failed to delete query.", "queryExpr", memoryMetricNamer.BuildUniqueKey())
 		}
@@ -137,9 +142,22 @@ func getCpuConfig(config map[string]string) *predictionconfig.Config {
 		marginFraction = "0.15"
 	}

+	initModeStr, exists := config["cpu-model-init-mode"]
+	initMode := predictionconfig.ModelInitModeLazyTraining
+	if exists {
+		initMode = predictionconfig.ModelInitMode(initModeStr)
+	}
+
+	historyLength, exists := config["cpu-model-history-length"]
+	if !exists {
+		historyLength = "24h"
+	}
+
 	return &predictionconfig.Config{
+		InitMode: &initMode,
 		Percentile: &predictionapi.Percentile{
 			Aggregated:     true,
+			HistoryLength:  historyLength,
 			SampleInterval: sampleInterval,
 			MarginFraction: marginFraction,
 			Percentile:     percentile,
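Both new lookups follow the same lookup-with-default pattern: lazytraining is the fallback init mode, and 24h is the fallback CPU history length. A standalone illustration (the helper name and values below are for demonstration only):

package main

import "fmt"

// lookup returns the value for key in config, or def when the key is absent.
func lookup(config map[string]string, key, def string) string {
	if v, exists := config[key]; exists {
		return v
	}
	return def
}

func main() {
	config := map[string]string{"cpu-model-init-mode": "checkpoint"}
	fmt.Println(lookup(config, "cpu-model-init-mode", "lazytraining")) // checkpoint (explicit)
	fmt.Println(lookup(config, "cpu-model-history-length", "24h"))     // 24h (default applies)
}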
@@ -166,9 +184,22 @@ func getMemConfig(props map[string]string) *predictionconfig.Config {
 		marginFraction = "0.15"
 	}

+	initModeStr, exists := props["mem-model-init-mode"]
+	initMode := predictionconfig.ModelInitModeLazyTraining
+	if exists {
+		initMode = predictionconfig.ModelInitMode(initModeStr)
+	}
+
+	historyLength, exists := props["mem-model-history-length"]
+	if !exists {
+		historyLength = "48h"
+	}
+
 	return &predictionconfig.Config{
+		InitMode: &initMode,
 		Percentile: &predictionapi.Percentile{
 			Aggregated:     true,
+			HistoryLength:  historyLength,
 			SampleInterval: sampleInterval,
 			MarginFraction: marginFraction,
 			Percentile:     percentile,

Review comment (on the mem-model-init-mode block): This part for memory is almost the same as for CPU. How about a function?

Review comment (on the history-length default): Why is the history length for CPU and memory not the same? CPU is 24h but memory is 48h.

Reply: This is just an empirical value borrowed from VPA. Memory is an incompressible resource, so using longer history data is safer and more robust. CPU is a compressible resource, and it generally follows a daily cycle because of user traffic.

Reply: Got it.
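One possible shape for the helper the first reviewer asks about; this is a hypothetical sketch, not part of the PR, and the function name is an assumption:

// getModelConfig is a hypothetical helper that both getCpuConfig and
// getMemConfig could delegate to; keyPrefix would be "cpu" or "mem".
func getModelConfig(config map[string]string, keyPrefix, defaultHistoryLength string) (predictionconfig.ModelInitMode, string) {
	initMode := predictionconfig.ModelInitModeLazyTraining
	if s, exists := config[keyPrefix+"-model-init-mode"]; exists {
		initMode = predictionconfig.ModelInitMode(s)
	}
	historyLength, exists := config[keyPrefix+"-model-history-length"]
	if !exists {
		historyLength = defaultHistoryLength
	}
	return initMode, historyLength
}

Each caller would then reduce to a one-liner, e.g. initMode, historyLength := getModelConfig(config, "cpu", "24h"), keeping the differing defaults (24h vs. 48h) at the call sites.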
Changes to the prediction model config:

@@ -10,7 +10,20 @@ type AlgorithmModelConfig struct {
 	UpdateInterval time.Duration
 }

+type ModelInitMode string
+
+const (
+	// ModelInitModeHistory recovers or initializes the algorithm model directly from the history datasource. This process may block, because data fetching and model generation are time consuming.
+	ModelInitModeHistory ModelInitMode = "history"
+	// ModelInitModeLazyTraining recovers or initializes the algorithm model from the real-time datasource asynchronously. The predictor cannot predict before the data accumulates to the window length;
+	// this is safer, because the accumulated data makes the prediction more robust.
+	ModelInitModeLazyTraining ModelInitMode = "lazytraining"
+	// ModelInitModeCheckpoint recovers or initializes the model from a checkpoint, so it can be restored directly and start predicting immediately.
+	ModelInitModeCheckpoint ModelInitMode = "checkpoint"
+)
+
 type Config struct {
+	InitMode   *ModelInitMode
 	DSP        *v1alpha1.DSP
 	Percentile *v1alpha1.Percentile
 }

Review comment (on the ModelInitMode constants): Can we make sure which mode is the default?

Reply: This param is specified by the caller or the user. If it is not specified, the default is history mode, which is the original logic.
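The fallback described in that reply could look roughly like this; resolveInitMode is a hypothetical illustration, not a function in this PR:

// resolveInitMode sketches the default behavior described above: a nil
// InitMode falls back to the original blocking history load.
func resolveInitMode(cfg *Config) ModelInitMode {
	if cfg == nil || cfg.InitMode == nil {
		return ModelInitModeHistory // unspecified: keep the original logic
	}
	return *cfg.InitMode
}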
Review comment (on the whole PR): How about moving the constants to a common file, like "EVPACaller-%s-%s", "cpu-model-history-length", "24h", etc.?

Reply: Yes, we can do it later.