Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support init mode for metric model #278

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 37 additions & 6 deletions pkg/autoscaling/estimator/percentile.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
"github.com/gocrane/crane/pkg/utils"
)

// callerFormat builds the caller name used for EVPA prediction queries:
// "EVPACaller-<namespace/name>-<uid>" (two verbs: object key and UID).
const callerFormat = "EVPACaller-%s-%s"

type PercentileResourceEstimator struct {
Predictor prediction.Interface
Expand All @@ -29,7 +29,9 @@ type PercentileResourceEstimator struct {
func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, config map[string]string, containerName string, currRes *corev1.ResourceRequirements) (corev1.ResourceList, error) {
recommendResource := corev1.ResourceList{}

caller := fmt.Sprintf(callerFormat, klog.KObj(evpa), string(evpa.UID))
cpuMetricNamer := &metricnaming.GeneralMetricNamer{
CallerName: caller,
Metric: &metricquery.Metric{
Type: metricquery.ContainerMetricType,
MetricName: corev1.ResourceCPU.String(),
Expand All @@ -43,7 +45,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi
}

cpuConfig := getCpuConfig(config)
tsList, err := utils.QueryPredictedValues(e.Predictor, fmt.Sprintf(callerFormat, string(evpa.UID), containerName, corev1.ResourceCPU), cpuConfig, cpuMetricNamer)
tsList, err := utils.QueryPredictedValues(e.Predictor, caller, cpuConfig, cpuMetricNamer)
if err != nil {
return nil, err
}
Expand All @@ -56,6 +58,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi
recommendResource[corev1.ResourceCPU] = *resource.NewMilliQuantity(cpuValue, resource.DecimalSI)

memoryMetricNamer := &metricnaming.GeneralMetricNamer{
CallerName: caller,
Metric: &metricquery.Metric{
Type: metricquery.ContainerMetricType,
MetricName: corev1.ResourceMemory.String(),
Expand All @@ -69,7 +72,7 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi
}

memConfig := getMemConfig(config)
tsList, err = utils.QueryPredictedValues(e.Predictor, fmt.Sprintf(callerFormat, string(evpa.UID), containerName, corev1.ResourceMemory), memConfig, memoryMetricNamer)
tsList, err = utils.QueryPredictedValues(e.Predictor, caller, memConfig, memoryMetricNamer)
if err != nil {
return nil, err
}
Expand All @@ -86,7 +89,9 @@ func (e *PercentileResourceEstimator) GetResourceEstimation(evpa *autoscalingapi

func (e *PercentileResourceEstimator) DeleteEstimation(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler) {
for _, containerPolicy := range evpa.Spec.ResourcePolicy.ContainerPolicies {
caller := fmt.Sprintf(callerFormat, klog.KObj(evpa), string(evpa.UID))
cpuMetricNamer := &metricnaming.GeneralMetricNamer{
CallerName: caller,
Metric: &metricquery.Metric{
Type: metricquery.ContainerMetricType,
MetricName: corev1.ResourceCPU.String(),
Expand All @@ -98,12 +103,12 @@ func (e *PercentileResourceEstimator) DeleteEstimation(evpa *autoscalingapi.Effe
},
},
}
err := e.Predictor.DeleteQuery(cpuMetricNamer, fmt.Sprintf(callerFormat, string(evpa.UID), containerPolicy.ContainerName, corev1.ResourceCPU))
err := e.Predictor.DeleteQuery(cpuMetricNamer, caller)
if err != nil {
klog.ErrorS(err, "Failed to delete query.", "queryExpr", cpuMetricNamer.BuildUniqueKey())
}

memoryMetricNamer := &metricnaming.GeneralMetricNamer{
CallerName: caller,
Metric: &metricquery.Metric{
Type: metricquery.ContainerMetricType,
MetricName: corev1.ResourceMemory.String(),
Expand All @@ -115,7 +120,7 @@ func (e *PercentileResourceEstimator) DeleteEstimation(evpa *autoscalingapi.Effe
},
},
}
err = e.Predictor.DeleteQuery(memoryMetricNamer, fmt.Sprintf(callerFormat, string(evpa.UID), containerPolicy.ContainerName, corev1.ResourceMemory))
err = e.Predictor.DeleteQuery(memoryMetricNamer, caller)
if err != nil {
klog.ErrorS(err, "Failed to delete query.", "queryExpr", memoryMetricNamer.BuildUniqueKey())
}
Expand All @@ -137,9 +142,22 @@ func getCpuConfig(config map[string]string) *predictionconfig.Config {
marginFraction = "0.15"
}

// Model init mode: honor the caller-provided value when the key is present,
// otherwise default to lazy training (accumulate realtime data before predicting).
initModeStr, exists := config["cpu-model-init-mode"]
initMode := predictionconfig.ModelInitModeLazyTraining
if exists {
	// BUG FIX: the original tested !exists, which ignored a user-specified
	// mode and cast an empty string to ModelInitMode when the key was absent.
	initMode = predictionconfig.ModelInitMode(initModeStr)
}

historyLength, exists := config["cpu-model-history-length"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about moving the constants to a common file — e.g. `EVPACaller-%s-%s`, `cpu-model-history-length`, `24h`, etc.?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about moving the constants to a common file — e.g. `EVPACaller-%s-%s`, `cpu-model-history-length`, `24h`, etc.?

Yes, we can do it later

if !exists {
historyLength = "24h"
}

return &predictionconfig.Config{
InitMode: &initMode,
Percentile: &predictionapi.Percentile{
Aggregated: true,
HistoryLength: historyLength,
SampleInterval: sampleInterval,
MarginFraction: marginFraction,
Percentile: percentile,
Expand All @@ -166,9 +184,22 @@ func getMemConfig(props map[string]string) *predictionconfig.Config {
marginFraction = "0.15"
}

initModeStr, exists := props["mem-model-init-mode"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part of memory is almost same like cpu, how about a function ?

// Honor the caller-provided init mode when the key was present; otherwise
// default to lazy training. BUG FIX: the original tested !exists, which
// ignored a user-specified mode and cast an empty string when the key was absent.
initMode := predictionconfig.ModelInitModeLazyTraining
if exists {
	initMode = predictionconfig.ModelInitMode(initModeStr)
}

historyLength, exists := props["mem-model-history-length"]
if !exists {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why history length of cpu and memory is not same, cpu is 24h but memory is 48h?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why history length of cpu and memory is not same, cpu is 24h but memory is 48h?

This is just an empirical value borrowed from vpa. memory is incompressible resource, so use longer history data is more safe and robust. cpu is compressible resource, and generally it is daily cycle because of people traffic

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

got it.

historyLength = "48h"
}

return &predictionconfig.Config{
InitMode: &initMode,
Percentile: &predictionapi.Percentile{
Aggregated: true,
HistoryLength: historyLength,
SampleInterval: sampleInterval,
MarginFraction: marginFraction,
Percentile: percentile,
Expand Down
3 changes: 3 additions & 0 deletions pkg/controller/evpa/effective_vpa_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ func recordMetric(evpa *autoscalingapi.EffectiveVerticalPodAutoscaler, status *a
"resourceName": fmt.Sprintf("%s/%s", evpa.Namespace, evpa.Spec.TargetRef.Name),
}

if status.Recommendation == nil {
return
}
for _, container := range status.Recommendation.ContainerRecommendations {
resourceRequirement, found := utils.GetResourceByPodTemplate(podTemplate, container.ContainerName)
if !found {
Expand Down
55 changes: 32 additions & 23 deletions pkg/controller/timeseriesprediction/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,21 @@ func (c *MetricContext) GetMetricNamer(conf *predictionapi.PredictionMetric) met
return nil
}
if conf.ExpressionQuery != nil {
namer.Metric = &metricquery.Metric{
Type: metricquery.PromQLMetricType,
MetricName: conf.ResourceIdentifier,
Prom: &metricquery.PromNamerInfo{
QueryExpr: conf.ExpressionQuery.Expression,
Selector: labels.Nothing(),
namer = metricnaming.GeneralMetricNamer{
CallerName: c.GetCaller(),
Metric: &metricquery.Metric{
Type: metricquery.PromQLMetricType,
MetricName: conf.ResourceIdentifier,
Prom: &metricquery.PromNamerInfo{
QueryExpr: conf.ExpressionQuery.Expression,
Selector: labels.Nothing(),
},
},
}
klog.InfoS("GetQueryStr", "tsp", klog.KObj(c.SeriesPrediction), "queryExpr", conf.ExpressionQuery.Expression)
}
if conf.ResourceQuery != nil {
namer = c.ResourceToMetricNamer(conf.ResourceQuery)
namer = c.ResourceToMetricNamer(conf.ResourceQuery, c.GetCaller())
klog.InfoS("GetQueryStr", "tsp", klog.KObj(c.SeriesPrediction), "resourceQuery", conf.ResourceQuery)
}
return &namer
Expand Down Expand Up @@ -139,30 +142,36 @@ func metricSelectorToQueryExpr(m *predictionapi.MetricQuery) string {
return fmt.Sprintf("%s{%s}", m.MetricName, strings.Join(conditions, ","))
}

func (c *MetricContext) ResourceToMetricNamer(resourceName *corev1.ResourceName) metricnaming.GeneralMetricNamer {
func (c *MetricContext) ResourceToMetricNamer(resourceName *corev1.ResourceName, caller string) metricnaming.GeneralMetricNamer {
var namer metricnaming.GeneralMetricNamer

// Node
if strings.ToLower(c.TargetKind) == strings.ToLower(predconf.TargetKindNode) {
namer.Metric = &metricquery.Metric{
Type: metricquery.NodeMetricType,
MetricName: resourceName.String(),
Node: &metricquery.NodeNamerInfo{
Name: c.Name,
Selector: labels.Everything(),
namer = metricnaming.GeneralMetricNamer{
CallerName: caller,
Metric: &metricquery.Metric{
Type: metricquery.NodeMetricType,
MetricName: resourceName.String(),
Node: &metricquery.NodeNamerInfo{
Name: c.Name,
Selector: labels.Everything(),
},
},
}
} else {
// workload
namer.Metric = &metricquery.Metric{
Type: metricquery.WorkloadMetricType,
MetricName: resourceName.String(),
Workload: &metricquery.WorkloadNamerInfo{
Namespace: c.Namespace,
Kind: c.TargetKind,
APIVersion: c.APIVersion,
Name: c.Name,
Selector: c.Selector,
namer = metricnaming.GeneralMetricNamer{
CallerName: caller,
Metric: &metricquery.Metric{
Type: metricquery.WorkloadMetricType,
MetricName: resourceName.String(),
Workload: &metricquery.WorkloadNamerInfo{
Namespace: c.Namespace,
Kind: c.TargetKind,
APIVersion: c.APIVersion,
Name: c.Name,
Selector: c.Selector,
},
},
}
}
Expand Down
16 changes: 13 additions & 3 deletions pkg/metricnaming/naming.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,36 @@ import (
"github.com/gocrane/crane/pkg/querybuilder"
)

// MetricNamer is the bridge between the predictor and the different data
// sources and other components; it also carries the identity of the caller.
type MetricNamer interface {
	// QueryBuilder is used by data source providers to build source-specific queries.
	QueryBuilder() querybuilder.QueryBuilder
	// BuildUniqueKey is used by the predictor to key models; it must be unique
	// per (caller, metric) pair.
	BuildUniqueKey() string

	// Validate checks that the underlying metric description is well formed.
	Validate() error

	// Caller names the component using this MetricNamer; different callers may
	// query the same metric.
	Caller() string
}

var _ MetricNamer = &GeneralMetricNamer{}

// GeneralMetricNamer is the default MetricNamer implementation: a metric
// description plus the name of the caller that owns the query.
// NOTE: the diff residue in the original left a duplicated Metric field,
// which would not compile; only one declaration is kept here.
type GeneralMetricNamer struct {
	Metric     *metricquery.Metric
	CallerName string
}

// Caller returns the name of the component that created this namer.
// Different callers may query the same underlying metric, so the caller
// name participates in the unique key (see BuildUniqueKey).
func (gmn *GeneralMetricNamer) Caller() string {
	return gmn.CallerName
}

// QueryBuilder returns a query builder for this namer's metric; data source
// providers use it to translate the metric into a source-specific query.
func (gmn *GeneralMetricNamer) QueryBuilder() querybuilder.QueryBuilder {
	return NewQueryBuilder(gmn.Metric)
}

// BuildUniqueKey builds the key identifying this (caller, metric) pair for the
// predictor. The caller name is prefixed so that different callers querying the
// same metric get distinct model entries. (The diff residue in the original
// left an unreachable duplicated return of the old, caller-less key.)
func (gmn *GeneralMetricNamer) BuildUniqueKey() string {
	return gmn.CallerName + "/" + gmn.Metric.BuildUniqueKey()
}

func (gmn *GeneralMetricNamer) Validate() error {
Expand Down
18 changes: 9 additions & 9 deletions pkg/metricquery/type.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ const (

var (
NotMatchWorkloadError = fmt.Errorf("metric type %v, but no WorkloadNamerInfo provided", WorkloadMetricType)
NotMatchContainerError = fmt.Errorf("metric type %v, but no WorkloadNamerInfo provided", ContainerMetricType)
NotMatchPodError = fmt.Errorf("metric type %v, but no WorkloadNamerInfo provided", PodMetricType)
NotMatchNodeError = fmt.Errorf("metric type %v, but no WorkloadNamerInfo provided", NodeMetricType)
NotMatchPromError = fmt.Errorf("metric type %v, but no WorkloadNamerInfo provided", PromQLMetricType)
NotMatchContainerError = fmt.Errorf("metric type %v, but no ContainerNamerInfo provided", ContainerMetricType)
NotMatchPodError = fmt.Errorf("metric type %v, but no PodNamerInfo provided", PodMetricType)
NotMatchNodeError = fmt.Errorf("metric type %v, but no NodeNamerInfo provided", NodeMetricType)
NotMatchPromError = fmt.Errorf("metric type %v, but no PromNamerInfo provided", PromQLMetricType)
)

type Metric struct {
Expand Down Expand Up @@ -153,7 +153,7 @@ func (m *Metric) keyByWorkload() string {
m.Workload.APIVersion,
m.Workload.Namespace,
m.Workload.Name,
selectorStr}, "-")
selectorStr}, "_")
}

func (m *Metric) keyByContainer() string {
Expand All @@ -168,7 +168,7 @@ func (m *Metric) keyByContainer() string {
m.Container.WorkloadName,
m.Container.PodName,
m.Container.ContainerName,
selectorStr}, "-")
selectorStr}, "_")
}

func (m *Metric) keyByPod() string {
Expand All @@ -181,7 +181,7 @@ func (m *Metric) keyByPod() string {
strings.ToLower(m.MetricName),
m.Pod.Namespace,
m.Pod.Name,
selectorStr}, "-")
selectorStr}, "_")
}
func (m *Metric) keyByNode() string {
selectorStr := ""
Expand All @@ -192,7 +192,7 @@ func (m *Metric) keyByNode() string {
string(m.Type),
strings.ToLower(m.MetricName),
m.Node.Name,
selectorStr}, "-")
selectorStr}, "_")
}

func (m *Metric) keyByPromQL() string {
Expand All @@ -205,7 +205,7 @@ func (m *Metric) keyByPromQL() string {
m.Prom.Namespace,
strings.ToLower(m.MetricName),
m.Prom.QueryExpr,
selectorStr}, "-")
selectorStr}, "_")
}

// Query is used to do query for different data source. you can extends it with your data source query
Expand Down
13 changes: 13 additions & 0 deletions pkg/prediction/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,20 @@ type AlgorithmModelConfig struct {
UpdateInterval time.Duration
}

// ModelInitMode specifies how a prediction algorithm model is initialized or
// recovered. When the caller does not specify a mode, the original (history)
// behavior applies.
type ModelInitMode string

const (
	// ModelInitModeHistory recovers or inits the algorithm model directly from
	// the history datasource. This may block, because fetching the data and
	// generating the model is time consuming.
	ModelInitModeHistory ModelInitMode = "history"
	// ModelInitModeLazyTraining recovers or inits the algorithm model from the
	// realtime datasource asynchronously; the predictor cannot predict until the
	// accumulated data covers the configured window length. This is safer for
	// data accumulation and makes the prediction more robust.
	ModelInitModeLazyTraining ModelInitMode = "lazytraining"
	// ModelInitModeCheckpoint recovers or inits the model from a checkpoint, so
	// it can be restored directly and used for prediction immediately.
	ModelInitModeCheckpoint ModelInitMode = "checkpoint"
)

// Config carries the per-algorithm prediction configuration.
type Config struct {
	// InitMode controls how the model is initialized (history / lazytraining /
	// checkpoint). NOTE(review): presumably nil falls back to the original
	// history behavior — confirm with the predictor implementations.
	InitMode *ModelInitMode
	// DSP holds the digital-signal-processing algorithm settings, if that
	// algorithm is selected.
	DSP *v1alpha1.DSP
	// Percentile holds the percentile-estimation algorithm settings, if that
	// algorithm is selected.
	Percentile *v1alpha1.Percentile
}
4 changes: 4 additions & 0 deletions pkg/prediction/dsp/prediction.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ type periodicSignalPrediction struct {
modelConfig config.AlgorithmModelConfig
}

// QueryPredictionStatus is not implemented for the DSP (periodic signal)
// predictor yet; calling it panics. NOTE(review): consider returning an
// explicit "not implemented" error instead of panicking once callers use this.
func (p *periodicSignalPrediction) QueryPredictionStatus(ctx context.Context, metricNamer metricnaming.MetricNamer) (prediction.Status, error) {
	panic("implement me")
}

func NewPrediction(realtimeProvider providers.RealTime, historyProvider providers.History, mc config.AlgorithmModelConfig) prediction.Interface {
withCh, delCh := make(chan prediction.QueryExprWithCaller), make(chan prediction.QueryExprWithCaller)
return &periodicSignalPrediction{
Expand Down
4 changes: 4 additions & 0 deletions pkg/prediction/generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ const (
StatusNotStarted Status = "NotStarted"
StatusUnknown Status = "Unknown"
StatusDeleted Status = "Deleted"
// StatusInitializing means the prediction model is accumulating data until it satisfy the user specified time window such as 12h or 3d or 1w when use some real time data provider
// if support recover from checkpoint, then it maybe faster
StatusInitializing Status = "Initializing"
StatusExpired Status = "Expired"
)

type WithMetricEvent struct {
Expand Down
3 changes: 3 additions & 0 deletions pkg/prediction/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ type Interface interface {

DeleteQuery(metricNamer metricnaming.MetricNamer, caller string) error

// QueryPredictionStatus return the metricNamer prediction status. it is predictable only when it is ready
QueryPredictionStatus(ctx context.Context, metricNamer metricnaming.MetricNamer) (Status, error)

// QueryRealtimePredictedValues returns predicted values based on the specified query expression
QueryRealtimePredictedValues(ctx context.Context, metricNamer metricnaming.MetricNamer) ([]*common.TimeSeries, error)

Expand Down
7 changes: 7 additions & 0 deletions pkg/prediction/percentile/aggregate_signal.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ type aggregateSignal struct {
lastSampleTime time.Time
minSampleWeight float64
totalSamplesCount int
sampleInterval time.Duration
creationTime time.Time
labels []common.Label
}
Expand All @@ -30,10 +31,16 @@ func (a *aggregateSignal) addSample(sampleTime time.Time, sampleValue float64) {
a.totalSamplesCount++
}

// GetAggregationWindowLength returns the total time span covered by the
// samples accumulated so far (sample count multiplied by the sample interval).
// time.Duration tops out around 290 years, so the product cannot overflow.
func (a *aggregateSignal) GetAggregationWindowLength() time.Duration {
	window := a.sampleInterval * time.Duration(a.totalSamplesCount)
	return window
}

// newAggregateSignal builds an empty aggregate signal from the internal
// percentile configuration, stamping its creation time so the accumulated
// aggregation window can be tracked later.
func newAggregateSignal(c *internalConfig) *aggregateSignal {
	signal := &aggregateSignal{
		histogram:       vpa.NewHistogram(c.histogramOptions),
		minSampleWeight: c.minSampleWeight,
		sampleInterval:  c.sampleInterval,
	}
	signal.creationTime = time.Now()
	return signal
}
Loading