From fd0fbf8b30e9621b3a969c88b90bcbffae23610a Mon Sep 17 00:00:00 2001 From: Song Gao Date: Mon, 13 Jul 2020 19:18:37 +0800 Subject: [PATCH] cherry pick #2884 to release-1.1 Signed-off-by: ti-srebot --- docs/api-references/docs.md | 183 ++++++++++++++++-- manifests/crd.yaml | 26 ++- .../pingcap/v1alpha1/openapi_generated.go | 111 +++++++++-- .../v1alpha1/tidbclusterautoscaler_types.go | 61 +++++- .../pingcap/v1alpha1/zz_generated.deepcopy.go | 85 +++++++- .../autoscaler/autoscaler_manager.go | 36 +--- .../autoscaler/calculate/calculate.go | 4 +- pkg/autoscaler/autoscaler/calculate/cpu.go | 12 +- .../autoscaler/calculate/storage.go | 132 +++++++++++++ pkg/autoscaler/autoscaler/calculate/util.go | 20 +- pkg/autoscaler/autoscaler/tidb_autoscaler.go | 3 +- pkg/autoscaler/autoscaler/tikv_autoscaler.go | 125 +++++++++--- .../autoscaler/tikv_autoscaler_test.go | 149 ++++++++++++++ pkg/autoscaler/autoscaler/util.go | 44 +++-- pkg/autoscaler/autoscaler/util_test.go | 7 +- pkg/label/label.go | 4 - tests/e2e/tidbcluster/serial.go | 130 +++++++++++-- tests/pkg/mock/monitor.go | 39 ++-- tests/pkg/mock/util.go | 40 +++- 19 files changed, 1026 insertions(+), 185 deletions(-) create mode 100644 pkg/autoscaler/autoscaler/calculate/storage.go create mode 100644 pkg/autoscaler/autoscaler/tikv_autoscaler_test.go diff --git a/docs/api-references/docs.md b/docs/api-references/docs.md index 0c70c6e5bd..97ac5a48b0 100644 --- a/docs/api-references/docs.md +++ b/docs/api-references/docs.md @@ -2550,21 +2550,13 @@ If not set, the default ScaleOutIntervalSeconds will be set to 300

metrics
- -[]Kubernetes autoscaling/v2beta2.MetricSpec + +[]CustomMetric (Optional) -

metrics contains the specifications for which to use to calculate the -desired replica count (the maximum replica count across all metrics will -be used). The desired replica count is calculated multiplying the -ratio between the target value and the current value by the current -number of pods. Ergo, metrics used must decrease as the pod count is -increased, and vice-versa. See the individual metric source types for -more information about how each type of metric must respond. -If not set, the default metric will be set to 80% average CPU utilization.

@@ -2576,7 +2568,7 @@ string (Optional) -

MetricsTimeDuration describe the Time duration to be queried in the Prometheus

+

MetricsTimeDuration describes the Time duration to be queried in the Prometheus

@@ -3399,6 +3391,77 @@ CrdKind +

CustomMetric

+

+(Appears on: +BasicAutoScalerSpec) +

+

+

+ + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+MetricSpec
+ + +Kubernetes autoscaling/v2beta2.MetricSpec + + +
+

+(Members of MetricSpec are embedded into this type.) +

+(Optional) +

metrics contains the specifications for which to use to calculate the +desired replica count (the maximum replica count across all metrics will +be used). The desired replica count is calculated multiplying the +ratio between the target value and the current value by the current +number of pods. Ergo, metrics used must decrease as the pod count is +increased, and vice-versa. See the individual metric source types for +more information about how each type of metric must respond. +If not set, the auto-scaling won’t happen.

+
+leastStoragePressurePeriodSeconds
+ +int64 + +
+(Optional) +

LeastStoragePressurePeriodSeconds is only for the storage auto-scaling case when the resource name in the metricSpec +is Storage. When the Storage metrics meet the pressure, the Operator would wait +LeastStoragePressurePeriodSeconds duration and then be able to scale out. +If not set, the default value is 300

+
+leastRemainAvailableStoragePercent
+ +int64 + +
+(Optional) +

LeastRemainAvailableStoragePercent indicates the least remaining available storage percent compared to +the capacity storage. If the available storage is lower than the capacity storage * LeastRemainAvailableStoragePercent, +the storage status will become storage pressure and be ready to scale out. +LeastRemainAvailableStoragePercent should be between 5 and 90. If not set, the default value is 10

+

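For orientation, here is a minimal sketch of how the new fields could be populated in Go, modelled on the test helper this patch adds in tikv_autoscaler_test.go; the package and function names are illustrative, while the field names and the 300/10 defaults come from this change:

```go
package example

import (
	"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1"
	autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/utils/pointer"
)

// storageMetric builds a CustomMetric that requests storage-based
// auto-scaling for TiKV. The thresholds shown equal the defaults that
// defaultTAC fills in when they are left unset.
func storageMetric() v1alpha1.CustomMetric {
	return v1alpha1.CustomMetric{
		MetricSpec: autoscalingv2beta2.MetricSpec{
			Type: autoscalingv2beta2.ResourceMetricSourceType,
			Resource: &autoscalingv2beta2.ResourceMetricSource{
				Name: corev1.ResourceStorage,
			},
		},
		// Storage pressure must last at least this long before a scale-out.
		LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(300),
		// Pressure starts once available < capacity * 10%.
		LeastRemainAvailableStoragePercent: pointer.Int64Ptr(10),
	}
}
```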
DashboardConfig

(Appears on: @@ -4725,6 +4788,7 @@ string +(Optional)

CurrentValue indicates the value calculated in the last auto-scaling reconciliation

@@ -4736,9 +4800,26 @@ string +(Optional)

TargetValue indicates the threshold value for this metrics in auto-scaling

+ + +StorageMetricsStatus
+ + +StorageMetricsStatus + + + + +

+(Members of StorageMetricsStatus are embedded into this type.) +

+(Optional) + +

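Since CurrentValue and ThresholdValue are now optional pointers (storage-based metrics populate the embedded StorageMetricsStatus fields instead), status readers need nil checks, as the e2e updates later in this patch illustrate. A small sketch under that assumption; the helper name and the printing are illustrative:

```go
package example

import (
	"fmt"

	"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1"
)

// reportMetrics prints whichever parts of each MetricsStatus are populated.
func reportMetrics(list []v1alpha1.MetricsStatus) {
	for _, m := range list {
		if m.CurrentValue != nil && m.ThresholdValue != nil {
			fmt.Printf("%s: current=%s threshold=%s\n", m.Name, *m.CurrentValue, *m.ThresholdValue)
		}
		if m.StoragePressure != nil && *m.StoragePressure && m.StoragePressureStartTime != nil {
			fmt.Printf("%s: storage pressure since %s\n", m.Name, m.StoragePressureStartTime.Time)
		}
	}
}
```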
MonitorComponentAccessor

@@ -8567,6 +8648,86 @@ More info: StorageMetricsStatus +

+(Appears on: +MetricsStatus) +

+

+

StorageMetricsStatus describes the storage metrics status in the last auto-scaling reconciliation

+

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+storagePressure
+ +bool + +
+(Optional) +

StoragePressure indicates whether the storage is under pressure

+
+storagePressureStartTime
+ + +Kubernetes meta/v1.Time + + +
+(Optional) +

StoragePressureStartTime indicates the timestamp when StoragePressure first became true (from false or nil)

+
+availableStorage
+ +string + +
+(Optional) +
+capacityStorage
+ +string + +
+(Optional) +
+baselineAvailableStorage
+ +string + +
+

BaselineAvailableStorage indicates the baseline for the available storage size. +This is calculated as the capacity storage size * the storage auto-scaling baseline percent value. +If the AvailableStorage is less than the BaselineAvailableStorage, the database is under StoragePressure. +optional

+

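These fields follow the check introduced in calculate/storage.go below: the baseline is derived from the capacity and the configured percentage, and pressure is reported while the available size stays under it. A minimal sketch of that arithmetic (integer division first, as in the patch; the helper name is illustrative):

```go
package example

// underStoragePressure mirrors the comparison used by
// CalculateWhetherStoragePressure: baseline = (capacity / 100) * percent,
// and the cluster is under pressure while available < baseline.
func underStoragePressure(availableBytes, capacityBytes uint64, leastRemainPercent int64) bool {
	baseline := (capacityBytes / 100) * uint64(leastRemainPercent)
	return availableBytes < baseline
}
```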
StorageProvider

(Appears on: diff --git a/manifests/crd.yaml b/manifests/crd.yaml index f0adf02210..b309456952 100644 --- a/manifests/crd.yaml +++ b/manifests/crd.yaml @@ -13984,16 +13984,25 @@ spec: metrics: items: properties: + availableStorage: + type: string + baselineAvailableStorage: + type: string + capacityStorage: + type: string currentValue: type: string name: type: string + storagePressure: + type: boolean + storagePressureStartTime: + format: date-time + type: string thresholdValue: type: string required: - name - - currentValue - - thresholdValue type: object type: array recommendedReplicas: @@ -14013,16 +14022,25 @@ spec: metrics: items: properties: + availableStorage: + type: string + baselineAvailableStorage: + type: string + capacityStorage: + type: string currentValue: type: string name: type: string + storagePressure: + type: boolean + storagePressureStartTime: + format: date-time + type: string thresholdValue: type: string required: - name - - currentValue - - thresholdValue type: object type: array recommendedReplicas: diff --git a/pkg/apis/pingcap/v1alpha1/openapi_generated.go b/pkg/apis/pingcap/v1alpha1/openapi_generated.go index b3740f36c6..5c599e7bfb 100644 --- a/pkg/apis/pingcap/v1alpha1/openapi_generated.go +++ b/pkg/apis/pingcap/v1alpha1/openapi_generated.go @@ -91,6 +91,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.Status": schema_pkg_apis_pingcap_v1alpha1_Status(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StmtSummary": schema_pkg_apis_pingcap_v1alpha1_StmtSummary(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StorageClaim": schema_pkg_apis_pingcap_v1alpha1_StorageClaim(ref), + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StorageMetricsStatus": schema_pkg_apis_pingcap_v1alpha1_StorageMetricsStatus(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.StorageProvider": schema_pkg_apis_pingcap_v1alpha1_StorageProvider(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.TiCDCConfig": schema_pkg_apis_pingcap_v1alpha1_TiCDCConfig(ref), "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.TiCDCSpec": schema_pkg_apis_pingcap_v1alpha1_TiCDCSpec(ref), @@ -893,12 +894,11 @@ func schema_pkg_apis_pingcap_v1alpha1_BasicAutoScalerSpec(ref common.ReferenceCa }, "metrics": { SchemaProps: spec.SchemaProps{ - Description: "metrics contains the specifications for which to use to calculate the desired replica count (the maximum replica count across all metrics will be used). The desired replica count is calculated multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa. See the individual metric source types for more information about how each type of metric must respond. 
If not set, the default metric will be set to 80% average CPU utilization.", - Type: []string{"array"}, + Type: []string{"array"}, Items: &spec.SchemaOrArray{ Schema: &spec.Schema{ SchemaProps: spec.SchemaProps{ - Ref: ref("k8s.io/api/autoscaling/v2beta2.MetricSpec"), + Ref: ref("github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric"), }, }, }, @@ -906,7 +906,7 @@ func schema_pkg_apis_pingcap_v1alpha1_BasicAutoScalerSpec(ref common.ReferenceCa }, "metricsTimeDuration": { SchemaProps: spec.SchemaProps{ - Description: "MetricsTimeDuration describe the Time duration to be queried in the Prometheus", + Description: "MetricsTimeDuration describes the Time duration to be queried in the Prometheus", Type: []string{"string"}, Format: "", }, @@ -922,7 +922,7 @@ func schema_pkg_apis_pingcap_v1alpha1_BasicAutoScalerSpec(ref common.ReferenceCa }, }, Dependencies: []string{ - "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint", "k8s.io/api/autoscaling/v2beta2.MetricSpec"}, + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint"}, } } @@ -2056,10 +2056,44 @@ func schema_pkg_apis_pingcap_v1alpha1_MetricsStatus(ref common.ReferenceCallback Format: "", }, }, + "storagePressure": { + SchemaProps: spec.SchemaProps{ + Description: "StoragePressure indicates whether storage under pressure", + Type: []string{"boolean"}, + Format: "", + }, + }, + "storagePressureStartTime": { + SchemaProps: spec.SchemaProps{ + Description: "StoragePressureStartTime indicates the timestamp of the StoragePressure fist become true from false or nil", + Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Time"), + }, + }, + "availableStorage": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + "capacityStorage": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + "baselineAvailableStorage": { + SchemaProps: spec.SchemaProps{ + Description: "BaselineAvailableStorage indicates the baseline for available storage size. 
This is calculated by the capacity storage size * storage auto-scaling baseline percent value If the AvailableStorage is less than the BaselineAvailableStorage, the database is under StoragePressure optional", + Type: []string{"string"}, + Format: "", + }, + }, }, - Required: []string{"name", "currentValue", "thresholdValue"}, + Required: []string{"name"}, }, }, + Dependencies: []string{ + "k8s.io/apimachinery/pkg/apis/meta/v1.Time"}, } } @@ -4336,6 +4370,53 @@ func schema_pkg_apis_pingcap_v1alpha1_StorageClaim(ref common.ReferenceCallback) } } +func schema_pkg_apis_pingcap_v1alpha1_StorageMetricsStatus(ref common.ReferenceCallback) common.OpenAPIDefinition { + return common.OpenAPIDefinition{ + Schema: spec.Schema{ + SchemaProps: spec.SchemaProps{ + Description: "StorageMetricsStatus describe the storage metrics status in the last auto-scaling reconciliation", + Type: []string{"object"}, + Properties: map[string]spec.Schema{ + "storagePressure": { + SchemaProps: spec.SchemaProps{ + Description: "StoragePressure indicates whether storage under pressure", + Type: []string{"boolean"}, + Format: "", + }, + }, + "storagePressureStartTime": { + SchemaProps: spec.SchemaProps{ + Description: "StoragePressureStartTime indicates the timestamp of the StoragePressure fist become true from false or nil", + Ref: ref("k8s.io/apimachinery/pkg/apis/meta/v1.Time"), + }, + }, + "availableStorage": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + "capacityStorage": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + "baselineAvailableStorage": { + SchemaProps: spec.SchemaProps{ + Description: "BaselineAvailableStorage indicates the baseline for available storage size. This is calculated by the capacity storage size * storage auto-scaling baseline percent value If the AvailableStorage is less than the BaselineAvailableStorage, the database is under StoragePressure optional", + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, + Dependencies: []string{ + "k8s.io/apimachinery/pkg/apis/meta/v1.Time"}, + } +} + func schema_pkg_apis_pingcap_v1alpha1_StorageProvider(ref common.ReferenceCallback) common.OpenAPIDefinition { return common.OpenAPIDefinition{ Schema: spec.Schema{ @@ -8095,12 +8176,11 @@ func schema_pkg_apis_pingcap_v1alpha1_TidbAutoScalerSpec(ref common.ReferenceCal }, "metrics": { SchemaProps: spec.SchemaProps{ - Description: "metrics contains the specifications for which to use to calculate the desired replica count (the maximum replica count across all metrics will be used). The desired replica count is calculated multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa. See the individual metric source types for more information about how each type of metric must respond. 
If not set, the default metric will be set to 80% average CPU utilization.", - Type: []string{"array"}, + Type: []string{"array"}, Items: &spec.SchemaOrArray{ Schema: &spec.Schema{ SchemaProps: spec.SchemaProps{ - Ref: ref("k8s.io/api/autoscaling/v2beta2.MetricSpec"), + Ref: ref("github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric"), }, }, }, @@ -8108,7 +8188,7 @@ func schema_pkg_apis_pingcap_v1alpha1_TidbAutoScalerSpec(ref common.ReferenceCal }, "metricsTimeDuration": { SchemaProps: spec.SchemaProps{ - Description: "MetricsTimeDuration describe the Time duration to be queried in the Prometheus", + Description: "MetricsTimeDuration describes the Time duration to be queried in the Prometheus", Type: []string{"string"}, Format: "", }, @@ -8124,7 +8204,7 @@ func schema_pkg_apis_pingcap_v1alpha1_TidbAutoScalerSpec(ref common.ReferenceCal }, }, Dependencies: []string{ - "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint", "k8s.io/api/autoscaling/v2beta2.MetricSpec"}, + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint"}, } } @@ -9233,12 +9313,11 @@ func schema_pkg_apis_pingcap_v1alpha1_TikvAutoScalerSpec(ref common.ReferenceCal }, "metrics": { SchemaProps: spec.SchemaProps{ - Description: "metrics contains the specifications for which to use to calculate the desired replica count (the maximum replica count across all metrics will be used). The desired replica count is calculated multiplying the ratio between the target value and the current value by the current number of pods. Ergo, metrics used must decrease as the pod count is increased, and vice-versa. See the individual metric source types for more information about how each type of metric must respond. 
If not set, the default metric will be set to 80% average CPU utilization.", - Type: []string{"array"}, + Type: []string{"array"}, Items: &spec.SchemaOrArray{ Schema: &spec.Schema{ SchemaProps: spec.SchemaProps{ - Ref: ref("k8s.io/api/autoscaling/v2beta2.MetricSpec"), + Ref: ref("github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric"), }, }, }, @@ -9246,7 +9325,7 @@ func schema_pkg_apis_pingcap_v1alpha1_TikvAutoScalerSpec(ref common.ReferenceCal }, "metricsTimeDuration": { SchemaProps: spec.SchemaProps{ - Description: "MetricsTimeDuration describe the Time duration to be queried in the Prometheus", + Description: "MetricsTimeDuration describes the Time duration to be queried in the Prometheus", Type: []string{"string"}, Format: "", }, @@ -9262,7 +9341,7 @@ func schema_pkg_apis_pingcap_v1alpha1_TikvAutoScalerSpec(ref common.ReferenceCal }, }, Dependencies: []string{ - "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint", "k8s.io/api/autoscaling/v2beta2.MetricSpec"}, + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.CustomMetric", "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1.ExternalEndpoint"}, } } diff --git a/pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go b/pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go index 825967de4a..85d23774f6 100644 --- a/pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go +++ b/pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go @@ -115,6 +115,19 @@ type BasicAutoScalerSpec struct { // +optional ScaleOutIntervalSeconds *int32 `json:"scaleOutIntervalSeconds,omitempty"` + // +optional + Metrics []CustomMetric `json:"metrics,omitempty"` + + // MetricsTimeDuration describes the Time duration to be queried in the Prometheus + // +optional + MetricsTimeDuration *string `json:"metricsTimeDuration,omitempty"` + // ExternalEndpoint makes the auto-scaler controller able to query the external service + // to fetch the recommended replicas for TiKV/TiDB + // +optional + ExternalEndpoint *ExternalEndpoint `json:"externalEndpoint,omitempty"` +} + +type CustomMetric struct { // metrics contains the specifications for which to use to calculate the // desired replica count (the maximum replica count across all metrics will // be used). The desired replica count is calculated multiplying the @@ -122,17 +135,21 @@ type BasicAutoScalerSpec struct { // number of pods. Ergo, metrics used must decrease as the pod count is // increased, and vice-versa. See the individual metric source types for // more information about how each type of metric must respond. - // If not set, the default metric will be set to 80% average CPU utilization. + // If not set, the auto-scaling won't happen. // +optional - Metrics []v2beta2.MetricSpec `json:"metrics,omitempty"` - - // MetricsTimeDuration describe the Time duration to be queried in the Prometheus + v2beta2.MetricSpec `json:",inline"` + // LeastStoragePressurePeriodSeconds is only for the storage auto-scaling case when the resource name in the metricSpec + // is `Storage`. When the Storage metrics meet the pressure, Operator would wait + // LeastStoragePressurePeriodSeconds duration then able to scale out. 
+ // If not set, the default value is `300` // +optional - MetricsTimeDuration *string `json:"metricsTimeDuration,omitempty"` - // ExternalEndpoint makes the auto-scaler controller able to query the external service - // to fetch the recommended replicas for TiKV/TiDB + LeastStoragePressurePeriodSeconds *int64 `json:"leastStoragePressurePeriodSeconds,omitempty"` + // LeastRemainAvailableStoragePercent indicates the least remaining available storage percent compare to + // the capacity storage. If the available storage is lower than the capacity storage * LeastRemainAvailableStoragePercent, + // the storage status will become storage pressure and ready to be scaled out. + // LeastRemainAvailableStoragePercent should between 5 and 90. If not set, the default value would be 10 // +optional - ExternalEndpoint *ExternalEndpoint `json:"externalEndpoint,omitempty"` + LeastRemainAvailableStoragePercent *int64 `json:"leastRemainAvailableStoragePercent,omitempty"` } // +k8s:openapi-gen=true @@ -196,9 +213,33 @@ type MetricsStatus struct { // Name indicates the metrics name Name string `json:"name"` // CurrentValue indicates the value calculated in the last auto-scaling reconciliation - CurrentValue string `json:"currentValue"` + // +optional + CurrentValue *string `json:"currentValue,omitempty"` // TargetValue indicates the threshold value for this metrics in auto-scaling - ThresholdValue string `json:"thresholdValue"` + // +optional + ThresholdValue *string `json:"thresholdValue,omitempty"` + // +optional + StorageMetricsStatus `json:",inline"` +} + +// +k8s:openapi-gen=true +// StorageMetricsStatus describe the storage metrics status in the last auto-scaling reconciliation +type StorageMetricsStatus struct { + // StoragePressure indicates whether storage under pressure + // +optional + StoragePressure *bool `json:"storagePressure,omitempty"` + // StoragePressureStartTime indicates the timestamp of the StoragePressure fist become true from false or nil + // +optional + StoragePressureStartTime *metav1.Time `json:"storagePressureStartTime,omitempty"` + // +optional + AvailableStorage *string `json:"availableStorage,omitempty"` + // +optional + CapacityStorage *string `json:"capacityStorage,omitempty"` + // BaselineAvailableStorage indicates the baseline for available storage size. 
+ // This is calculated by the capacity storage size * storage auto-scaling baseline percent value + // If the AvailableStorage is less than the BaselineAvailableStorage, the database is under StoragePressure + // optional + BaselineAvailableStorage *string `json:"baselineAvailableStorage,omitempty"` } // +k8s:openapi-gen=true diff --git a/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go index 227c6de239..bdbf8b72c9 100644 --- a/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go @@ -21,7 +21,6 @@ import ( time "time" appsv1 "k8s.io/api/apps/v1" - v2beta2 "k8s.io/api/autoscaling/v2beta2" v1 "k8s.io/api/core/v1" extensionsv1beta1 "k8s.io/api/extensions/v1beta1" v1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1" @@ -370,7 +369,7 @@ func (in *BasicAutoScalerSpec) DeepCopyInto(out *BasicAutoScalerSpec) { } if in.Metrics != nil { in, out := &in.Metrics, &out.Metrics - *out = make([]v2beta2.MetricSpec, len(*in)) + *out = make([]CustomMetric, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -404,7 +403,9 @@ func (in *BasicAutoScalerStatus) DeepCopyInto(out *BasicAutoScalerStatus) { if in.MetricsStatusList != nil { in, out := &in.MetricsStatusList, &out.MetricsStatusList *out = make([]MetricsStatus, len(*in)) - copy(*out, *in) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } } if in.LastAutoScalingTimestamp != nil { in, out := &in.LastAutoScalingTimestamp, &out.LastAutoScalingTimestamp @@ -794,6 +795,33 @@ func (in *CrdKinds) DeepCopy() *CrdKinds { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CustomMetric) DeepCopyInto(out *CustomMetric) { + *out = *in + in.MetricSpec.DeepCopyInto(&out.MetricSpec) + if in.LeastStoragePressurePeriodSeconds != nil { + in, out := &in.LeastStoragePressurePeriodSeconds, &out.LeastStoragePressurePeriodSeconds + *out = new(int64) + **out = **in + } + if in.LeastRemainAvailableStoragePercent != nil { + in, out := &in.LeastRemainAvailableStoragePercent, &out.LeastRemainAvailableStoragePercent + *out = new(int64) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CustomMetric. +func (in *CustomMetric) DeepCopy() *CustomMetric { + if in == nil { + return nil + } + out := new(CustomMetric) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DashboardConfig) DeepCopyInto(out *DashboardConfig) { *out = *in @@ -1738,6 +1766,17 @@ func (in *MasterKeyKMSConfig) DeepCopy() *MasterKeyKMSConfig { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MetricsStatus) DeepCopyInto(out *MetricsStatus) { *out = *in + if in.CurrentValue != nil { + in, out := &in.CurrentValue, &out.CurrentValue + *out = new(string) + **out = **in + } + if in.ThresholdValue != nil { + in, out := &in.ThresholdValue, &out.ThresholdValue + *out = new(string) + **out = **in + } + in.StorageMetricsStatus.DeepCopyInto(&out.StorageMetricsStatus) return } @@ -3625,6 +3664,46 @@ func (in *StorageClaim) DeepCopy() *StorageClaim { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *StorageMetricsStatus) DeepCopyInto(out *StorageMetricsStatus) { + *out = *in + if in.StoragePressure != nil { + in, out := &in.StoragePressure, &out.StoragePressure + *out = new(bool) + **out = **in + } + if in.StoragePressureStartTime != nil { + in, out := &in.StoragePressureStartTime, &out.StoragePressureStartTime + *out = (*in).DeepCopy() + } + if in.AvailableStorage != nil { + in, out := &in.AvailableStorage, &out.AvailableStorage + *out = new(string) + **out = **in + } + if in.CapacityStorage != nil { + in, out := &in.CapacityStorage, &out.CapacityStorage + *out = new(string) + **out = **in + } + if in.BaselineAvailableStorage != nil { + in, out := &in.BaselineAvailableStorage, &out.BaselineAvailableStorage + *out = new(string) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new StorageMetricsStatus. +func (in *StorageMetricsStatus) DeepCopy() *StorageMetricsStatus { + if in == nil { + return nil + } + out := new(StorageMetricsStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *StorageProvider) DeepCopyInto(out *StorageProvider) { *out = *in diff --git a/pkg/autoscaler/autoscaler/autoscaler_manager.go b/pkg/autoscaler/autoscaler/autoscaler_manager.go index 1ce1d19fe9..951242087b 100644 --- a/pkg/autoscaler/autoscaler/autoscaler_manager.go +++ b/pkg/autoscaler/autoscaler/autoscaler_manager.go @@ -16,7 +16,6 @@ package autoscaler import ( "encoding/json" "fmt" - "strconv" "strings" "time" @@ -147,47 +146,22 @@ func (am *autoScalerManager) updateAutoScaling(oldTc *v1alpha1.TidbCluster, if tac.Annotations == nil { tac.Annotations = map[string]string{} } - f := func(key string) (*time.Time, error) { - v, ok := tac.Annotations[key] - if ok { - ts, err := strconv.ParseInt(v, 10, 64) - if err != nil { - klog.Errorf("failed to convert label[%s] key to int64, err:%v", key, err) - return nil, err - } - t := time.Unix(ts, 0) - return &t, nil - } - return nil, nil - } - - tac.Annotations[label.AnnLastSyncingTimestamp] = fmt.Sprintf("%d", time.Now().Unix()) - + now := time.Now() + tac.Annotations[label.AnnLastSyncingTimestamp] = fmt.Sprintf("%d", now.Unix()) if tac.Spec.TiKV != nil { if oldTc.Status.TiKV.StatefulSet != nil { tac.Status.TiKV.CurrentReplicas = oldTc.Status.TiKV.StatefulSet.CurrentReplicas } - lastTimestamp, err := f(label.AnnTiKVLastAutoScalingTimestamp) - if err != nil { - return err - } - if lastTimestamp != nil { - tac.Status.TiKV.LastAutoScalingTimestamp = &metav1.Time{Time: *lastTimestamp} - } + tac.Status.TiKV.LastAutoScalingTimestamp = &metav1.Time{Time: now} } else { tac.Status.TiKV = nil } + if tac.Spec.TiDB != nil { if oldTc.Status.TiDB.StatefulSet != nil { tac.Status.TiDB.CurrentReplicas = oldTc.Status.TiDB.StatefulSet.CurrentReplicas } - lastTimestamp, err := f(label.AnnTiDBLastAutoScalingTimestamp) - if err != nil { - return err - } - if lastTimestamp != nil { - tac.Status.TiDB.LastAutoScalingTimestamp = &metav1.Time{Time: *lastTimestamp} - } + tac.Status.TiDB.LastAutoScalingTimestamp = &metav1.Time{Time: now} } else { tac.Status.TiDB = nil } diff --git a/pkg/autoscaler/autoscaler/calculate/calculate.go b/pkg/autoscaler/autoscaler/calculate/calculate.go index 6509712e2b..9194c492c1 100644 --- a/pkg/autoscaler/autoscaler/calculate/calculate.go +++ b/pkg/autoscaler/autoscaler/calculate/calculate.go @@ -24,12 +24,12 @@ import ( 
"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" promClient "github.com/prometheus/client_golang/api" - autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog" ) const ( + TikvSumStorageMetricsPattern = `sum(tikv_store_size_bytes{cluster="%s", type="%s"}) by (cluster)` TikvSumCpuMetricsPattern = `sum(increase(tikv_thread_cpu_seconds_total{cluster="%s"}[%s])) by (instance)` TidbSumCpuMetricsPattern = `sum(increase(process_cpu_seconds_total{cluster="%s",job="tidb"}[%s])) by (instance)` InvalidTacMetricConfigureMsg = "tac[%s/%s] metric configuration invalid" @@ -44,7 +44,6 @@ type SingleQuery struct { Timestamp int64 Quary string Instances []string - Metric autoscalingv2beta2.MetricSpec } func queryMetricsFromPrometheus(tac *v1alpha1.TidbClusterAutoScaler, client promClient.Client, sq *SingleQuery, resp *Response) error { @@ -92,6 +91,7 @@ func sumForEachInstance(instances []string, resp *Response) (float64, error) { if len(resp.Data.Result) < 1 { return 0, fmt.Errorf("metrics Response return zero info") } + for _, r := range resp.Data.Result { if s.Has(r.Metric.Instance) { v, err := strconv.ParseFloat(r.Value[1].(string), 64) diff --git a/pkg/autoscaler/autoscaler/calculate/cpu.go b/pkg/autoscaler/autoscaler/calculate/cpu.go index 2d803fe205..36e763f925 100644 --- a/pkg/autoscaler/autoscaler/calculate/cpu.go +++ b/pkg/autoscaler/autoscaler/calculate/cpu.go @@ -20,7 +20,9 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" promClient "github.com/prometheus/client_golang/api" appsv1 "k8s.io/api/apps/v1" + autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" corev1 "k8s.io/api/core/v1" + "k8s.io/utils/pointer" ) const ( @@ -29,10 +31,8 @@ const ( //TODO: create issue to explain how auto-scaling algorithm based on cpu metrics work func CalculateRecomendedReplicasByCpuCosts(tac *v1alpha1.TidbClusterAutoScaler, sq *SingleQuery, sts *appsv1.StatefulSet, - client promClient.Client, memberType v1alpha1.MemberType, duration time.Duration) (int32, error) { - metric := sq.Metric + client promClient.Client, memberType v1alpha1.MemberType, duration time.Duration, metric autoscalingv2beta2.MetricSpec) (int32, error) { instances := sq.Instances - if metric.Resource == nil || metric.Resource.Target.AverageUtilization == nil { return -1, fmt.Errorf(InvalidTacMetricConfigureMsg, tac.Namespace, tac.Name) } @@ -66,9 +66,9 @@ func CalculateRecomendedReplicasByCpuCosts(tac *v1alpha1.TidbClusterAutoScaler, return -1, err } metrics := v1alpha1.MetricsStatus{ - Name: string(MetricTypeCPU), - CurrentValue: fmt.Sprintf("%v", cpuSecsTotal), - ThresholdValue: fmt.Sprintf("%v", expectedCpuSecsTotal), + Name: string(corev1.ResourceCPU), + CurrentValue: pointer.StringPtr(fmt.Sprintf("%v", cpuSecsTotal)), + ThresholdValue: pointer.StringPtr(fmt.Sprintf("%v", expectedCpuSecsTotal)), } if memberType == v1alpha1.TiKVMemberType { addMetricsStatusIntoMetricsStatusList(metrics, &tac.Status.TiKV.BasicAutoScalerStatus) diff --git a/pkg/autoscaler/autoscaler/calculate/storage.go b/pkg/autoscaler/autoscaler/calculate/storage.go new file mode 100644 index 0000000000..5e1497a8f9 --- /dev/null +++ b/pkg/autoscaler/autoscaler/calculate/storage.go @@ -0,0 +1,132 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package calculate + +import ( + "fmt" + "strconv" + "time" + + "github.com/dustin/go-humanize" + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" + promClient "github.com/prometheus/client_golang/api" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" +) + +const ( + parseError = "tac[%s/%s] parse size failed,err:%v" +) + +func CalculateWhetherStoragePressure(tac *v1alpha1.TidbClusterAutoScaler, capacitySq, availableSq *SingleQuery, + client promClient.Client, metric v1alpha1.CustomMetric) (bool, error) { + if metric.Resource == nil || + metric.Resource.Name != corev1.ResourceStorage || + metric.LeastRemainAvailableStoragePercent == nil { + return false, fmt.Errorf("tac[%s/%s] didn't set storage metric correctly", tac.Namespace, tac.Name) + } + + // query total available storage size + resp := &Response{} + err := queryMetricsFromPrometheus(tac, client, availableSq, resp) + if err != nil { + return false, err + } + var availableSize uint64 + for _, r := range resp.Data.Result { + if r.Metric.Cluster == tac.Spec.Cluster.Name { + availableSize, err = strconv.ParseUint(r.Value[1].(string), 10, 64) + if err != nil { + return false, fmt.Errorf(parseError, tac.Namespace, tac.Name, err) + } + } + } + + // query total capacity storage size + resp = &Response{} + err = queryMetricsFromPrometheus(tac, client, capacitySq, resp) + if err != nil { + return false, err + } + var capacitySize uint64 + for _, r := range resp.Data.Result { + if r.Metric.Cluster == tac.Spec.Cluster.Name { + capacitySize, err = strconv.ParseUint(r.Value[1].(string), 10, 64) + if err != nil { + return false, fmt.Errorf(parseError, tac.Namespace, tac.Name, err) + } + } + } + v := *metric.LeastRemainAvailableStoragePercent + baselineAvailableSize := (capacitySize / 100) * uint64(v) + storagePressure := false + if availableSize < baselineAvailableSize { + storagePressure = true + } + + var newStatus, oldStatus *v1alpha1.MetricsStatus + for _, m := range tac.Status.TiKV.MetricsStatusList { + if m.Name == string(corev1.ResourceStorage) { + oldStatus = &m + break + } + } + storageMetrics := v1alpha1.StorageMetricsStatus{ + StoragePressure: pointer.BoolPtr(storagePressure), + AvailableStorage: pointer.StringPtr(humanize.Bytes(availableSize)), + CapacityStorage: pointer.StringPtr(humanize.Bytes(capacitySize)), + BaselineAvailableStorage: pointer.StringPtr(humanize.Bytes(baselineAvailableSize)), + } + if oldStatus != nil { + oldStatus.StoragePressure = storageMetrics.StoragePressure + oldStatus.AvailableStorage = storageMetrics.AvailableStorage + oldStatus.CapacityStorage = storageMetrics.CapacityStorage + oldStatus.BaselineAvailableStorage = storageMetrics.BaselineAvailableStorage + newStatus = oldStatus + } else { + newStatus = &v1alpha1.MetricsStatus{ + Name: string(corev1.ResourceStorage), + StorageMetricsStatus: storageMetrics, + } + } + + if storagePressure { + if !isStoragePressureStartTimeRecordAlready(tac.Status) { + newStatus.StorageMetricsStatus.StoragePressureStartTime = &metav1.Time{Time: time.Now()} + } + } else { + newStatus.StoragePressureStartTime = nil + } + addMetricsStatusIntoMetricsStatusList(*newStatus, 
&tac.Status.TiKV.BasicAutoScalerStatus) + return storagePressure, nil +} + +// TODO: add unit test +func isStoragePressureStartTimeRecordAlready(tacStatus v1alpha1.TidbClusterAutoSclaerStatus) bool { + if tacStatus.TiKV == nil { + return false + } + if len(tacStatus.TiKV.MetricsStatusList) < 1 { + return false + } + for _, metricsStatus := range tacStatus.TiKV.MetricsStatusList { + if metricsStatus.Name == "storage" { + if metricsStatus.StoragePressureStartTime != nil { + return true + } + } + } + return false +} diff --git a/pkg/autoscaler/autoscaler/calculate/util.go b/pkg/autoscaler/autoscaler/calculate/util.go index 1523ba74e3..40d586219e 100644 --- a/pkg/autoscaler/autoscaler/calculate/util.go +++ b/pkg/autoscaler/autoscaler/calculate/util.go @@ -22,18 +22,10 @@ import ( corev1 "k8s.io/api/core/v1" ) -// MetricType describe the current Supported Metric Type to calculate the recommended Replicas -type MetricType string - -const ( - MetricTypeCPU MetricType = "cpu" - //metricTypeQPS MetricType = "qps" -) - // currently, we only choose one metrics to be computed. // If there exists several metrics, we tend to choose ResourceMetricSourceType metric -func FilterMetrics(metrics []autoscalingv2beta2.MetricSpec, name corev1.ResourceName) []autoscalingv2beta2.MetricSpec { - var list []autoscalingv2beta2.MetricSpec +func FilterMetrics(metrics []v1alpha1.CustomMetric, name corev1.ResourceName) []v1alpha1.CustomMetric { + var list []v1alpha1.CustomMetric for _, m := range metrics { if m.Type == autoscalingv2beta2.ResourceMetricSourceType && m.Resource != nil && m.Resource.Name == name { list = append(list, m) @@ -43,14 +35,6 @@ func FilterMetrics(metrics []autoscalingv2beta2.MetricSpec, name corev1.Resource return list } -// genMetricType return the supported MetricType in Operator by kubernetes auto-scaling MetricType -func GenMetricType(tac *v1alpha1.TidbClusterAutoScaler, metric autoscalingv2beta2.MetricSpec) (MetricType, error) { - if metric.Type == autoscalingv2beta2.ResourceMetricSourceType && metric.Resource != nil && metric.Resource.Name == corev1.ResourceCPU { - return MetricTypeCPU, nil - } - return "", fmt.Errorf(InvalidTacMetricConfigureMsg, tac.Namespace, tac.Name) -} - // filterContainer is to filter the specific container from the given statefulset(tidb/tikv) func filterContainer(tac *v1alpha1.TidbClusterAutoScaler, sts *appsv1.StatefulSet, containerName string) (*corev1.Container, error) { for _, c := range sts.Spec.Template.Spec.Containers { diff --git a/pkg/autoscaler/autoscaler/tidb_autoscaler.go b/pkg/autoscaler/autoscaler/tidb_autoscaler.go index 6a40d4ae18..83c79e0bcb 100644 --- a/pkg/autoscaler/autoscaler/tidb_autoscaler.go +++ b/pkg/autoscaler/autoscaler/tidb_autoscaler.go @@ -108,10 +108,9 @@ func calculateTidbMetrics(tac *v1alpha1.TidbClusterAutoScaler, sts *appsv1.State Endpoint: ep, Timestamp: time.Now().Unix(), Instances: instances, - Metric: metrics[0], Quary: fmt.Sprintf(calculate.TidbSumCpuMetricsPattern, tac.Spec.Cluster.Name, *tac.Spec.TiDB.MetricsTimeDuration), } - return calculate.CalculateRecomendedReplicasByCpuCosts(tac, sq, sts, client, v1alpha1.TiDBMemberType, duration) + return calculate.CalculateRecomendedReplicasByCpuCosts(tac, sq, sts, client, v1alpha1.TiDBMemberType, duration, metrics[0].MetricSpec) } return -1, fmt.Errorf(calculate.InvalidTacMetricConfigureMsg, tac.Namespace, tac.Name) } diff --git a/pkg/autoscaler/autoscaler/tikv_autoscaler.go b/pkg/autoscaler/autoscaler/tikv_autoscaler.go index 9c48094eea..e663d9d7f9 100644 --- 
a/pkg/autoscaler/autoscaler/tikv_autoscaler.go +++ b/pkg/autoscaler/autoscaler/tikv_autoscaler.go @@ -20,11 +20,13 @@ import ( "github.com/pingcap/advanced-statefulset/client/apis/apps/v1/helper" "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/autoscaler/autoscaler/calculate" + "github.com/pingcap/tidb-operator/pkg/controller" "github.com/pingcap/tidb-operator/pkg/label" operatorUtils "github.com/pingcap/tidb-operator/pkg/util" promClient "github.com/prometheus/client_golang/api" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/klog" ) func (am *autoScalerManager) syncTiKV(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler) error { @@ -69,6 +71,10 @@ func calculateTiKVMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.Tidb if err != nil { return err } + if len(tac.Spec.TiKV.Metrics) < 1 { + klog.V(4).Infof("tac[%s/%s] have no setting, skip auto-scaling", tac.Namespace, tac.Name) + return nil + } // check CPU metrics := calculate.FilterMetrics(tac.Spec.TiKV.Metrics, corev1.ResourceCPU) @@ -77,17 +83,72 @@ func calculateTiKVMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.Tidb Endpoint: ep, Timestamp: time.Now().Unix(), Instances: instances, - Metric: metrics[0], Quary: fmt.Sprintf(calculate.TikvSumCpuMetricsPattern, tac.Spec.Cluster.Name, *tac.Spec.TiKV.MetricsTimeDuration), } - return calculateTiKVCPUMetrics(tac, tc, sts, sq, client, duration) + return calculateTiKVCPUMetrics(tac, tc, sts, sq, client, duration, metrics[0]) } + + // check storage + metrics = calculate.FilterMetrics(tac.Spec.TiKV.Metrics, corev1.ResourceStorage) + if len(metrics) > 0 { + now := time.Now().Unix() + capacitySq := &calculate.SingleQuery{ + Endpoint: ep, + Timestamp: now, + Instances: instances, + Quary: fmt.Sprintf(calculate.TikvSumStorageMetricsPattern, tac.Spec.Cluster.Name, "capacity"), + } + availableSq := &calculate.SingleQuery{ + Endpoint: ep, + Timestamp: now, + Instances: instances, + Quary: fmt.Sprintf(calculate.TikvSumStorageMetricsPattern, tac.Spec.Cluster.Name, "available"), + } + return calculateTiKVStorageMetrics(tac, tc, capacitySq, availableSq, client, metrics[0]) + } + + // none metrics selected, end auto-scaling return nil } -func calculateTiKVCPUMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.TidbCluster, sts *appsv1.StatefulSet, sq *calculate.SingleQuery, client promClient.Client, duration time.Duration) error { +func calculateTiKVStorageMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.TidbCluster, + capSq, avaSq *calculate.SingleQuery, client promClient.Client, metric v1alpha1.CustomMetric) error { + if tc.Spec.TiKV.Replicas >= tac.Spec.TiKV.MaxReplicas { + klog.V(4).Infof("tac[%s/%s]'s tikv won't scale out by storage pressure due to maxReplicas", tac.Namespace, tac.Name) + return nil + } + intervalSeconds := tac.Spec.TiKV.ScaleOutIntervalSeconds + ableToScale, err := checkTiKVAutoScalingInterval(tac, *intervalSeconds) + if err != nil { + return err + } + if !ableToScale { + klog.Infof("tac[%s/%s]'s tikv won't scale out by storage pressure due to scale-out cool-down interval", tac.Namespace, tac.Name) + return nil + } + storagePressure, err := calculate.CalculateWhetherStoragePressure(tac, capSq, avaSq, client, metric) + if err != nil { + return err + } + if !storagePressure { + return nil + } + ableToScale, err = checkWhetherAbleToScaleDueToStorage(tac, metric, time.Now(), controller.ResyncDuration) + if err != nil { + return err + } + if !ableToScale { + return nil + } + 
currentReplicas := tc.Spec.TiKV.Replicas + targetReplicas := currentReplicas + 1 + return updateTacIfTiKVScale(tc, tac, targetReplicas) +} + +func calculateTiKVCPUMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.TidbCluster, sts *appsv1.StatefulSet, + sq *calculate.SingleQuery, client promClient.Client, duration time.Duration, metric v1alpha1.CustomMetric) error { - targetReplicas, err := calculate.CalculateRecomendedReplicasByCpuCosts(tac, sq, sts, client, v1alpha1.TiKVMemberType, duration) + targetReplicas, err := calculate.CalculateRecomendedReplicasByCpuCosts(tac, sq, sts, client, v1alpha1.TiKVMemberType, duration, metric.MetricSpec) if err != nil { return err } @@ -96,14 +157,25 @@ func calculateTiKVCPUMetrics(tac *v1alpha1.TidbClusterAutoScaler, tc *v1alpha1.T return nil } currentReplicas := int32(len(sq.Instances)) - err = syncTiKVAfterCalculated(tc, tac, currentReplicas, targetReplicas) + intervalSeconds := tac.Spec.TiKV.ScaleInIntervalSeconds + ableToScale, err := checkTiKVAutoScalingInterval(tac, *intervalSeconds) + if err != nil { + return err + } + if !ableToScale { + return nil + } + err = updateTacIfTiKVScale(tc, tac, targetReplicas) if err != nil { return err } return addAnnotationMarkIfScaleOutDueToCPUMetrics(tc, currentReplicas, targetReplicas, sts) } -func checkTiKVAutoScaling(tac *v1alpha1.TidbClusterAutoScaler, intervalSeconds int32) (bool, error) { +// checkTiKVAutoScalingInterval check the each 2 auto-scaling interval depends on the scaling-in and scaling-out +// Note that for the storage scaling, we will check scale-out interval before we start to scraping metrics, +// and for the cpu scaling, we will check scale-in/scale-out interval after we finish calculating metrics. +func checkTiKVAutoScalingInterval(tac *v1alpha1.TidbClusterAutoScaler, intervalSeconds int32) (bool, error) { if tac.Annotations == nil { tac.Annotations = map[string]string{} } @@ -117,28 +189,35 @@ func checkTiKVAutoScaling(tac *v1alpha1.TidbClusterAutoScaler, intervalSeconds i return true, nil } -// syncTiKVAfterCalculated would check the Consecutive count to avoid jitter, and it would also check the interval -// duration between each auto-scaling. If either of them is not meet, the auto-scaling would be rejected. -// If the auto-scaling is permitted, the timestamp would be recorded and the Consecutive count would be zeroed. -// The currentReplicas of TiKV calculated in auto-scaling is the count of the StateUp TiKV instance, so we need to -// add the number of other state tikv instance replicas when we update the TidbCluster.Spec.TiKV.Replicas -func syncTiKVAfterCalculated(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler, currentReplicas, recommendedReplicas int32) error { - intervalSeconds := tac.Spec.TiKV.ScaleInIntervalSeconds - if recommendedReplicas > currentReplicas { - intervalSeconds = tac.Spec.TiKV.ScaleOutIntervalSeconds +// checkWhetherAbleToScaleDueToStorage will check whether the storage pressure status have been existed for as least +// LeastStoragePressurePeriodSeconds duration. If not, the operator would wait next round to check again. 
+func checkWhetherAbleToScaleDueToStorage(tac *v1alpha1.TidbClusterAutoScaler, metric v1alpha1.CustomMetric, now time.Time, resyncDuration time.Duration) (bool, error) { + if metric.LeastStoragePressurePeriodSeconds == nil { + return false, fmt.Errorf("tac[%s/%s]'s leastStoragePressurePeriodSeconds must be setted before scale out in storage", tac.Namespace, tac.Name) } - ableToScale, err := checkTiKVAutoScaling(tac, *intervalSeconds) - if err != nil { - return err + if tac.Status.TiKV.LastAutoScalingTimestamp == nil { + return false, fmt.Errorf("tac[%s/%s]'s tikv status LastAutoScalingTimestamp haven't been set", tac.Namespace, tac.Name) } - if !ableToScale { - return nil + if now.Sub(tac.Status.TiKV.LastAutoScalingTimestamp.Time) > 3*resyncDuration { + klog.Infof("tac[%s/%s]'s tikv status LastAutoScalingTimestamp timeout", tac.Namespace, tac.Name) + return false, nil + } + for _, m := range tac.Status.TiKV.MetricsStatusList { + if m.Name == string(corev1.ResourceStorage) { + if m.StoragePressure == nil || m.StoragePressureStartTime == nil { + return false, nil + } + x := now.Sub(m.StoragePressureStartTime.Time).Seconds() + if x >= float64(*metric.LeastStoragePressurePeriodSeconds) { + return true, nil + } + } } - return updateTcTiKVIfScale(tc, tac, recommendedReplicas) + return false, nil } -// we record the auto-scaling out slot for tikv, in order to add special hot labels when they are created -func updateTcTiKVIfScale(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler, recommendedReplicas int32) error { +// updateTacIfTiKVScale update the tac status and syncing annotations if tikv scale-in/out +func updateTacIfTiKVScale(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler, recommendedReplicas int32) error { tac.Annotations[label.AnnTiKVLastAutoScalingTimestamp] = fmt.Sprintf("%d", time.Now().Unix()) tc.Spec.TiKV.Replicas = recommendedReplicas tac.Status.TiKV.RecommendedReplicas = recommendedReplicas diff --git a/pkg/autoscaler/autoscaler/tikv_autoscaler_test.go b/pkg/autoscaler/autoscaler/tikv_autoscaler_test.go new file mode 100644 index 0000000000..f904256202 --- /dev/null +++ b/pkg/autoscaler/autoscaler/tikv_autoscaler_test.go @@ -0,0 +1,149 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package autoscaler + +import ( + "testing" + "time" + + . 
"github.com/onsi/gomega" + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" + autoscaling "k8s.io/api/autoscaling/v2beta2" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" +) + +func TestCheckWhetherAbleToScaleDueToStorage(t *testing.T) { + g := NewGomegaWithT(t) + now := time.Now() + tac1 := newStorageMetricTac() + tac1.Status = v1alpha1.TidbClusterAutoSclaerStatus{ + TiKV: &v1alpha1.TikvAutoScalerStatus{ + BasicAutoScalerStatus: v1alpha1.BasicAutoScalerStatus{ + MetricsStatusList: []v1alpha1.MetricsStatus{ + { + Name: string(corev1.ResourceStorage), + StorageMetricsStatus: v1alpha1.StorageMetricsStatus{ + StoragePressure: pointer.BoolPtr(true), + StoragePressureStartTime: &metav1.Time{ + Time: now.Add(-2 * time.Minute), + }, + }, + }, + }, + LastAutoScalingTimestamp: &metav1.Time{ + Time: now.Add(-10 * time.Second), + }, + }, + }, + } + tac2 := newStorageMetricTac() + tac2.Status = v1alpha1.TidbClusterAutoSclaerStatus{ + TiKV: &v1alpha1.TikvAutoScalerStatus{ + BasicAutoScalerStatus: v1alpha1.BasicAutoScalerStatus{ + MetricsStatusList: []v1alpha1.MetricsStatus{ + { + Name: string(corev1.ResourceStorage), + StorageMetricsStatus: v1alpha1.StorageMetricsStatus{ + StoragePressure: pointer.BoolPtr(true), + StoragePressureStartTime: &metav1.Time{ + Time: now.Add(-20 * time.Second), + }, + }, + }, + }, + LastAutoScalingTimestamp: &metav1.Time{ + Time: now.Add(-10 * time.Second), + }, + }, + }, + } + tac3 := newStorageMetricTac() + tac3.Status = v1alpha1.TidbClusterAutoSclaerStatus{ + TiKV: &v1alpha1.TikvAutoScalerStatus{ + BasicAutoScalerStatus: v1alpha1.BasicAutoScalerStatus{ + MetricsStatusList: []v1alpha1.MetricsStatus{ + { + Name: string(corev1.ResourceStorage), + StorageMetricsStatus: v1alpha1.StorageMetricsStatus{ + StoragePressure: pointer.BoolPtr(true), + StoragePressureStartTime: &metav1.Time{ + Time: now.Add(-2 * time.Minute), + }, + }, + }, + }, + LastAutoScalingTimestamp: &metav1.Time{ + Time: now.Add(-10 * time.Minute), + }, + }, + }, + } + testcases := []struct { + name string + tac *v1alpha1.TidbClusterAutoScaler + metric v1alpha1.CustomMetric + expectedResult bool + }{ + { + name: "experienced disk pressure for enough time", + tac: tac1, + metric: v1alpha1.CustomMetric{ + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(60), + }, + expectedResult: true, + }, + { + name: "haven't experienced disk pressure for enough time", + tac: tac2, + metric: v1alpha1.CustomMetric{ + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(60), + }, + expectedResult: false, + }, + { + name: "last syncing time is stale", + tac: tac3, + metric: v1alpha1.CustomMetric{ + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(60), + }, + expectedResult: false, + }, + } + + for _, testcase := range testcases { + t.Run(testcase.name, func(t *testing.T) { + result, err := checkWhetherAbleToScaleDueToStorage(testcase.tac, testcase.metric, now, 30*time.Second) + g.Expect(err).Should(BeNil()) + g.Expect(result).Should(Equal(testcase.expectedResult)) + }) + } +} + +func newStorageMetricTac() *v1alpha1.TidbClusterAutoScaler { + tac := newTidbClusterAutoScaler() + tac.Spec.TiKV.Metrics = []v1alpha1.CustomMetric{ + { + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(60), + MetricSpec: autoscaling.MetricSpec{ + Type: autoscaling.ResourceMetricSourceType, + Resource: &autoscaling.ResourceMetricSource{ + Name: corev1.ResourceStorage, + }, + }, + }, + } + return tac +} diff --git a/pkg/autoscaler/autoscaler/util.go 
b/pkg/autoscaler/autoscaler/util.go index 1f753b4adf..e221766247 100644 --- a/pkg/autoscaler/autoscaler/util.go +++ b/pkg/autoscaler/autoscaler/util.go @@ -22,21 +22,10 @@ import ( "github.com/pingcap/tidb-operator/pkg/label" operatorUtils "github.com/pingcap/tidb-operator/pkg/util" appsv1 "k8s.io/api/apps/v1" - autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" corev1 "k8s.io/api/core/v1" "k8s.io/utils/pointer" ) -var defaultMetricSpec = autoscalingv2beta2.MetricSpec{ - Type: autoscalingv2beta2.ResourceMetricSourceType, - Resource: &autoscalingv2beta2.ResourceMetricSource{ - Name: corev1.ResourceCPU, - Target: autoscalingv2beta2.MetricTarget{ - AverageUtilization: pointer.Int32Ptr(80), - }, - }, -} - // checkStsAutoScalingPrerequisites would check the sts status to ensure wouldn't happen during // upgrading, scaling func checkStsAutoScalingPrerequisites(set *appsv1.StatefulSet) bool { @@ -60,7 +49,7 @@ func checkStsAutoScalingInterval(tac *v1alpha1.TidbClusterAutoScaler, intervalSe } t, err := strconv.ParseInt(lastAutoScalingTimestamp, 10, 64) if err != nil { - return false, err + return false, fmt.Errorf("tac[%s/%s] parse last auto-scaling timestamp failed,err:%v", tac.Namespace, tac.Name, err) } if intervalSeconds > int32(time.Now().Sub(time.Unix(t, 0)).Seconds()) { return false, nil @@ -132,13 +121,21 @@ func defaultTAC(tac *v1alpha1.TidbClusterAutoScaler) { } // If ExternalEndpoint is not provided, we would set default metrics if tac.Spec.TiKV.ExternalEndpoint == nil { - if len(tac.Spec.TiKV.Metrics) == 0 { - tac.Spec.TiKV.Metrics = append(tac.Spec.TiKV.Metrics, defaultMetricSpec) - } if tac.Spec.TiKV.MetricsTimeDuration == nil { tac.Spec.TiKV.MetricsTimeDuration = pointer.StringPtr("3m") } } + for id, m := range tac.Spec.TiKV.Metrics { + if m.Resource != nil && m.Resource.Name == corev1.ResourceStorage { + if m.LeastStoragePressurePeriodSeconds == nil { + m.LeastStoragePressurePeriodSeconds = pointer.Int64Ptr(300) + } + if m.LeastRemainAvailableStoragePercent == nil { + m.LeastRemainAvailableStoragePercent = pointer.Int64Ptr(10) + } + tac.Spec.TiKV.Metrics[id] = m + } + } } if tac.Spec.TiDB != nil { @@ -152,9 +149,6 @@ func defaultTAC(tac *v1alpha1.TidbClusterAutoScaler) { tac.Spec.TiDB.ScaleInIntervalSeconds = pointer.Int32Ptr(500) } if tac.Spec.TiDB.ExternalEndpoint == nil { - if len(tac.Spec.TiDB.Metrics) == 0 { - tac.Spec.TiDB.Metrics = append(tac.Spec.TiDB.Metrics, defaultMetricSpec) - } if tac.Spec.TiDB.MetricsTimeDuration == nil { tac.Spec.TiDB.MetricsTimeDuration = pointer.StringPtr("3m") } @@ -177,3 +171,17 @@ func genMetricsEndpoint(tac *v1alpha1.TidbClusterAutoScaler) (string, error) { } return fmt.Sprintf("http://%s-prometheus.%s.svc:9090", tac.Spec.Monitor.Name, tac.Spec.Monitor.Namespace), nil } + +func emptyStorageMetricsStatus(tac *v1alpha1.TidbClusterAutoScaler) { + for id, m := range tac.Status.TiKV.MetricsStatusList { + if m.Name == string(corev1.ResourceStorage) { + m.StoragePressure = nil + m.StoragePressureStartTime = nil + m.CapacityStorage = nil + m.AvailableStorage = nil + m.BaselineAvailableStorage = nil + tac.Status.TiKV.MetricsStatusList[id] = m + return + } + } +} diff --git a/pkg/autoscaler/autoscaler/util_test.go b/pkg/autoscaler/autoscaler/util_test.go index a0a9b8b3a6..980721d3d3 100644 --- a/pkg/autoscaler/autoscaler/util_test.go +++ b/pkg/autoscaler/autoscaler/util_test.go @@ -22,7 +22,6 @@ import ( "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/label" appsv1 "k8s.io/api/apps/v1" - 
autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" "k8s.io/utils/pointer" ) @@ -259,13 +258,12 @@ func TestDefaultTac(t *testing.T) { tac := newTidbClusterAutoScaler() tac.Spec.TiDB = nil tac.Spec.TiKV.MinReplicas = nil - tac.Spec.TiKV.Metrics = []autoscalingv2beta2.MetricSpec{} + tac.Spec.TiKV.Metrics = []v1alpha1.CustomMetric{} tac.Spec.TiKV.MetricsTimeDuration = nil tac.Spec.TiKV.ScaleOutIntervalSeconds = nil tac.Spec.TiKV.ScaleInIntervalSeconds = nil defaultTAC(tac) g.Expect(*tac.Spec.TiKV.MinReplicas).Should(Equal(int32(1))) - g.Expect(len(tac.Spec.TiKV.Metrics)).Should(Equal(1)) g.Expect(*tac.Spec.TiKV.MetricsTimeDuration).Should(Equal("3m")) g.Expect(*tac.Spec.TiKV.ScaleOutIntervalSeconds).Should(Equal(int32(300))) g.Expect(*tac.Spec.TiKV.ScaleInIntervalSeconds).Should(Equal(int32(500))) @@ -273,13 +271,12 @@ func TestDefaultTac(t *testing.T) { tac = newTidbClusterAutoScaler() tac.Spec.TiKV = nil tac.Spec.TiDB.MinReplicas = nil - tac.Spec.TiDB.Metrics = []autoscalingv2beta2.MetricSpec{} + tac.Spec.TiDB.Metrics = []v1alpha1.CustomMetric{} tac.Spec.TiDB.MetricsTimeDuration = nil tac.Spec.TiDB.ScaleOutIntervalSeconds = nil tac.Spec.TiDB.ScaleInIntervalSeconds = nil defaultTAC(tac) g.Expect(*tac.Spec.TiDB.MinReplicas).Should(Equal(int32(1))) - g.Expect(len(tac.Spec.TiDB.Metrics)).Should(Equal(1)) g.Expect(*tac.Spec.TiDB.MetricsTimeDuration).Should(Equal("3m")) g.Expect(*tac.Spec.TiDB.ScaleOutIntervalSeconds).Should(Equal(int32(300))) g.Expect(*tac.Spec.TiDB.ScaleInIntervalSeconds).Should(Equal(int32(500))) diff --git a/pkg/label/label.go b/pkg/label/label.go index 248ba11b35..e6863702c8 100644 --- a/pkg/label/label.go +++ b/pkg/label/label.go @@ -110,10 +110,6 @@ const ( AnnTiDBLastAutoScalingTimestamp = "tidb.tidb.pingcap.com/last-autoscaling-timestamp" // AnnTiKVLastAutoScalingTimestamp is annotation key of tidbclusterto which ordinal is created by tikv auto-scaling AnnTiKVLastAutoScalingTimestamp = "tikv.tidb.pingcap.com/last-autoscaling-timestamp" - - // AnnTiKVReadyToScaleTimestamp records timestamp when tikv ready to scale - AnnTiKVReadyToScaleTimestamp = "tikv.tidb.pingcap.com/ready-to-scale-timestamp" - // AnnLastSyncingTimestamp records last sync timestamp AnnLastSyncingTimestamp = "tidb.pingcap.com/last-syncing-timestamp" diff --git a/tests/e2e/tidbcluster/serial.go b/tests/e2e/tidbcluster/serial.go index 038ddddf2b..ba4b4d1f78 100644 --- a/tests/e2e/tidbcluster/serial.go +++ b/tests/e2e/tidbcluster/serial.go @@ -478,17 +478,20 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { // The cpu requests of tikv is 100m, so the threshold value would be 60*0.1*3*0.8 = 14.4 // so we would set Value as "5" for each instance so that the sum in each auto-scaling calculating would be 15 Value: "5.0", + QueryType: "cpu", InstancesPod: []string{"auto-scaling-tikv-0", "auto-scaling-tikv-1", "auto-scaling-tikv-2"}, } err = mock.SetPrometheusResponse(monitor.Name, monitor.Namespace, mp, fw) framework.ExpectNoError(err, "set tikv mock metrics error") - var defaultMetricSpec = autoscalingv2beta2.MetricSpec{ - Type: autoscalingv2beta2.ResourceMetricSourceType, - Resource: &autoscalingv2beta2.ResourceMetricSource{ - Name: corev1.ResourceCPU, - Target: autoscalingv2beta2.MetricTarget{ - AverageUtilization: pointer.Int32Ptr(80), + var defaultMetricSpec = v1alpha1.CustomMetric{ + MetricSpec: autoscalingv2beta2.MetricSpec{ + Type: autoscalingv2beta2.ResourceMetricSourceType, + Resource: &autoscalingv2beta2.ResourceMetricSource{ + Name: corev1.ResourceCPU, + Target: 
autoscalingv2beta2.MetricTarget{ + AverageUtilization: pointer.Int32Ptr(80), + }, }, }, } @@ -502,7 +505,7 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { ScaleInIntervalSeconds: pointer.Int32Ptr(100), }, } - tac.Spec.TiKV.Metrics = []autoscalingv2beta2.MetricSpec{} + tac.Spec.TiKV.Metrics = []v1alpha1.CustomMetric{} tac.Spec.TiKV.Metrics = append(tac.Spec.TiKV.Metrics, defaultMetricSpec) _, err = cli.PingcapV1alpha1().TidbClusterAutoScalers(ns).Create(tac) @@ -538,7 +541,8 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { } if stac.Status.TiKV != nil && len(stac.Status.TiKV.MetricsStatusList) > 0 { metrics := stac.Status.TiKV.MetricsStatusList[0] - framework.Logf("tikv threshold value: %v, currentValue: %v, recommended replicas: %v", metrics.ThresholdValue, metrics.CurrentValue, stac.Status.TiKV.RecommendedReplicas) + framework.Logf("tikv threshold value: %v, currentValue: %v, recommended replicas: %v", + *metrics.ThresholdValue, *metrics.CurrentValue, stac.Status.TiKV.RecommendedReplicas) } tc, err := cli.PingcapV1alpha1().TidbClusters(tc.Namespace).Get(tc.Name, metav1.GetOptions{}) @@ -614,6 +618,7 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { // The cpu requests of tikv is 100m, so the threshold value would be 60*0.1*4*0.8 = 19.2 // so we would set Value as "1" for each instance so that the sum in each auto-scaling calculating would be 4 Value: "1.0", + QueryType: "cpu", InstancesPod: []string{"auto-scaling-tikv-0", "auto-scaling-tikv-1", "auto-scaling-tikv-2", "auto-scaling-tikv-3"}, } err = mock.SetPrometheusResponse(monitor.Name, monitor.Namespace, mp, fw) @@ -627,7 +632,8 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { } if stac.Status.TiKV != nil && len(stac.Status.TiKV.MetricsStatusList) > 0 { metrics := stac.Status.TiKV.MetricsStatusList[0] - framework.Logf("tikv threshold value: %v, currentValue: %v, recommended replicas: %v", metrics.ThresholdValue, metrics.CurrentValue, stac.Status.TiKV.RecommendedReplicas) + framework.Logf("tikv threshold value: %v, currentValue: %v, recommended replicas: %v", + *metrics.ThresholdValue, *metrics.CurrentValue, stac.Status.TiKV.RecommendedReplicas) } tc, err = cli.PingcapV1alpha1().TidbClusters(tc.Namespace).Get(tc.Name, metav1.GetOptions{}) @@ -694,6 +700,7 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { // The cpu requests of tidb is 100m, so the threshold value would be 60*0.1*2*0.8 = 9.6 // so we would set Value as "5" for each instance so that the sum in each auto-scaling calculating would be 10 Value: "5.0", + QueryType: "cpu", InstancesPod: []string{"auto-scaling-tidb-0", "auto-scaling-tidb-1"}, } err = mock.SetPrometheusResponse(monitor.Name, monitor.Namespace, mp, fw) @@ -711,7 +718,7 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { ScaleInIntervalSeconds: pointer.Int32Ptr(100), }, } - tac.Spec.TiDB.Metrics = []autoscalingv2beta2.MetricSpec{} + tac.Spec.TiDB.Metrics = []v1alpha1.CustomMetric{} tac.Spec.TiDB.Metrics = append(tac.Spec.TiDB.Metrics, defaultMetricSpec) _, err = cli.PingcapV1alpha1().TidbClusterAutoScalers(ns).Update(tac) framework.ExpectNoError(err, "Update TidbMonitorClusterAutoScaler error") @@ -723,7 +730,8 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { } if stac.Status.TiDB != nil && len(stac.Status.TiDB.MetricsStatusList) > 0 { metrics := stac.Status.TiDB.MetricsStatusList[0] - framework.Logf("tidb threshold value: %v, currentValue: %v, recommended replicas: %v", metrics.ThresholdValue, metrics.CurrentValue, 
stac.Status.TiDB.RecommendedReplicas) + framework.Logf("tidb threshold value: %v, currentValue: %v, recommended replicas: %v", + *metrics.ThresholdValue, *metrics.CurrentValue, stac.Status.TiDB.RecommendedReplicas) } tc, err = cli.PingcapV1alpha1().TidbClusters(tc.Namespace).Get(tc.Name, metav1.GetOptions{}) @@ -764,6 +772,7 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { // The cpu requests of tidb is 100m, so the threshold value would be 60*0.1*2*0.8 = 9.6 // so we would set Value as "1" for each instance so that the sum in each auto-scaling calculating would be 3 Value: "1.0", + QueryType: "cpu", InstancesPod: []string{"auto-scaling-tidb-0", "auto-scaling-tidb-1", "auto-scaling-tidb-2"}, } err = mock.SetPrometheusResponse(monitor.Name, monitor.Namespace, mp, fw) @@ -777,7 +786,8 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { } if stac.Status.TiDB != nil && len(stac.Status.TiDB.MetricsStatusList) > 0 { metrics := stac.Status.TiDB.MetricsStatusList[0] - framework.Logf("tidb threshold value: %v, currentValue: %v, recommended replicas: %v", metrics.ThresholdValue, metrics.CurrentValue, stac.Status.TiDB.RecommendedReplicas) + framework.Logf("tidb threshold value: %v, currentValue: %v, recommended replicas: %v", + *metrics.ThresholdValue, *metrics.CurrentValue, stac.Status.TiDB.RecommendedReplicas) } tc, err = cli.PingcapV1alpha1().TidbClusters(tc.Namespace).Get(tc.Name, metav1.GetOptions{}) @@ -817,6 +827,101 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { framework.ExpectNoError(err, "check tidb auto-scale to 2 error") framework.Logf("success to check auto scale-in tidb to 2 replicas") + // Check Storage Auto-Scaling + framework.Logf("start to check tikv storage auto-scaling") + mp = &mock.MonitorParams{ + Name: tc.Name, + // The mock capacity size for the tikv storage + Value: fmt.Sprintf("%v", 1024*1024*1024), + QueryType: "storage", + StorageType: "capacity", + } + err = mock.SetPrometheusResponse(monitor.Name, monitor.Namespace, mp, fw) + framework.ExpectNoError(err, "set tikv capacity storage size error") + + mp = &mock.MonitorParams{ + Name: tc.Name, + // The mock available size for the tikv storage + Value: fmt.Sprintf("%v", 1024*1024), + QueryType: "storage", + StorageType: "available", + } + err = mock.SetPrometheusResponse(monitor.Name, monitor.Namespace, mp, fw) + framework.ExpectNoError(err, "set tikv available storage size error") + + tac, err = cli.PingcapV1alpha1().TidbClusterAutoScalers(ns).Get(tac.Name, metav1.GetOptions{}) + framework.ExpectNoError(err, "Get TidbClusterAutoScaler err") + tac.Spec.TiKV = &v1alpha1.TikvAutoScalerSpec{ + BasicAutoScalerSpec: v1alpha1.BasicAutoScalerSpec{ + MaxReplicas: int32(4), + Metrics: []v1alpha1.CustomMetric{ + { + MetricSpec: autoscalingv2beta2.MetricSpec{ + Type: autoscalingv2beta2.ResourceMetricSourceType, + Resource: &autoscalingv2beta2.ResourceMetricSource{ + Name: corev1.ResourceStorage, + }, + }, + LeastStoragePressurePeriodSeconds: pointer.Int64Ptr(30), + LeastRemainAvailableStoragePercent: pointer.Int64Ptr(90), + }, + }, + }, + } + tac.Spec.TiDB = nil + tac.Status.TiKV = nil + tac.Status.TiDB = nil + tacCopy := tac.DeepCopy() + err = controller.GuaranteedUpdate(genericCli, tac, func() error { + tac.Spec = tacCopy.Spec + tac.Status = tacCopy.Status + return nil + }) + framework.ExpectNoError(err, "Update TidbClusterAutoScaler error") + + // check tikv scale-out to 4 + err = wait.Poll(5*time.Second, 10*time.Minute, func() (done bool, err error) { + stac, err :=
cli.PingcapV1alpha1().TidbClusterAutoScalers(ns).Get(tac.Name, metav1.GetOptions{}) + if err != nil { + return false, nil + } + if stac.Status.TiKV != nil && len(stac.Status.TiKV.MetricsStatusList) > 0 { + metrics := stac.Status.TiKV.MetricsStatusList[0] + framework.Logf("tikv AvailableStorage:%v, BaselineAvailableStorage:%v, CapacityStorage:%v, StoragePressure:%v", + *metrics.AvailableStorage, *metrics.BaselineAvailableStorage, *metrics.CapacityStorage, *metrics.StoragePressure) + } + + tc, err = cli.PingcapV1alpha1().TidbClusters(tc.Namespace).Get(tc.Name, metav1.GetOptions{}) + if err != nil { + return false, nil + } + if tc.Spec.TiKV.Replicas != 4 { + framework.Logf("tikv hasn't auto-scaled to 4 replicas") + return false, nil + } + if len(tc.Status.TiKV.Stores) != 4 { + framework.Logf("tikv stores haven't auto-scaled to 4") + return false, nil + } + tac, err = cli.PingcapV1alpha1().TidbClusterAutoScalers(ns).Get(tac.Name, metav1.GetOptions{}) + if err != nil { + return false, nil + } + if tac.Annotations == nil || len(tac.Annotations) < 1 { + framework.Logf("tac hasn't been marked with any annotation") + return false, nil + } + _, ok := tac.Annotations[label.AnnTiKVLastAutoScalingTimestamp] + if !ok { + framework.Logf("tac has no tikv.tidb.pingcap.com/last-autoscaling-timestamp annotation") + return false, nil + } + return true, nil + }) + framework.ExpectNoError(err, "check tikv auto-scale to 4 error") + framework.Logf("success to check tikv auto scale-out to 4 replicas") + + + // Clean scaler err = cli.PingcapV1alpha1().TidbClusterAutoScalers(tac.Namespace).Delete(tac.Name, &metav1.DeleteOptions{}) framework.ExpectNoError(err, "Expect to delete auto-scaler ref") err = wait.Poll(5*time.Second, 5*time.Minute, func() (done bool, err error) { @@ -830,6 +935,7 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() { return true, nil }) framework.ExpectNoError(err, "expect auto-scaler ref empty after delete auto-scaler") + framework.Logf("success to clean auto-scaler") }) }) diff --git a/tests/pkg/mock/monitor.go b/tests/pkg/mock/monitor.go index 1ba72e69d1..fc9c6b6f9b 100644 --- a/tests/pkg/mock/monitor.go +++ b/tests/pkg/mock/monitor.go @@ -39,7 +39,8 @@ func NewMockPrometheus() MonitorInterface { mp := &mockPrometheus{ responses: map[string]string{}, } - upResp := buildPrometheusResponse(nil, "") + params := &MonitorParams{} + upResp := buildPrometheusResponse(params) b, err := json.Marshal(upResp) if err != nil { klog.Fatal(err.Error()) @@ -83,13 +84,13 @@ func (m *mockPrometheus) SetResponse(w http.ResponseWriter, r *http.Request) { return } - b, err := json.Marshal(buildPrometheusResponse(mp.InstancesPod, mp.Value)) + b, err := json.Marshal(buildPrometheusResponse(mp)) if err != nil { writeResponse(w, err.Error()) return } - m.addIntoMaps(mp.Name, mp.MemberType, mp.Duration, string(b)) + m.addIntoMaps(mp, string(b)) writeResponse(w, "ok") return } @@ -117,16 +118,30 @@ func (m *mockPrometheus) ServeTargets(w http.ResponseWriter, r *http.Request) { writeResponse(w, string(b)) } -func (m *mockPrometheus) addIntoMaps(name, memberType, duration, value string) { - key := "" - klog.Infof("name= %s , memberType = %s , duration = %s , value = %s", name, memberType, duration, value) - if memberType == "tidb" { - key = fmt.Sprintf(calculate.TidbSumCpuMetricsPattern, name, duration) - } else if memberType == "tikv" { - key = fmt.Sprintf(calculate.TikvSumCpuMetricsPattern, name, duration) +func (m *mockPrometheus) addIntoMaps(mp *MonitorParams, response string) { + currentType := mp.QueryType + if 
currentType == "cpu" { + key := "" + name := mp.Name + memberType := mp.MemberType + duration := mp.Duration + klog.Infof("name=%s, memberType =%s, duration =%s, response =%s", name, memberType, duration, response) + if memberType == "tidb" { + key = fmt.Sprintf(calculate.TidbSumCpuMetricsPattern, name, duration) + } else if memberType == "tikv" { + key = fmt.Sprintf(calculate.TikvSumCpuMetricsPattern, name, duration) + } + m.responses[fmt.Sprintf("%s", key)] = response + klog.Infof("add key: %s with value: %s", key, response) + } else if currentType == "storage" { + key := "" + cluster := mp.Name + stype := mp.StorageType + klog.Infof("cluster=%s, storageType=%s, response =%s", cluster, stype, response) + key = fmt.Sprintf(calculate.TikvSumStorageMetricsPattern, cluster, stype) + m.responses[fmt.Sprintf("%s", key)] = response + klog.Infof("add key: %s with value: %s", key, response) } - m.responses[fmt.Sprintf("%s", key)] = value - klog.Infof("add key: %s with value: %s", key, value) } func writeResponse(w http.ResponseWriter, msg string) { diff --git a/tests/pkg/mock/util.go b/tests/pkg/mock/util.go index 7a79bf4a18..952d9916f7 100644 --- a/tests/pkg/mock/util.go +++ b/tests/pkg/mock/util.go @@ -27,10 +27,12 @@ import ( type MonitorParams struct { Name string `json:"name"` - MemberType string `json:"type"` + MemberType string `json:"memberType"` Duration string `json:"duration"` Value string `json:"value"` InstancesPod []string `json:"instances"` + QueryType string `json:"queryType"` + StorageType string `json:"storageType"` } type MonitorTargets struct { @@ -84,18 +86,39 @@ func SetPrometheusResponse(monitorName, monitorNamespace string, mp *MonitorPara return nil } -func buildPrometheusResponse(instances []string, value string) *calculate.Response { +func buildPrometheusResponse(mp *MonitorParams) *calculate.Response { resp := &calculate.Response{} resp.Status = "success" resp.Data = calculate.Data{} - if instances == nil { - return resp - } - for _, instance := range instances { + cluster := mp.Name + value := mp.Value + if mp.QueryType == "cpu" { + instances := mp.InstancesPod + if instances == nil { + return resp + } + for _, instance := range instances { + r := calculate.Result{ + Metric: calculate.Metric{ + Instance: instance, + Cluster: cluster, + Job: "foo", + KubernetesNamespace: "foo", + KubernetesNode: "foo", + KubernetesPodIp: "foo", + }, + Value: []interface{}{ + value, + value, + }, + } + resp.Data.Result = append(resp.Data.Result, r) + } + } else if mp.QueryType == "storage" { + value := mp.Value r := calculate.Result{ Metric: calculate.Metric{ - Instance: instance, - Cluster: "foo", + Cluster: cluster, Job: "foo", KubernetesNamespace: "foo", KubernetesNode: "foo", @@ -107,6 +130,7 @@ func buildPrometheusResponse(instances []string, value string) *calculate.Respon }, } resp.Data.Result = append(resp.Data.Result, r) + resp.Data.Result = append(resp.Data.Result, r) } resp.Data.ResultType = "foo" return resp