Skip to content

Commit

Permalink
watermark of memory usage percent
Browse files Browse the repository at this point in the history
  • Loading branch information
kaiyuechen committed Sep 15, 2022
1 parent 0ebcef7 commit 7ef2bdf
Show file tree
Hide file tree
Showing 10 changed files with 181 additions and 2 deletions.
13 changes: 13 additions & 0 deletions examples/ensurance/evict-on-mem-usage-percent/elastic-pod-qos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
apiVersion: ensurance.crane.io/v1alpha1
kind: PodQOS
metadata:
name: all-elastic-pods
spec:
allowedActions:
- eviction
resourceQOS:
cpuQOS:
cpuPriority: 7
labelSelector:
matchLabels:
preemptible_job: "true"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: ensurance.crane.io/v1alpha1
kind: AvoidanceAction
metadata:
name: eviction
spec:
coolDownSeconds: 300
description: evict low priority pods
eviction:
terminationGracePeriodSeconds: 30
56 changes: 56 additions & 0 deletions examples/ensurance/evict-on-mem-usage-percent/pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
apiVersion: v1
kind: Pod
metadata:
name: low-pi6
labels:
k8s-app: low
preemptible_job: "true"
spec:
containers:
- image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
command:
- stress-ng
- --vm-hang
- "3600"
- --vm
- "2"
- --vm-bytes
- "2G"
name: stress
volumeMounts:
- mountPath: /data
name: data
volumes:
- hostPath:
path: /data/dd
type: DirectoryOrCreate
name: data
---
apiVersion: v1
kind: Pod
metadata:
name: low-pi2
labels:
k8s-app: low
preemptible_job: "true"
spec:
containers:
- image: ccr.ccs.tencentyun.com/tkeimages/stress-ng:v0.12.09
command:
- stress-ng
- --vm-hang
- "3600"
- --vm
- "2"
- --vm-bytes
- "3.5G"
name: stress
volumeMounts:
- mountPath: /data
name: data
volumes:
- hostPath:
path: /data/dd
type: DirectoryOrCreate
name: data

18 changes: 18 additions & 0 deletions examples/ensurance/evict-on-mem-usage-percent/watermark.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
apiVersion: ensurance.crane.io/v1alpha1
kind: NodeQOS
metadata:
name: eviction-on-high-mem-usage-percent
spec:
nodeQualityProbe:
nodeLocalGet:
localCacheTTLSeconds: 60
timeoutSeconds: 10
rules:
- actionName: eviction
avoidanceThreshold: 2
metricRule:
name: memory_total_utilization
value: 50
name: mem-usage-percent
restoreThreshold: 2
strategy: None
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: ensurance.crane.io/v1alpha1
kind: NodeQOS
metadata:
name: eviction-on-high-cpu-usage-percent
name: eviction-on-high-mem-usage
spec:
nodeQualityProbe:
nodeLocalGet:
Expand Down
1 change: 1 addition & 0 deletions pkg/ensurance/collector/nodelocal/memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ func collectMemory(_ *nodeLocalContext) (map[string][]common.TimeSeries, error)
var data = make(map[string][]common.TimeSeries, 2)
data[string(types.MetricNameMemoryTotalUsage)] = []common.TimeSeries{{Samples: []common.Sample{{Value: float64(usage), Timestamp: now.Unix()}}}}
data[string(types.MetricNameMemoryTotalUtilization)] = []common.TimeSeries{{Samples: []common.Sample{{Value: usagePercent, Timestamp: now.Unix()}}}}
data[string(types.MetricNameMemoryTotal)] = []common.TimeSeries{{Samples: []common.Sample{{Value: float64(stat.Total), Timestamp: now.Unix()}}}}

return data, nil
}
1 change: 1 addition & 0 deletions pkg/ensurance/collector/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ const (

MetricNameMemoryTotalUsage MetricName = "memory_total_usage"
MetricNameMemoryTotalUtilization MetricName = "memory_total_utilization"
MetricNameMemoryTotal MetricName = "memory_total"

MetricDiskReadKiBPS MetricName = "disk_read_kibps"
MetricDiskWriteKiBPS MetricName = "disk_write_kibps"
Expand Down
70 changes: 70 additions & 0 deletions pkg/ensurance/executor/mem_usage_percent.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package executor

import (
"sync"

"k8s.io/klog/v2"

"github.com/gocrane/crane/pkg/ensurance/executor/podinfo"
"github.com/gocrane/crane/pkg/ensurance/executor/sort"
"github.com/gocrane/crane/pkg/metrics"
"github.com/gocrane/crane/pkg/utils"
)

// init registers the memUsagePercent metric descriptor through
// registerMetricMap when the package is loaded.
func init() {
registerMetricMap(memUsagePercent)
}

// memUsagePercent is the metric descriptor for the MemUsagePercent watermark.
// It is sortable and evictable (with quantified eviction), and it declares no
// throttle or restore handlers.
var memUsagePercent = metric{
	Name:           MemUsagePercent,
	ActionPriority: 5,

	// Ordering of candidate pods.
	Sortable: true,
	SortFunc: sort.MemUsageSort,

	// Eviction is supported and quantified.
	Evictable:       true,
	EvictQuantified: true,
	EvictFunc:       memUsagePercentEvictPod,

	// Throttling is not supported for this metric.
	Throttleable:       false,
	ThrottleQuantified: false,
	ThrottleFunc:       nil,
	RestoreFunc:        nil,
}

// memUsagePercentEvictPod starts an asynchronous eviction of EvictPods[index]
// and accounts for the resources that eviction is expected to release.
//
// The released amount is computed by releaseMemUsagePercent and added to
// totalReleasedResource before the eviction is started. The eviction itself
// runs in a goroutine that is registered on wg (Add before start, Done on
// exit), so the caller can wait on wg for completion.
//
// Because this function returns before the goroutine has necessarily run,
// eviction failures cannot be reported reliably through errPodKeys: writing
// to the named result from the goroutine raced with the return and the update
// was normally lost to the caller. Failures are therefore logged from the
// goroutine instead, and errPodKeys is always returned as nil.
func memUsagePercentEvictPod(wg *sync.WaitGroup, ctx *ExecuteContext, index int, totalReleasedResource *ReleaseResource, EvictPods EvictPods) (errPodKeys []string, released ReleaseResource) {
	wg.Add(1)

	// Calculate release resources
	released = releaseMemUsagePercent(EvictPods[index])
	totalReleasedResource.Add(released)

	go func(evictPod podinfo.PodContext) {
		defer wg.Done()

		pod, err := ctx.PodLister.Pods(evictPod.Key.Namespace).Get(evictPod.Key.Name)
		if err != nil {
			// Previously this failure was dropped without any trace.
			klog.Warningf("Failed to get pod %s for eviction: %v", evictPod.Key.String(), err)
			return
		}

		klog.Warningf("Evicting pod %v", evictPod.Key)
		if err = utils.EvictPodWithGracePeriod(ctx.Client, pod, evictPod.DeletionGracePeriodSeconds); err != nil {
			klog.Warningf("Failed to evict pod %s: %v", evictPod.Key.String(), err)
			return
		}
		metrics.ExecutorEvictCountsInc()

		klog.Warningf("Pod %s is evicted", klog.KObj(pod))
	}(EvictPods[index])

	return errPodKeys, released
}

// releaseMemUsagePercent reports the resources released by acting on pod.
// For a pod whose action type is podinfo.Evict it returns a ReleaseResource
// whose MemUsagePercent entry is the pod's PodMemUsage; for any other action
// type it returns an empty ReleaseResource.
func releaseMemUsagePercent(pod podinfo.PodContext) ReleaseResource {
	if pod.ActionType != podinfo.Evict {
		return ReleaseResource{}
	}
	return ReleaseResource{MemUsagePercent: pod.PodMemUsage}
}
13 changes: 12 additions & 1 deletion pkg/ensurance/executor/watermark.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const (
CpuUsage = WatermarkMetric(types.MetricNameCpuTotalUsage)
CpuUsagePercent = WatermarkMetric(types.MetricNameCpuTotalUtilization)
MemUsage = WatermarkMetric(types.MetricNameMemoryTotalUsage)
MemUsagePercent = WatermarkMetric(types.MetricNameMemoryTotalUtilization)
)

const (
Expand Down Expand Up @@ -171,8 +172,18 @@ func calculateGaps(stateMap map[string][]common.TimeSeries,
} else {
cpuPercentToUsage := (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value())) * cpuCoreNums[0].Samples[0].Value * 1000 / types.MaxPercentage
result[m.Name] = cpuPercentToUsage
klog.V(6).Infof("maxUsed is %f, watermark is %f, cpuPercentToUsageGap is %f", maxUsed, float64(evictWatermark.PopSmallest().Value()), cpuPercentToUsage)
klog.V(6).Infof("cpuPercent maxUsed is %f, watermark is %f, cpuPercentToUsageGap is %f", maxUsed, float64(evictWatermark.PopSmallest().Value()), cpuPercentToUsage)
}
} else if m.Name == MemUsagePercent {
totalMem, ok := stateMap[string(types.MetricNameMemoryTotal)]
if !ok {
klog.Warningf("Can't get MetricNameMemoryTotal")
} else {
memPercentToUsage := (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value())) * totalMem[0].Samples[0].Value / types.MaxPercentage
result[m.Name] = memPercentToUsage
klog.V(6).Infof("memPercent maxUsed is %f, watermark is %f, memPercentToUsageGap is %f", maxUsed, float64(evictWatermark.PopSmallest().Value()), memPercentToUsage)
}

} else {
result[m.Name] = (1 + executeExcessPercent) * (maxUsed - float64(evictWatermark.PopSmallest().Value()))
}
Expand Down

0 comments on commit 7ef2bdf

Please sign in to comment.