Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Soft evict when hyperallocation of ext resource #200

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/crane-agent/app/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,13 @@ func Run(ctx context.Context, opts *options.Options) error {
craneInformerFactory := craneinformers.NewSharedInformerFactory(craneClient, informerSyncPeriod)
nepInformer := craneInformerFactory.Ensurance().V1alpha1().NodeQOSEnsurancePolicies()
actionInformer := craneInformerFactory.Ensurance().V1alpha1().AvoidanceActions()
tspInformer := craneInformerFactory.Prediction().V1alpha1().TimeSeriesPredictions()
nepInformer.Informer()
actionInformer.Informer()
tspInformer.Informer()

agent, err := agent.NewAgent(ctx, hostname, opts.RuntimeEndpoint, kubeClient, craneClient,
podInformer, nodeInformer, nepInformer, actionInformer, opts.Ifaces, healthCheck, opts.CollectInterval)
podInformer, nodeInformer, nepInformer, actionInformer, tspInformer, opts.NodeResourceOptions, opts.Ifaces, healthCheck, opts.CollectInterval, opts.UseBt)
nepInformer.Informer()
actionInformer.Informer()

Expand Down
22 changes: 21 additions & 1 deletion cmd/crane-agent/app/options/option.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,26 @@ type Options struct {
MaxInactivity time.Duration
// Ifaces is the network devices to collect metric
Ifaces []string
// NodeResourceOptions holds the options for the node resource manager
NodeResourceOptions NodeResourceOptions
// UseBt indicates whether bt_stat is used
UseBt bool
}

// NodeResourceOptions groups the node resource manager settings. Each field
// is bound to a command-line flag in AddFlags.
type NodeResourceOptions struct {
// Enabled switches the node resource manager on; bound to --noderesource-enabled.
Enabled bool
// CollectorNames lists the noderesource collectors; bound to
// --noderesource-collector-names.
CollectorNames []string
// ReserveCpuPercentStr is the CPU percentage of the node to reserve, kept as
// the raw flag string; bound to --reserve-cpu-percent.
// NOTE(review): the accepted format (e.g. "0.2" vs "20%") is not visible here — confirm.
ReserveCpuPercentStr string
// ReserveMemoryPercentStr is the memory percentage of the node to reserve,
// kept as the raw flag string; bound to --reserve-memory-percent.
ReserveMemoryPercentStr string
}

// NewOptions returns a pointer to a newly allocated Options value.
//
// NOTE(review): the two return statements below look like the removed and
// added lines of a rendered diff. As written only the first one is reachable,
// so NodeResourceOptions.CollectorNames is left nil rather than set to an
// empty slice — confirm which return is intended.
func NewOptions() *Options {
return &Options{}
return &Options{
NodeResourceOptions: NodeResourceOptions{
CollectorNames: []string{},
},
}
}

// Complete completes all the required options.
Expand All @@ -48,4 +63,9 @@ func (o *Options) AddFlags(flags *pflag.FlagSet) {
flags.DurationVar(&o.CollectInterval, "collect-interval", 10*time.Second, "period for the state collector to collect metrics, default: 10s")
flags.StringArrayVar(&o.Ifaces, "ifaces", []string{"eth0"}, "The network devices to collect metric, use comma to separated, default: eth0")
flags.DurationVar(&o.MaxInactivity, "max-inactivity", 5*time.Minute, "Maximum time from last recorded activity before automatic restart, default: 5min")
flags.StringSliceVar(&o.NodeResourceOptions.CollectorNames, "noderesource-collector-names", []string{}, "The collectors of noderesource.")
flags.StringVar(&o.NodeResourceOptions.ReserveCpuPercentStr, "reserve-cpu-percent", "", "reserve cpu percentage of node.")
flags.StringVar(&o.NodeResourceOptions.ReserveMemoryPercentStr, "reserve-memory-percent", "", "reserve memory percentage of node.")
flags.BoolVar(&o.NodeResourceOptions.Enabled, "noderesource-enabled", false, "Enable NodeResourceManager.")
flags.BoolVar(&o.UseBt, "bt-enabled", false, "Enable bt scheduled.")
}
12 changes: 0 additions & 12 deletions cmd/craned/app/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import (
"github.com/gocrane/crane/pkg/controller/analytics"
"github.com/gocrane/crane/pkg/controller/cnp"
"github.com/gocrane/crane/pkg/controller/ehpa"
"github.com/gocrane/crane/pkg/controller/noderesource"
"github.com/gocrane/crane/pkg/controller/recommendation"
"github.com/gocrane/crane/pkg/controller/timeseriesprediction"
"github.com/gocrane/crane/pkg/features"
Expand Down Expand Up @@ -174,7 +173,6 @@ func initializationWebhooks(mgr ctrl.Manager) {
// initializationControllers setup controllers with manager
func initializationControllers(ctx context.Context, mgr ctrl.Manager, opts *options.Options) {
autoscaling := utilfeature.DefaultFeatureGate.Enabled(features.CraneAutoscaling)
nodeResource := utilfeature.DefaultFeatureGate.Enabled(features.CraneNodeResource)
clusterNodePrediction := utilfeature.DefaultFeatureGate.Enabled(features.CraneClusterNodePrediction)
analysis := utilfeature.DefaultMutableFeatureGate.Enabled(features.CraneAnalysis)
timeseriespredict := utilfeature.DefaultFeatureGate.Enabled(features.CraneTimeSeriesPrediction)
Expand Down Expand Up @@ -304,16 +302,6 @@ func initializationControllers(ctx context.Context, mgr ctrl.Manager, opts *opti
}
}

// NodeResourceController
if nodeResource {
if err := (&noderesource.NodeResourceReconciler{
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("node-resource-controller"),
}).SetupWithManager(mgr); err != nil {
klog.Exit(err, "unable to create controller", "controller", "NodeResourceController")
}
}

// CnpController
if clusterNodePrediction {
if err := (&cnp.ClusterNodePredictionController{
Expand Down
9 changes: 9 additions & 0 deletions examples/eviction-extresource.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: ensurance.crane.io/v1alpha1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Put this file in crane/examples/ensurance

kind: AvoidanceAction
metadata:
name: eviction-extresource
labels:
app: system
spec:
coolDownSeconds: 300
description: "evict extresource"
20 changes: 20 additions & 0 deletions examples/extresource.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
apiVersion: ensurance.crane.io/v1alpha1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Put this file in crane/examples/ensurance

kind: NodeQOSEnsurancePolicy
metadata:
name: "extresource"
labels:
app: "system"
spec:
nodeQualityProbe:
timeoutSeconds: 10
nodeLocalGet:
localCacheTTLSeconds: 60
objectiveEnsurances:
- name: "ext_cpu_total_distribute"
avoidanceThreshold: 2
restoreThreshold: 2
actionName: "eviction-extresource"
strategy: "None"
metricRule:
name: "ext_cpu_total_distribute"
value: 99
25 changes: 25 additions & 0 deletions examples/noderesourcemanager-tsp-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Template for the TimeSeriesPrediction spec that crane-agent creates for its
# node. The agent reads data.spec from the ConfigMap "noderesource-tsp-template"
# in namespace "default" and replaces every "{{nodename}}" placeholder with the
# node name before decoding the result as a TimeSeriesPredictionSpec.
apiVersion: v1
data:
spec: |
predictionMetrics:
- algorithm:
algorithmType: dsp
dsp:
estimators:
fft:
- highFrequencyThreshold: "0.05"
lowAmplitudeThreshold: "1.0"
marginFraction: "0.2"
maxNumOfSpectrumItems: 20
minNumOfSpectrumItems: 10
historyLength: 3d
sampleInterval: 60s
resourceIdentifier: cpu
type: ExpressionQuery
expressionQuery:
expression: 'node_cpu_can_be_reused_seconds{node=~"({{nodename}})(:\\d+)?"}'
predictionWindowSeconds: 180
kind: ConfigMap
metadata:
name: noderesource-tsp-template
namespace: default
7 changes: 5 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,21 @@ require (
github.com/stretchr/testify v1.7.0
golang.org/x/net v0.0.0-20211216030914-fe4d6282115f
google.golang.org/grpc v1.43.0
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.22.3
k8s.io/apimachinery v0.22.3
k8s.io/apiserver v0.22.3
k8s.io/autoscaler/vertical-pod-autoscaler v0.10.0
k8s.io/client-go v0.22.3
k8s.io/component-base v0.22.3
k8s.io/cri-api v0.22.3
k8s.io/klog v0.3.0
k8s.io/klog/v2 v2.9.0
k8s.io/kubelet v0.0.0
k8s.io/kubernetes v1.22.3
k8s.io/metrics v0.22.3
sigs.k8s.io/controller-runtime v0.10.2
sigs.k8s.io/custom-metrics-apiserver v1.22.0
sigs.k8s.io/yaml v1.2.0
)

require (
Expand Down Expand Up @@ -147,12 +149,13 @@ require (
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect
gopkg.in/warnings.v0 v0.1.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
k8s.io/apiextensions-apiserver v0.22.2 // indirect
k8s.io/kube-controller-manager v0.22.3 // indirect
k8s.io/utils v0.0.0-20210819203725-bdf08cb9a70a // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.22 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // indirect
sigs.k8s.io/yaml v1.2.0 // indirect
)

require (
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,6 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.1.0-rc.5 h1:QOAag7FoBaBYYHRqzqkhhd8fq5RTubvI4v3Ft/gDVVQ=
github.com/gobwas/ws v1.1.0-rc.5/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0=
github.com/gocrane/api v0.2.1-0.20220307082411-6171d03c2dc5 h1:Q8ZYSeMCoz8VLor5nFZ8BIVuKVm+KgnfSMOr+XTLyOU=
github.com/gocrane/api v0.2.1-0.20220307082411-6171d03c2dc5/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk=
github.com/gocrane/api v0.2.1-0.20220309033244-699efd59d009 h1:xd175jH+TT03ea52N4187vD5uoZmHQUAvMWTUPOIj2Y=
github.com/gocrane/api v0.2.1-0.20220309033244-699efd59d009/go.mod h1:GxI+t9AW8+NsHkz2JkPBIJN//9eLUjTZl1ScYAbXMbk=
github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
Expand Down Expand Up @@ -1278,13 +1276,15 @@ k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
k8s.io/klog/v2 v2.9.0 h1:D7HV+n1V57XeZ0m6tdRkfknthUaM06VFbWldOFh8kzM=
k8s.io/klog/v2 v2.9.0/go.mod h1:hy9LJ/NvuK+iVyP4Ehqva4HxZG/oXyIS3n3Jmire4Ec=
k8s.io/kube-aggregator v0.22.3/go.mod h1:TIpLq1HvR/S4y75i3y+4q9ik3ZvgyaDz72CBfDS0A6E=
k8s.io/kube-controller-manager v0.22.3 h1:DatYcgMKAn28e2A7MiMULoRoft3SaCV/qVk+FoGTUw0=
k8s.io/kube-controller-manager v0.22.3/go.mod h1:7biFk6Azf7xD+pzTScw7X9M5vGScqYp4J4wOT61QL1s=
k8s.io/kube-openapi v0.0.0-20210421082810-95288971da7e/go.mod h1:vHXdDvt9+2spS2Rx9ql3I8tycm3H9FDfdUoIuKCefvw=
k8s.io/kube-openapi v0.0.0-20210817084001-7fbd8d59e5b8 h1:Xxl9TLJ30BJ1pGWfGZnqbpww2rwOt3RAzbSz+omQGtg=
k8s.io/kube-openapi v0.0.0-20210817084001-7fbd8d59e5b8/go.mod h1:foAE7XkrXQ1Qo2eWsW/iWksptrVdbl6t+vscSdmmGjk=
k8s.io/kube-proxy v0.22.3/go.mod h1:9ta1U8GKKo6by981sN/L6MhFJzPWxMdfh7plVPH1I2s=
k8s.io/kube-scheduler v0.22.3/go.mod h1:jVLHSttd8cSejBLOeiWE+g8etA6XdOBGiR8tI577OhU=
k8s.io/kubectl v0.22.3/go.mod h1:gcpQHPOx+Jke9Og6Li7YxR/ZuaOtFUeJw7xHH617tHs=
k8s.io/kubelet v0.22.3 h1:C21Kg66Zzvc21uJITEPg4stGMcSZsR1JB+7+6Uwm8zs=
k8s.io/kubelet v0.22.3/go.mod h1:9nUZNGUigU2uAIm7kgf8BsvYDI9KjIE5nt9+yI1+p7w=
k8s.io/kubernetes v1.22.3 h1:/eFfR5S2Vxn0t9kcLVAZXQFloKMkklWQIf5e0hFbzlA=
k8s.io/kubernetes v1.22.3/go.mod h1:Snea7fgIObGgHmLbUJ3OgjGEr5bjj16iEdp5oHS6eS8=
Expand Down
96 changes: 94 additions & 2 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,23 @@ package agent

import (
"context"
"fmt"
"net/http"
"strings"
"time"

"github.com/gocrane/crane/pkg/ensurance/cm"
"github.com/gocrane/crane/pkg/noderesource"
"github.com/gocrane/crane/pkg/utils"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

"github.com/gocrane/crane/pkg/metrics"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/apiserver/pkg/server/mux"
"k8s.io/apiserver/pkg/server/routes"
coreinformers "k8s.io/client-go/informers/core/v1"
Expand All @@ -19,6 +30,8 @@ import (
ensuranceapi "github.com/gocrane/api/ensurance/v1alpha1"
craneclientset "github.com/gocrane/api/pkg/generated/clientset/versioned"
"github.com/gocrane/api/pkg/generated/informers/externalversions/ensurance/v1alpha1"
predictionv1alpha1 "github.com/gocrane/api/pkg/generated/informers/externalversions/prediction/v1alpha1"
v1alpha12 "github.com/gocrane/api/prediction/v1alpha1"
"github.com/gocrane/crane/cmd/crane-agent/app/options"
"github.com/gocrane/crane/pkg/ensurance/analyzer"
"github.com/gocrane/crane/pkg/ensurance/collector"
Expand All @@ -32,6 +45,7 @@ type Agent struct {
kubeClient kubernetes.Interface
craneClient craneclientset.Interface
managers []manager.Manager
host string
}

func NewAgent(ctx context.Context,
Expand All @@ -42,13 +56,26 @@ func NewAgent(ctx context.Context,
nodeInformer coreinformers.NodeInformer,
nepInformer v1alpha1.NodeQOSEnsurancePolicyInformer,
actionInformer v1alpha1.AvoidanceActionInformer,
timeSeriesPredictionInformer predictionv1alpha1.TimeSeriesPredictionInformer,
nodeResourceOptions options.NodeResourceOptions,
ifaces []string,
healthCheck *metrics.HealthCheck,
CollectInterval time.Duration,
useBt bool,
) (*Agent, error) {
var managers []manager.Manager
var noticeCh = make(chan executor.AvoidanceExecutor)

agent := &Agent{
ctx: ctx,
name: getAgentName(nodeName),
host: nodeName,
kubeClient: kubeClient,
craneClient: craneClient,
}
cadvisorManager, err := utils.NewCadvisorManager()
if err != nil {
return nil, err
}
utilruntime.Must(ensuranceapi.AddToScheme(scheme.Scheme))

stateCollector := collector.NewStateCollector(nodeName, nepInformer.Lister(), podInformer.Lister(), nodeInformer.Lister(), ifaces, healthCheck, CollectInterval)
Expand All @@ -57,6 +84,12 @@ func NewAgent(ctx context.Context,
managers = append(managers, analyzerManager)
avoidanceManager := executor.NewActionExecutor(kubeClient, nodeName, podInformer, nodeInformer, noticeCh, runtimeEndpoint)
managers = append(managers, avoidanceManager)
cpuManager := cm.NewAdvancedCpuManager(kubeClient, nodeName, podInformer, nodeInformer, runtimeEndpoint, cadvisorManager)
managers = append(managers, cpuManager)
if nodeResourceOptions.Enabled {
nodeResourceManager := noderesource.NewNodeResource(nodeName, kubeClient, craneClient, nodeInformer, timeSeriesPredictionInformer, nodeResourceOptions.ReserveCpuPercentStr, nodeResourceOptions.ReserveMemoryPercentStr, nodeResourceOptions.CollectorNames, utils.NewCpuStateProvider(cadvisorManager, podInformer.Lister(), useBt, cpuManager.GetExclusiveCpu), agent.CreateNodeResourceTsp())
managers = append(managers, nodeResourceManager)
}

return &Agent{
ctx: ctx,
Expand Down Expand Up @@ -95,5 +128,64 @@ func (a *Agent) Run(healthCheck *metrics.HealthCheck, opts *options.Options) {
}

// getAgentName returns a name for this agent instance: the node name, a
// separator, and a newly generated UUID.
//
// NOTE(review): the two return statements below look like the removed ("_")
// and added (".") lines of a rendered diff. As written only the first one is
// reachable — confirm which separator is intended. The result is embedded in
// the TimeSeriesPrediction object name by GenerateNodeResourceTspName, so the
// separator presumably has to be valid in a Kubernetes object name.
func getAgentName(nodeName string) string {
return nodeName + "_" + string(uuid.NewUUID())
return nodeName + "." + string(uuid.NewUUID())
}

// CreateNodeResourceTsp creates the TimeSeriesPrediction (TSP) object used by
// the node resource manager for this agent's node and returns its name.
//
// Steps:
//  1. If a TSP with the generated name already exists in namespace "default",
//     it is deleted first. If that delete fails, the error is logged and the
//     name is returned without creating a new object.
//  2. The spec template is read from the "spec" key of the ConfigMap
//     "noderesource-tsp-template" in namespace "default"; every "{{nodename}}"
//     placeholder is replaced with the node name and the result is decoded
//     into a TimeSeriesPredictionSpec.
//  3. The node object is fetched so the TSP can target it and be owned by it.
//  4. The TSP is created through the crane client.
//
// Any failure in steps 2-4 terminates the process through klog.Exitf, except a
// failure to set the owner reference, which is only logged.
func (a *Agent) CreateNodeResourceTsp() string {
	const (
		tspNamespace      = "default"
		templateConfigMap = "noderesource-tsp-template"
		templateSpecKey   = "spec"
	)

	tspName := a.GenerateNodeResourceTspName()
	tspClient := a.craneClient.PredictionV1alpha1().TimeSeriesPredictions(tspNamespace)

	// Any Get error (not only "not found") is treated as "no old object";
	// a genuine API failure will surface again at Create below.
	if _, err := tspClient.Get(context.TODO(), tspName, metav1.GetOptions{}); err == nil {
		klog.V(4).Infof("Found old tsp %s in namespace default", tspName)
		if err := a.DeleteNodeResourceTsp(); err != nil {
			klog.Errorf("Delete old tsp %s with error: %v", tspName, err)
			return tspName
		}
	}

	config, err := a.kubeClient.CoreV1().ConfigMaps(tspNamespace).Get(context.TODO(), templateConfigMap, metav1.GetOptions{})
	if err != nil {
		klog.Exitf("Get noderesource tsp configmap noderesource-tsp-template with error: %v", err)
	}
	if config == nil {
		klog.Exitf("Can't get noderesource tsp configmap noderesource-tsp-template")
	}

	// An absent or empty template would decode into an empty spec without any
	// error, producing a useless TSP; fail loudly instead.
	rawSpec := config.Data[templateSpecKey]
	if rawSpec == "" {
		klog.Exitf("Noderesource tsp configmap noderesource-tsp-template has no %q data", templateSpecKey)
	}

	spec := v1alpha12.TimeSeriesPredictionSpec{}
	if err := yaml.Unmarshal([]byte(strings.ReplaceAll(rawSpec, "{{nodename}}", a.host)), &spec); err != nil {
		klog.Exitf("Convert spec template error: %v", err)
	}

	// The node is both the prediction target and the owner of the TSP, so a
	// failed lookup must not be ignored.
	node, err := a.kubeClient.CoreV1().Nodes().Get(context.TODO(), a.host, metav1.GetOptions{})
	if err != nil {
		klog.Exitf("Get node %s with error: %v", a.host, err)
	}
	if node == nil {
		klog.Exitf("Can't get node %s", a.host)
	}

	gvk, err := apiutil.GVKForObject(node, scheme.Scheme)
	if err != nil {
		klog.Exitf("Get group version kind of node %s with error: %v", a.host, err)
	}
	spec.TargetRef = v1.ObjectReference{
		Kind:       gvk.Kind,
		APIVersion: gvk.GroupVersion().String(),
		Name:       a.host,
	}

	tsp := &v1alpha12.TimeSeriesPrediction{}
	tsp.Name = tspName
	tsp.Namespace = tspNamespace
	tsp.Spec = spec

	// Best effort, as before: without the owner reference the TSP is still
	// usable, it just will not be tied to the node's lifecycle.
	if err := controllerutil.SetControllerReference(node, tsp, scheme.Scheme); err != nil {
		klog.Errorf("Set owner reference of tsp %s with error: %v", tspName, err)
	}

	if _, err := tspClient.Create(context.TODO(), tsp, metav1.CreateOptions{}); err != nil {
		klog.Exitf("Create noderesource tsp %s with error: %v", tspName, err)
	}
	return tspName
}

// DeleteNodeResourceTsp deletes this agent's TimeSeriesPrediction object from
// namespace "default" through the crane client. It returns the error reported
// by the Delete call, or nil on success.
func (a *Agent) DeleteNodeResourceTsp() error {
	predictions := a.craneClient.PredictionV1alpha1().TimeSeriesPredictions("default")
	tspName := a.GenerateNodeResourceTspName()
	return predictions.Delete(context.TODO(), tspName, metav1.DeleteOptions{})
}

// GenerateNodeResourceTspName returns the name of the TimeSeriesPrediction
// object for this agent: the fixed prefix "noderesource-" followed by the
// agent's name.
func (a *Agent) GenerateNodeResourceTspName() string {
	tspName := fmt.Sprintf("noderesource-%s", a.name)
	return tspName
}
Loading