Specify CPU Request for the SDK Server Sidecar

This provides the mechanism (and defaults) for being able to set both the CPU request, and CPU limits for the SDK Server `GameServer` sidecar. I've only set the Request level, as it seems that the major issue is not CPU usage, but actually how the scheduler allots space for the sidecar (by default 100/0.1 vCPU is alloted to each container. After discussion, the CPU request has been set to 30m, but is also configurable via the helm chart. I've not set a CPU limit, as I found when setting a low (<= 20m) CPU limit on the sidecar it mostly stopped working. But if people want to experiment with this, it is also configurable via the Helm chart. Closes #344
googleforgames · Oct 19, 2018 · c6f714a · c6f714a
1 parent 1923a41
commit c6f714a
Show file tree

Hide file tree

Showing 12 changed files with 162 additions and 60 deletions.
diff --git a/README.md b/README.md
@@ -71,6 +71,7 @@ Documentation and usage guides on how to develop and host dedicated game servers
 
 ### Advanced
 - [Scheduling and Autoscaling](./docs/scheduling_autoscaling.md)
+- [Limiting CPU/Memory](./docs/limiting_resources.md)
 
 ## Get involved
 

diff --git a/cmd/controller/main.go b/cmd/controller/main.go
@@ -40,20 +40,23 @@ import (
 	"github.com/spf13/pflag"
 	"github.com/spf13/viper"
 	extclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 )
 
 const (
-	sidecarFlag     = "sidecar"
-	pullSidecarFlag = "always-pull-sidecar"
-	minPortFlag     = "min-port"
-	maxPortFlag     = "max-port"
-	certFileFlag    = "cert-file"
-	keyFileFlag     = "key-file"
-	workers         = 2
-	defaultResync   = 30 * time.Second
+	sidecarImageFlag      = "sidecar-image"
+	sidecarCPURequestFlag = "sidecar-cpu-request"
+	sidecarCPULimitFlag   = "sidecar-cpu-limit"
+	pullSidecarFlag       = "always-pull-sidecar"
+	minPortFlag           = "min-port"
+	maxPortFlag           = "max-port"
+	certFileFlag          = "cert-file"
+	keyFileFlag           = "key-file"
+	workers               = 2
+	defaultResync         = 30 * time.Second
 )
 
 var (
@@ -63,6 +66,9 @@ var (
 // main starts the operator for the gameserver CRD
 func main() {
 	ctlConf := parseEnvFlags()
+	logger.WithField("version", pkg.Version).
+		WithField("ctlConf", ctlConf).Info("starting gameServer operator...")
+
 	if err := ctlConf.validate(); err != nil {
 		logger.WithError(err).Fatal("Could not create controller from environment or flags")
 	}
@@ -88,14 +94,15 @@ func main() {
 	}
 
 	health := healthcheck.NewHandler()
-	wh := webhooks.NewWebHook(ctlConf.certFile, ctlConf.keyFile)
+	wh := webhooks.NewWebHook(ctlConf.CertFile, ctlConf.KeyFile)
 	agonesInformerFactory := externalversions.NewSharedInformerFactory(agonesClient, defaultResync)
 	kubeInformationFactory := informers.NewSharedInformerFactory(kubeClient, defaultResync)
 
 	allocationMutex := &sync.Mutex{}
 
 	gsController := gameservers.NewController(wh, health, allocationMutex,
-		ctlConf.minPort, ctlConf.maxPort, ctlConf.sidecarImage, ctlConf.alwaysPullSidecar,
+		ctlConf.MinPort, ctlConf.MaxPort, ctlConf.SidecarImage, ctlConf.AlwaysPullSidecar,
+		ctlConf.SidecarCPURequest, ctlConf.SidecarCPULimit,
 		kubeClient, kubeInformationFactory, extClient, agonesClient, agonesInformerFactory)
 	gsSetController := gameserversets.NewController(wh, health, allocationMutex,
 		kubeClient, extClient, agonesClient, agonesInformerFactory)
@@ -132,12 +139,16 @@ func parseEnvFlags() config {
 	}
 
 	base := filepath.Dir(exec)
-	viper.SetDefault(sidecarFlag, "gcr.io/agones-images/agones-sdk:"+pkg.Version)
+	viper.SetDefault(sidecarImageFlag, "gcr.io/agones-images/agones-sdk:"+pkg.Version)
+	viper.SetDefault(sidecarCPURequestFlag, "0")
+	viper.SetDefault(sidecarCPULimitFlag, "0")
 	viper.SetDefault(pullSidecarFlag, false)
 	viper.SetDefault(certFileFlag, filepath.Join(base, "certs/server.crt"))
 	viper.SetDefault(keyFileFlag, filepath.Join(base, "certs/server.key"))
 
-	pflag.String(sidecarFlag, viper.GetString(sidecarFlag), "Flag to overwrite the GameServer sidecar image that is used. Can also use SIDECAR env variable")
+	pflag.String(sidecarImageFlag, viper.GetString(sidecarImageFlag), "Flag to overwrite the GameServer sidecar image that is used. Can also use SIDECAR env variable")
+	pflag.String(sidecarCPULimitFlag, viper.GetString(sidecarCPULimitFlag), "Flag to overwrite the GameServer sidecar container's cpu limit. Can also use SIDECAR_CPU_LIMIT env variable")
+	pflag.String(sidecarCPURequestFlag, viper.GetString(sidecarCPURequestFlag), "Flag to overwrite the GameServer sidecar container's cpu request. Can also use SIDECAR_CPU_REQUEST env variable")
 	pflag.Bool(pullSidecarFlag, viper.GetBool(pullSidecarFlag), "For development purposes, set the sidecar image to have a ImagePullPolicy of Always. Can also use ALWAYS_PULL_SIDECAR env variable")
 	pflag.Int32(minPortFlag, 0, "Required. The minimum port that that a GameServer can be allocated to. Can also use MIN_PORT env variable.")
 	pflag.Int32(maxPortFlag, 0, "Required. The maximum port that that a GameServer can be allocated to. Can also use MAX_PORT env variable")
@@ -146,55 +157,56 @@ func parseEnvFlags() config {
 	pflag.Parse()
 
 	viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
-	runtime.Must(viper.BindEnv(sidecarFlag))
+	runtime.Must(viper.BindEnv(sidecarImageFlag))
+	runtime.Must(viper.BindEnv(sidecarCPULimitFlag))
+	runtime.Must(viper.BindEnv(sidecarCPURequestFlag))
 	runtime.Must(viper.BindEnv(pullSidecarFlag))
 	runtime.Must(viper.BindEnv(minPortFlag))
 	runtime.Must(viper.BindEnv(maxPortFlag))
 	runtime.Must(viper.BindEnv(keyFileFlag))
 	runtime.Must(viper.BindEnv(certFileFlag))
 	runtime.Must(viper.BindPFlags(pflag.CommandLine))
 
-	minPort := int32(viper.GetInt64(minPortFlag))
-	maxPort := int32(viper.GetInt64(maxPortFlag))
-	sidecarImage := viper.GetString(sidecarFlag)
-	alwaysPullSidecar := viper.GetBool(pullSidecarFlag)
-	keyFile := viper.GetString(keyFileFlag)
-	certFile := viper.GetString(certFileFlag)
-
-	logger.WithField(sidecarFlag, sidecarImage).
-		WithField("minPort", minPort).
-		WithField("maxPort", maxPort).
-		WithField(keyFileFlag, keyFile).
-		WithField(certFileFlag, certFile).
-		WithField("alwaysPullSidecarImage", alwaysPullSidecar).
-		WithField("Version", pkg.Version).Info("starting gameServer operator...")
+	request, err := resource.ParseQuantity(viper.GetString(sidecarCPURequestFlag))
+	if err != nil {
+		logger.WithError(err).Fatalf("could not parse %s", sidecarCPURequestFlag)
+	}
+
+	limit, err := resource.ParseQuantity(viper.GetString(sidecarCPULimitFlag))
+	if err != nil {
+		logger.WithError(err).Fatalf("could not parse %s", sidecarCPULimitFlag)
+	}
 
 	return config{
-		minPort:           minPort,
-		maxPort:           maxPort,
-		sidecarImage:      sidecarImage,
-		alwaysPullSidecar: alwaysPullSidecar,
-		keyFile:           keyFile,
-		certFile:          certFile,
+		MinPort:           int32(viper.GetInt64(minPortFlag)),
+		MaxPort:           int32(viper.GetInt64(maxPortFlag)),
+		SidecarImage:      viper.GetString(sidecarImageFlag),
+		SidecarCPURequest: request,
+		SidecarCPULimit:   limit,
+		AlwaysPullSidecar: viper.GetBool(pullSidecarFlag),
+		KeyFile:           viper.GetString(keyFileFlag),
+		CertFile:          viper.GetString(certFileFlag),
 	}
 }
 
 // config stores all required configuration to create a game server controller.
 type config struct {
-	minPort           int32
-	maxPort           int32
-	sidecarImage      string
-	alwaysPullSidecar bool
-	keyFile           string
-	certFile          string
+	MinPort           int32
+	MaxPort           int32
+	SidecarImage      string
+	SidecarCPURequest resource.Quantity
+	SidecarCPULimit   resource.Quantity
+	AlwaysPullSidecar bool
+	KeyFile           string
+	CertFile          string
 }
 
 // validate ensures the ctlConfig data is valid.
 func (c config) validate() error {
-	if c.minPort <= 0 || c.maxPort <= 0 {
+	if c.MinPort <= 0 || c.MaxPort <= 0 {
 		return errors.New("min Port and Max Port values are required")
 	}
-	if c.maxPort < c.minPort {
+	if c.MaxPort < c.MinPort {
 		return errors.New("max Port cannot be set less that the Min Port")
 	}
 	return nil

diff --git a/docs/limiting_resources.md b/docs/limiting_resources.md
@@ -0,0 +1,57 @@
+# Limiting CPU & Memory
+
+Kubernetes natively has inbuilt capabilities for requesting and limiting both CPU and Memory usage of running containers.
+
+As a short description:
+
+- CPU `Requests` are limits that are applied when there is CPU congestion, and as such can burst above their set limits.
+- CPU `Limits` are hard limits on how much CPU time the particular container gets access to.
+
+This is useful for game servers, not just as a mechanism to distribute compute resources evenly, but also as a way
+to advice the Kubernetes scheduler how many game server processes it is able to fit into a given node in the cluster.
+
+It's worth reading the [Managing Compute Resources for Containers](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/)
+Kubernetes documentation for more details on "requests" and "limits" to both CPU and Memory, and how to configure them.
+
+## GameServers
+
+Since the `GameServer` specification provides a full [`PodSpecTemplate`](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.10/#podtemplatespec-v1-core),
+we can take advantage of both resource limits and requests in our `GameServer` configurations. 
+
+For example, to set a CPU limit on our `GameServer` configuration of 250m/0.25 of a cpu, 
+we could do so as followed:
+
+```yaml
+apiVersion: "stable.agones.dev/v1alpha1"
+kind: GameServer
+metadata:
+  name: "simple-udp"
+spec:
+  ports:
+  - name: default
+    portPolicy: "dynamic"
+    containerPort: 7654
+  template:
+    spec:
+      containers:
+      - name: simple-udp
+        image: gcr.io/agones-images/udp-server:0.4
+        resources:
+          limit:
+            cpu: "250m" #this is our limit here
+```
+
+If you do not set a limit or request, the default is set my Kubernetes at a 100m CPU request. 
+
+## SDK GameServer sidecar
+
+⚠️⚠️⚠️ **This is currently a development feature and has not been released** ⚠️⚠️⚠️
+
+You may also want to tweak the CPU request or limits on the SDK `GameServer` sidecar process that spins up alongside
+each game server container.
+
+You can do this through the [Helm configuration](../install/helm/README.md#configuration) when installing Agones.
+
+By default, this is set to having a CPU request value of 30m, with no hard CPU limit. This ensures that the sidecar always has enough CPU
+to function, but it is configurable in case a lower, or higher value is required on your clusters, or if you desire 
+hard limit.
diff --git a/install/helm/README.md b/install/helm/README.md
@@ -92,6 +92,8 @@ The following tables lists the configurable parameters of the Agones chart and t
 | `agones.image.controller.pullPolicy`                | Image pull policy for the controller                                                            | `IfNotPresent`         |
 | `agones.image.controller.pullSecret`                | Image pull secret for the controller                                                            | ``                     |
 | `agones.image.sdk.name`                             | Image name for the sdk                                                                          | `agones-sdk`           |
+| `agones.image.sdk.cpuRequest`                       | (⚠️ Development feature ⚠️) the [cpu request][constraints] for sdk server container              | `30m`                  |
+| `agones.image.sdk.cpuLimit`                         | (⚠️ Development feature ⚠️) the [cpu limit][constraints] for the sdk server container            | `0` (none)             |
 | `agones.image.sdk.alwaysPull`                       | Tells if the sdk image should always be pulled                                                  | `false`                |
 | `agones.controller.healthCheck.http.port`           | Port to use for liveness probe service                                                          | `8080`                 |
 | `agones.controller.healthCheck.initialDelaySeconds` | Initial delay before performing the first probe (in seconds)                                    | `3`                    |
@@ -104,6 +106,8 @@ The following tables lists the configurable parameters of the Agones chart and t
 | `gameservers.minPort`                               | Minimum port to use for dynamic port allocation                                                 | `7000`                 |
 | `gameservers.maxPort`                               | Maximum port to use for dynamic port allocation                                                 | `8000`                 |
 
+[constraints]: https://kubernetes.io/docs/tasks/administer-cluster/manage-resources/cpu-constraint-namespace/
+
 Specify each parameter using the `--set key=value[,key=value]` argument to `helm install`. For example,
 
 ```bash

diff --git a/install/helm/agones/templates/controller.yaml b/install/helm/agones/templates/controller.yaml
@@ -51,16 +51,20 @@ spec:
         image: "{{ .Values.agones.image.registry }}/{{ .Values.agones.image.controller.name}}:{{ .Values.agones.image.tag }}"
         imagePullPolicy: {{ .Values.agones.image.controller.pullPolicy }}
         env:
-        - name: ALWAYS_PULL_SIDECAR # set the sidecar imagePullPolicy to Always
-          value: {{ .Values.agones.image.sdk.alwaysPull | quote }}
         # minimum port that can be exposed to GameServer traffic
         - name: MIN_PORT
-          value: {{ .Values.gameservers.minPort | quote }} 
+          value: {{ .Values.gameservers.minPort | quote }}
         # maximum port that can be exposed to GameServer traffic
         - name: MAX_PORT
           value: {{ .Values.gameservers.maxPort | quote }}
-        - name: SIDECAR # overwrite the GameServer sidecar image that is used
+        - name: SIDECAR_IMAGE # overwrite the GameServer sidecar image that is used
           value: "{{ .Values.agones.image.registry }}/{{ .Values.agones.image.sdk.name}}:{{ .Values.agones.image.tag }}"
+        - name: ALWAYS_PULL_SIDECAR # set the sidecar imagePullPolicy to Always
+          value: {{ .Values.agones.image.sdk.alwaysPull | quote }}
+        - name: SIDECAR_CPU_REQUEST
+          value: {{ .Values.agones.image.sdk.cpuRequest | quote }}
+        - name: SIDECAR_CPU_LIMIT
+          value: {{ .Values.agones.image.sdk.cpuLimit | quote }}
         livenessProbe:
           httpGet:
             path: /live

diff --git a/install/helm/agones/values.yaml b/install/helm/agones/values.yaml
@@ -37,6 +37,8 @@ agones:
       pullPolicy: IfNotPresent
     sdk:
       name: agones-sdk
+      cpuRequest: 30m
+      cpuLimit: 0
       alwaysPull: false
 
 gameservers:

diff --git a/install/yaml/install.yaml b/install/yaml/install.yaml
@@ -859,16 +859,20 @@ spec:
         image: "gcr.io/agones-images/agones-controller:0.6.0-rc"
         imagePullPolicy: IfNotPresent
         env:
-        - name: ALWAYS_PULL_SIDECAR # set the sidecar imagePullPolicy to Always
-          value: "false"
         # minimum port that can be exposed to GameServer traffic
         - name: MIN_PORT
-          value: "7000" 
+          value: "7000"
         # maximum port that can be exposed to GameServer traffic
         - name: MAX_PORT
           value: "8000"
-        - name: SIDECAR # overwrite the GameServer sidecar image that is used
+        - name: SIDECAR_IMAGE # overwrite the GameServer sidecar image that is used
           value: "gcr.io/agones-images/agones-sdk:0.6.0-rc"
+        - name: ALWAYS_PULL_SIDECAR # set the sidecar imagePullPolicy to Always
+          value: "false"
+        - name: SIDECAR_CPU_REQUEST
+          value: "30m"
+        - name: SIDECAR_CPU_LIMIT
+          value: "0"
         livenessProbe:
           httpGet:
             path: /live

diff --git a/pkg/gameservers/controller.go b/pkg/gameservers/controller.go
@@ -38,6 +38,7 @@ import (
 	extclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
 	"k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset/typed/apiextensions/v1beta1"
 	k8serrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -59,6 +60,8 @@ type Controller struct {
 	logger                 *logrus.Entry
 	sidecarImage           string
 	alwaysPullSidecarImage bool
+	sidecarCPURequest      resource.Quantity
+	sidecarCPULimit        resource.Quantity
 	crdGetter              v1beta1.CustomResourceDefinitionInterface
 	podGetter              typedcorev1.PodsGetter
 	podLister              corelisterv1.PodLister
@@ -71,8 +74,9 @@ type Controller struct {
 	healthController       *HealthController
 	workerqueue            *workerqueue.WorkerQueue
 	allocationMutex        *sync.Mutex
-	stop                   <-chan struct{}
-	recorder               record.EventRecorder
+	stop                   <-chan struct {
+	}
+	recorder record.EventRecorder
 }
 
 // NewController returns a new gameserver crd controller
@@ -83,6 +87,8 @@ func NewController(
 	minPort, maxPort int32,
 	sidecarImage string,
 	alwaysPullSidecarImage bool,
+	sidecarCPURequest resource.Quantity,
+	sidecarCPULimit resource.Quantity,
 	kubeClient kubernetes.Interface,
 	kubeInformerFactory informers.SharedInformerFactory,
 	extClient extclientset.Interface,
@@ -95,6 +101,8 @@ func NewController(
 
 	c := &Controller{
 		sidecarImage:           sidecarImage,
+		sidecarCPULimit:        sidecarCPULimit,
+		sidecarCPURequest:      sidecarCPURequest,
 		alwaysPullSidecarImage: alwaysPullSidecarImage,
 		allocationMutex:        allocationMutex,
 		crdGetter:              extClient.ApiextensionsV1beta1().CustomResourceDefinitions(),
@@ -463,6 +471,7 @@ func (c *Controller) sidecar(gs *v1alpha1.GameServer) corev1.Container {
 				},
 			},
 		},
+		Resources: corev1.ResourceRequirements{},
 		LivenessProbe: &corev1.Probe{
 			Handler: corev1.Handler{
 				HTTPGet: &corev1.HTTPGetAction{
@@ -474,6 +483,15 @@ func (c *Controller) sidecar(gs *v1alpha1.GameServer) corev1.Container {
 			PeriodSeconds:       3,
 		},
 	}
+
+	if !c.sidecarCPURequest.IsZero() {
+		sidecar.Resources.Requests = corev1.ResourceList{corev1.ResourceCPU: c.sidecarCPURequest}
+	}
+
+	if !c.sidecarCPULimit.IsZero() {
+		sidecar.Resources.Limits = corev1.ResourceList{corev1.ResourceCPU: c.sidecarCPULimit}
+	}
+
 	if c.alwaysPullSidecarImage {
 		sidecar.ImagePullPolicy = corev1.PullAlways
 	}