Skip to content

Commit

Permalink
Add Stackdriver Metrics Exporter
Browse files Browse the repository at this point in the history
Add Opencensus Stackdriver Exporter functionality for Agones metrics.
New docs on how to set Stackdriver Dashboard and configure permissions.
Add helm config variable as well as change reporting period to 1 minute
 if the Stackdriver exporter is enabled.
  • Loading branch information
aLekSer committed Jan 25, 2019
1 parent 418bdb9 commit bb09feb
Show file tree
Hide file tree
Showing 5,323 changed files with 5,278,158 additions and 81,935 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
302 changes: 274 additions & 28 deletions Gopkg.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,7 @@
[[constraint]]
name = "github.com/evanphx/json-patch"
version = "4.1.0"

[[constraint]]
name = "contrib.go.opencensus.io/exporter/stackdriver"
version = "v0.8.0"
72 changes: 52 additions & 20 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,20 @@ import (
)

const (
enableMetricsFlag = "metrics"
sidecarImageFlag = "sidecar-image"
sidecarCPURequestFlag = "sidecar-cpu-request"
sidecarCPULimitFlag = "sidecar-cpu-limit"
pullSidecarFlag = "always-pull-sidecar"
minPortFlag = "min-port"
maxPortFlag = "max-port"
certFileFlag = "cert-file"
keyFileFlag = "key-file"
kubeconfigFlag = "kubeconfig"
workers = 2
defaultResync = 30 * time.Second
enableStackdriverMetricsFlag = "stackdriver-exporter"
enablePrometheusMetricsFlag = "prometheus-exporter"
projectIDFlag = "gcp-project-id"
sidecarImageFlag = "sidecar-image"
sidecarCPURequestFlag = "sidecar-cpu-request"
sidecarCPULimitFlag = "sidecar-cpu-limit"
pullSidecarFlag = "always-pull-sidecar"
minPortFlag = "min-port"
maxPortFlag = "max-port"
certFileFlag = "cert-file"
keyFileFlag = "key-file"
kubeconfigFlag = "kubeconfig"
workers = 2
defaultResync = 30 * time.Second
)

var (
Expand Down Expand Up @@ -106,19 +108,39 @@ func main() {
var rs []runner
var health healthcheck.Handler

if ctlConf.Metrics {
// Stackdriver metrics
if ctlConf.Stackdriver {
sd, err := metrics.RegisterStackdriverExporter(ctlConf.GCPProjectID)
if err != nil {
logger.WithError(err).Fatal("Could not register stackdriver exporter")
}
// It is imperative to invoke flush before your main function exits
defer sd.Flush()
}

// Prometheus metrics
if ctlConf.PrometheusMetrics {
registry := prom.NewRegistry()
metricHandler, err := metrics.RegisterPrometheusExporter(registry)
if err != nil {
logger.WithError(err).Fatal("Could not create register prometheus exporter")
logger.WithError(err).Fatal("Could not register prometheus exporter")
}
server.Handle("/metrics", metricHandler)
health = healthcheck.NewMetricsHandler(registry, "agones")
rs = append(rs, metrics.NewController(kubeClient, agonesClient, agonesInformerFactory))
} else {
health = healthcheck.NewHandler()
}

// If only the Prometheus exporter is used we can report more often (every 1 second).
// If Stackdriver is enabled we use a 60 second reporting period, which is a
// requirement of Stackdriver; otherwise most of the time series would be invalid for Stackdriver.
metrics.SetReportingPeriod(ctlConf.PrometheusMetrics, ctlConf.Stackdriver)

// Add metrics controller only if we configure one of metrics exporters
if ctlConf.PrometheusMetrics || ctlConf.Stackdriver {
rs = append(rs, metrics.NewController(kubeClient, agonesClient, agonesInformerFactory))
}

server.Handle("/", health)

allocationMutex := &sync.Mutex{}
Expand Down Expand Up @@ -170,7 +192,9 @@ func parseEnvFlags() config {
viper.SetDefault(pullSidecarFlag, false)
viper.SetDefault(certFileFlag, filepath.Join(base, "certs/server.crt"))
viper.SetDefault(keyFileFlag, filepath.Join(base, "certs/server.key"))
viper.SetDefault(enableMetricsFlag, true)
viper.SetDefault(enablePrometheusMetricsFlag, true)
viper.SetDefault(enableStackdriverMetricsFlag, false)
viper.SetDefault(projectIDFlag, "")

pflag.String(sidecarImageFlag, viper.GetString(sidecarImageFlag), "Flag to overwrite the GameServer sidecar image that is used. Can also use SIDECAR env variable")
pflag.String(sidecarCPULimitFlag, viper.GetString(sidecarCPULimitFlag), "Flag to overwrite the GameServer sidecar container's cpu limit. Can also use SIDECAR_CPU_LIMIT env variable")
Expand All @@ -181,7 +205,9 @@ func parseEnvFlags() config {
pflag.String(keyFileFlag, viper.GetString(keyFileFlag), "Optional. Path to the key file")
pflag.String(certFileFlag, viper.GetString(certFileFlag), "Optional. Path to the crt file")
pflag.String(kubeconfigFlag, viper.GetString(kubeconfigFlag), "Optional. kubeconfig to run the controller out of the cluster. Only use it for debugging as webhook won't works.")
pflag.Bool(enableMetricsFlag, viper.GetBool(enableMetricsFlag), "Flag to activate metrics of Agones. Can also use METRICS env variable.")
pflag.Bool(enablePrometheusMetricsFlag, viper.GetBool(enablePrometheusMetricsFlag), "Flag to activate metrics of Agones. Can also use PROMETHEUS_EXPORTER env variable.")
pflag.Bool(enableStackdriverMetricsFlag, viper.GetBool(enableStackdriverMetricsFlag), "Flag to activate stackdriver monitoring metrics for Agones. Can also use STACKDRIVER_EXPORTER env variable.")
pflag.String(projectIDFlag, viper.GetString(projectIDFlag), "GCP ProjectID used for Stackdriver, if not specified ProjectID from Application Default Credentials would be used. Can also use GCP_PROJECT_ID env variable.")
pflag.Parse()

viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
Expand All @@ -194,7 +220,9 @@ func parseEnvFlags() config {
runtime.Must(viper.BindEnv(keyFileFlag))
runtime.Must(viper.BindEnv(certFileFlag))
runtime.Must(viper.BindEnv(kubeconfigFlag))
runtime.Must(viper.BindEnv(enableMetricsFlag))
runtime.Must(viper.BindEnv(enablePrometheusMetricsFlag))
runtime.Must(viper.BindEnv(enableStackdriverMetricsFlag))
runtime.Must(viper.BindEnv(projectIDFlag))
runtime.Must(viper.BindPFlags(pflag.CommandLine))

request, err := resource.ParseQuantity(viper.GetString(sidecarCPURequestFlag))
Expand All @@ -217,7 +245,9 @@ func parseEnvFlags() config {
KeyFile: viper.GetString(keyFileFlag),
CertFile: viper.GetString(certFileFlag),
KubeConfig: viper.GetString(kubeconfigFlag),
Metrics: viper.GetBool(enableMetricsFlag),
PrometheusMetrics: viper.GetBool(enablePrometheusMetricsFlag),
Stackdriver: viper.GetBool(enableStackdriverMetricsFlag),
GCPProjectID: viper.GetString(projectIDFlag),
}
}

Expand All @@ -229,10 +259,12 @@ type config struct {
SidecarCPURequest resource.Quantity
SidecarCPULimit resource.Quantity
AlwaysPullSidecar bool
Metrics bool
PrometheusMetrics bool
Stackdriver bool
KeyFile string
CertFile string
KubeConfig string
GCPProjectID string
}

// validate ensures the ctlConfig data is valid.
Expand Down
10 changes: 7 additions & 3 deletions install/helm/agones/templates/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ spec:
{{- if .Values.agones.controller.generateTLS }}
revision/tls-cert: {{ .Release.Revision | quote }}
{{- end }}
{{- if and (.Values.agones.metrics.prometheusServiceDiscovery) (.Values.agones.metrics.enabled) }}
{{- if and (.Values.agones.metrics.prometheusServiceDiscovery) (.Values.agones.metrics.prometheusEnabled) }}
prometheus.io/scrape: "true"
prometheus.io/port: {{ .Values.agones.controller.http.port | quote }}
prometheus.io/path: "/metrics"
Expand Down Expand Up @@ -81,8 +81,12 @@ spec:
value: {{ .Values.agones.image.sdk.alwaysPull | quote }}
- name: SIDECAR_CPU_REQUEST
value: {{ .Values.agones.image.sdk.cpuRequest | quote }}
- name: METRICS
value: {{ .Values.agones.metrics.enabled | quote }}
- name: PROMETHEUS_EXPORTER
value: {{ .Values.agones.metrics.prometheusEnabled | quote }}
- name: STACKDRIVER_EXPORTER
value: {{ .Values.agones.metrics.stackdriverEnabled | quote }}
- name: GCP_PROJECT_ID
value: {{ .Values.agones.metrics.stackdriverProjectID | quote }}
- name: SIDECAR_CPU_LIMIT
value: {{ .Values.agones.image.sdk.cpuLimit | quote }}
livenessProbe:
Expand Down
4 changes: 3 additions & 1 deletion install/helm/agones/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@

agones:
metrics:
enabled: true
prometheusEnabled: true
prometheusServiceDiscovery: true
stackdriverEnabled: false
stackdriverProjectID: ""
rbacEnabled: true
crds:
install: true
Expand Down
6 changes: 5 additions & 1 deletion install/yaml/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1031,8 +1031,12 @@ spec:
value: "false"
- name: SIDECAR_CPU_REQUEST
value: "30m"
- name: METRICS
- name: PROMETHEUS_EXPORTER
value: "true"
- name: STACKDRIVER_EXPORTER
value: "false"
- name: GCP_PROJECT_ID
value: ""
- name: SIDECAR_CPU_LIMIT
value: "0"
livenessProbe:
Expand Down
38 changes: 36 additions & 2 deletions pkg/metrics/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"net/http"
"time"

"contrib.go.opencensus.io/exporter/stackdriver"
prom "github.com/prometheus/client_golang/prometheus"
"go.opencensus.io/exporter/prometheus"
"go.opencensus.io/stats/view"
Expand All @@ -41,7 +42,40 @@ func RegisterPrometheusExporter(registry *prom.Registry) (http.Handler, error) {
return nil, err
}
view.RegisterExporter(pe)
// since we're using prometheus we can report faster as we're only exposing metrics in memory
view.SetReportingPeriod(1 * time.Second)

return pe, nil
}

// RegisterStackdriverExporter creates an OpenCensus Stackdriver exporter and
// registers it as a view exporter, so that Agones metrics are sent to
// Stackdriver on Google Cloud under the "agones" metric prefix.
//
// projectID is handed to the exporter unchanged. Per the controller's flag
// help, an empty value means the project from Application Default Credentials
// is used (NOTE(review): that fallback lives in the exporter library, not here).
//
// If the exporter cannot be created, the error is returned and nothing is
// registered.
func RegisterStackdriverExporter(projectID string) (sd *stackdriver.Exporter, err error) {
	opts := stackdriver.Options{
		ProjectID: projectID,
		// MetricPrefix helps uniquely identify your metrics.
		MetricPrefix: "agones",
	}

	if sd, err = stackdriver.NewExporter(opts); err != nil {
		return sd, err
	}

	// Make the exporter a destination for all registered views.
	view.RegisterExporter(sd)
	return sd, nil
}

// SetReportingPeriod picks the OpenCensus view reporting period according to
// which exporters are enabled:
//   - stackdriver enabled (with or without prometheus): 60 seconds
//   - only prometheus enabled: 1 second
//   - neither enabled: the reporting period is left untouched
func SetReportingPeriod(prometheus, stackdriver bool) {
	switch {
	case stackdriver:
		// There is a limitation on Stackdriver that reporting should
		// be equal or more than 1 minute, so it takes precedence.
		view.SetReportingPeriod(60 * time.Second)
	case prometheus:
		// With prometheus alone we can report faster, as we're only
		// exposing metrics in memory.
		view.SetReportingPeriod(1 * time.Second)
	}
}
2 changes: 1 addition & 1 deletion site/content/en/docs/Contribute/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ or to hide a section from 0.8.0 onward:

```markdown
{{\% feature expiryVersion="0.8.0" %}}
This is my special content that she be hidden <= 0.8.0
This is my special content that will be hidden >= 0.8.0
{{\% /feature %}}
```

Expand Down
43 changes: 42 additions & 1 deletion site/content/en/docs/Guides/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,14 @@ Table of Contents
- [Installation](#installation)
- [Prometheus installation](#prometheus-installation)
- [Grafana installation](#grafana-installation)
- [Stackdriver installation](#stackdriver-installation)
- [Adding more metrics](#adding-more-metrics)

## Backend integrations

### Prometheus

If you are running a [Prometheus](https://prometheus.io/) instance you just need to ensure that metrics and kubernetes service discovery are enabled. (helm chart values `agones.metrics.enabled` and `agones.metrics.prometheusServiceDiscovery`). This will automatically add annotations required by Prometheus to discover Agones metrics and start collecting them. (see [example](https://github.com/prometheus/prometheus/tree/master/documentation/examples/kubernetes-rabbitmq))
If you are running a [Prometheus](https://prometheus.io/) instance you just need to ensure that metrics and kubernetes service discovery are enabled. (helm chart values {{% feature expiryVersion="0.8.0" %}}`agones.metrics.enabled`{{% /feature %}}{{% feature publishVersion="0.8.0" %}}`agones.metrics.prometheusEnabled`{{% /feature %}} and `agones.metrics.prometheusServiceDiscovery`). This will automatically add annotations required by Prometheus to discover Agones metrics and start collecting them. (see [example](https://github.com/prometheus/prometheus/tree/master/documentation/examples/kubernetes-rabbitmq))

### Prometheus Operator

Expand All @@ -54,10 +55,17 @@ Finally include that `ServiceMonitor` in your [Prometheus instance CRD](https://

### Stackdriver

{{% feature expiryVersion="0.8.0" %}}
We don't yet support the [OpenCensus Stackdriver exporter](https://opencensus.io/exporters/supported-exporters/go/stackdriver/)
but you can still use the Prometheus Stackdriver integration by following these [instructions](https://cloud.google.com/monitoring/kubernetes-engine/prometheus).
Annotations required by this integration can be activated by setting the `agones.metrics.prometheusServiceDiscovery`
to true (default) via the [helm chart value]({{< relref "../Installation/helm.md" >}}).
{{% /feature %}}
{{% feature publishVersion="0.8.0" %}}
We support the [OpenCensus Stackdriver exporter](https://opencensus.io/exporters/supported-exporters/go/stackdriver/).
In order to use it you should enable [Stackdriver Monitoring API](https://cloud.google.com/monitoring/api/enable-api) in Google Cloud Console.
Follow the [Stackdriver Installation steps](#stackdriver-installation) to see your metrics on Stackdriver Monitoring website.
{{% /feature %}}

## Metrics available

Expand Down Expand Up @@ -174,6 +182,39 @@ Open a web browser to [http://127.0.0.1:3000](http://127.0.0.1:3000), you should

> Makefile targets `make grafana-portforward`,`make kind-grafana-portforward` and `make minikube-grafana-portforward`.

{{% feature publishVersion="0.8.0" %}}
### Stackdriver installation

In order to use [Stackdriver monitoring](https://app.google.stackdriver.com) you should [enable the Stackdriver Monitoring API](https://cloud.google.com/monitoring/api/enable-api) in the Google Cloud Console. You need to grant all the necessary permissions to the users (see the [Access Control Guide](https://cloud.google.com/monitoring/access-control)). The Stackdriver exporter uses a strategy called Application Default Credentials (ADC) to find your application's credentials. Details can be found in [Setting Up Authentication for Server to Server Production Applications](https://cloud.google.com/docs/authentication/production).

Note that Stackdriver monitoring is enabled by default on GKE clusters, however you can follow this [guide](https://cloud.google.com/kubernetes-engine/docs/how-to/monitoring#enabling_stackdriver_monitoring) if it was disabled on your GKE cluster.

The default metrics exporter is Prometheus. To switch to Stackdriver, upgrade the Agones release using helm with the following three chart parameters changed:
```
helm upgrade --install --wait --set agones.metrics.stackdriverEnabled=true --set agones.metrics.prometheusEnabled=false --set agones.metrics.prometheusServiceDiscovery=false agones ../install/helm/agones/
```

With this configuration, only the Stackdriver exporter is used; the Prometheus exporter is disabled.

Create a Fleet or a GameServer to check that the connection to the Stackdriver API is configured properly and to generate metrics data that you can view.

Visit [Stackdriver monitoring](https://app.google.stackdriver.com) website, select your project, or choose `Create a new Workspace` and select GCP project where your cluster resides. In [Stackdriver metrics explorer](https://cloud.google.com/monitoring/charts/metrics-explorer) you should be able to find new metrics with prefix `agones/` (resource type is `Global`) after a couple of minutes. Choose the metrics you are interested in and add to a single or separate graphs. You can create multiple graphs, save them into your dashboard and use various aggregation parameters and reducers for each graph.

Example of the dashboard appearance is provided below:

![stackdriver monitoring dashboard](../../../images/stackdriver-metrics-dashboard.png)

Currently, the Stackdriver Dashboard can only be configured manually, so it is up to you to set the Alignment Period (the minimum is 1 minute), GroupBy, Filter parameters and other graph settings.

#### Troubleshooting
If you can't see Agones metrics, have a look at the controller logs for connection errors. Also ensure that your cluster has the necessary credentials to interact with Stackdriver Monitoring. If the automatic project discovery is not working, you can set the project manually with the `agones.metrics.stackdriverProjectID` helm chart value.

Permissions problem example from controller logs:
```
Failed to export to Stackdriver: rpc error: code = PermissionDenied desc = Permission monitoring.metricDescriptors.create denied (or the resource may not exist).
```
{{% /feature %}}

## Adding more metrics

If you want to contribute and add more metrics we recommend to use shared informers (cache) as it is currently implemented in the {{< ghlink href="pkg/metrics/controller.go" branch="master" >}}metrics controller{{< /ghlink >}}.
Expand Down
5 changes: 4 additions & 1 deletion site/content/en/docs/Installation/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ The following tables lists the configurable parameters of the Agones chart and t
| --------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ---------------------- |
| `agones.rbacEnabled` | Creates RBAC resources. Must be set for any cluster configured with RBAC | `true` |
| `agones.crds.install` | Install the CRDs with this chart. Useful to disable if you want to subchart (since crd-install hook is broken), so you can copy the CRDs into your own chart. | `true` |
| `agones.crds.cleanupOnDelete` | Run the pre-delete hook to delete all GameServers and their backing Pods when deleting the helm chart, so that all CRDs can be removed on chart deletion | `true` |
| `agones.crds.cleanupOnDelete` | Run the pre-delete hook to delete all GameServers and their backing Pods when deleting the helm chart, so that all CRDs can be removed on chart deletion | `true` |
| `agones.metrics.enabled` | Enables controller metrics on port `8080` and path `/metrics` | `true` |
| `agones.metrics.prometheusServiceDiscovery` | Adds annotations for Prometheus ServiceDiscovery (and also Stackdriver) | `true` |
| `agones.serviceaccount.controller` | Service account name for the controller | `agones-controller` |
Expand Down Expand Up @@ -136,6 +136,9 @@ The following tables lists the configurable parameters of the Agones chart and t

| Parameter | Description | Default |
| --------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ---------------------- |
| `agones.metrics.prometheusEnabled` | Enables controller metrics on port `8080` and path `/metrics` | `true` |
| `agones.metrics.stackdriverEnabled` | Enables Stackdriver exporter of controller metrics | `false` |
| `agones.metrics.stackdriverProjectID` | This overrides the default gcp project id for use with stackdriver | `` |
| `agones.controller.nodeSelector` | Controller [node labels](nodeSelector) for pod assignment | `{}` |
| `agones.controller.tolerations` | Controller [toleration][toleration] labels for pod assignment | `[]` |
| `agones.controller.affinity` | Controller [affinity](affinity) settings for pod assignment | `{}` |
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit bb09feb

Please sign in to comment.