Skip to content

Commit

Permalink
Monitoring improvements:
Browse files Browse the repository at this point in the history
- excluded system nodes (tainted with stable.agones.dev/*) from GS/node census
- fixed distribution buckets to account for the fact that OpenCensus uses buckets with "<" while Prometheus uses "<=", which previously made it impossible to properly count nodes with zero gameservers
- fixed grafana dashboard gameservers/node ("avg")
- reduced rate window for gameservers/node from 5m to 1m
  • Loading branch information
jkowalski committed Feb 4, 2019
1 parent a2adb0a commit 1d9fd20
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 35 deletions.
12 changes: 6 additions & 6 deletions build/grafana/dashboard-gameservers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -401,42 +401,42 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(1, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))",
"expr": "histogram_quantile(1, sum(rate(agones_gameservers_node_count_bucket[1m])) by (le))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "max",
"refId": "F"
},
{
"expr": "histogram_quantile(0.99, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))",
"expr": "histogram_quantile(0.99, sum(rate(agones_gameservers_node_count_bucket[1m])) by (le))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "99th",
"refId": "A"
},
{
"expr": "histogram_quantile(0.90, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))",
"expr": "histogram_quantile(0.90, sum(rate(agones_gameservers_node_count_bucket[1m])) by (le))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "90th",
"refId": "B"
},
{
"expr": "histogram_quantile(0.50, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))",
"expr": "histogram_quantile(0.50, sum(rate(agones_gameservers_node_count_bucket[1m])) by (le))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "50th",
"refId": "C"
},
{
"expr": "histogram_quantile(0, sum(rate(agones_gameservers_node_count_bucket[5m])) by (le))",
"expr": "histogram_quantile(0, sum(rate(agones_gameservers_node_count_bucket[1m])) by (le))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "min",
"refId": "E"
},
{
"expr": " agones_gameservers_node_count_sum /\n agones_gameservers_node_count_count",
"expr": "avg(delta(agones_gameservers_node_count_sum[1m]) / delta(agones_gameservers_node_count_count[1m]))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "avg",
Expand Down
27 changes: 27 additions & 0 deletions pkg/metrics/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ package metrics
import (
"context"
"strconv"
"strings"
"sync"
"time"

corev1 "k8s.io/api/core/v1"
v1 "k8s.io/client-go/listers/core/v1"

stablev1alpha1 "agones.dev/agones/pkg/apis/stable/v1alpha1"
Expand Down Expand Up @@ -371,6 +373,8 @@ func (c *Controller) collectNodeCounts() {
c.logger.WithError(err).Warn("failed listing gameservers")
return
}

nodes = removeSystemNodes(nodes)
recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")},
nodesCountStats.M(int64(len(nodes)-len(gsPerNodes))))
recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")},
Expand All @@ -381,3 +385,26 @@ func (c *Controller) collectNodeCounts() {
}

}

// removeSystemNodes returns the nodes from the given list that are not system
// nodes, as decided by isSystemNode. The relative order of the remaining nodes
// is preserved and the input slice is not modified; the result is nil when no
// node is kept.
func removeSystemNodes(nodes []*corev1.Node) []*corev1.Node {
	var kept []*corev1.Node

	for i := range nodes {
		if isSystemNode(nodes[i]) {
			// System nodes are left out of the returned list.
			continue
		}
		kept = append(kept, nodes[i])
	}

	return kept
}

// isSystemNode reports whether the node carries at least one taint whose key
// begins with "stable.agones.dev/". A node without taints is never a system node.
func isSystemNode(n *corev1.Node) bool {
	const systemTaintPrefix = "stable.agones.dev/"

	taints := n.Spec.Taints
	for i := range taints {
		if strings.HasPrefix(taints[i].Key, systemTaintPrefix) {
			return true
		}
	}

	return false
}
2 changes: 1 addition & 1 deletion pkg/metrics/controller_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ var (
Name: "gameservers_node_count",
Measure: gsPerNodesCountStats,
Description: "The count of gameservers per node in the cluster",
Aggregation: view.Distribution(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, 40, 50, 60, 70, 80, 90, 100, 110, 120),
Aggregation: view.Distribution(0.00001, 1.00001, 2.00001, 3.00001, 4.00001, 5.00001, 6.00001, 7.00001, 8.00001, 9.00001, 10.00001, 11.00001, 12.00001, 13.00001, 14.00001, 15.00001, 16.00001, 32.00001, 40.00001, 50.00001, 60.00001, 70.00001, 80.00001, 90.00001, 100.00001, 110.00001, 120.00001),
},
}
)
Expand Down
5 changes: 3 additions & 2 deletions pkg/metrics/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ func TestControllerGameServersNodeState(t *testing.T) {
c.collect()
report()

assert.Nil(t, testutil.GatherAndCompare(registry, strings.NewReader(nodeCountExpected), "agones_nodes_count", "agones_gameservers_node_count"))

if err := testutil.GatherAndCompare(registry, strings.NewReader(nodeCountExpected), "agones_nodes_count", "agones_gameservers_node_count"); err != nil {
t.Fatal(err)
}
}
53 changes: 27 additions & 26 deletions pkg/metrics/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,32 +290,33 @@ agones_fleet_autoscalers_limited{fleet_name="deleted-fleet",name="deleted"} 0

var nodeCountExpected = `# HELP agones_gameservers_node_count The count of gameservers per node in the cluster
# TYPE agones_gameservers_node_count histogram
agones_gameservers_node_count_bucket{le="1"} 1
agones_gameservers_node_count_bucket{le="2"} 2
agones_gameservers_node_count_bucket{le="3"} 3
agones_gameservers_node_count_bucket{le="4"} 3
agones_gameservers_node_count_bucket{le="5"} 3
agones_gameservers_node_count_bucket{le="6"} 3
agones_gameservers_node_count_bucket{le="7"} 3
agones_gameservers_node_count_bucket{le="8"} 3
agones_gameservers_node_count_bucket{le="9"} 3
agones_gameservers_node_count_bucket{le="10"} 3
agones_gameservers_node_count_bucket{le="11"} 3
agones_gameservers_node_count_bucket{le="12"} 3
agones_gameservers_node_count_bucket{le="13"} 3
agones_gameservers_node_count_bucket{le="14"} 3
agones_gameservers_node_count_bucket{le="15"} 3
agones_gameservers_node_count_bucket{le="16"} 3
agones_gameservers_node_count_bucket{le="32"} 3
agones_gameservers_node_count_bucket{le="40"} 3
agones_gameservers_node_count_bucket{le="50"} 3
agones_gameservers_node_count_bucket{le="60"} 3
agones_gameservers_node_count_bucket{le="70"} 3
agones_gameservers_node_count_bucket{le="80"} 3
agones_gameservers_node_count_bucket{le="90"} 3
agones_gameservers_node_count_bucket{le="100"} 3
agones_gameservers_node_count_bucket{le="110"} 3
agones_gameservers_node_count_bucket{le="120"} 3
agones_gameservers_node_count_bucket{le="1e-05"} 1
agones_gameservers_node_count_bucket{le="1.00001"} 2
agones_gameservers_node_count_bucket{le="2.00001"} 3
agones_gameservers_node_count_bucket{le="3.00001"} 3
agones_gameservers_node_count_bucket{le="4.00001"} 3
agones_gameservers_node_count_bucket{le="5.00001"} 3
agones_gameservers_node_count_bucket{le="6.00001"} 3
agones_gameservers_node_count_bucket{le="7.00001"} 3
agones_gameservers_node_count_bucket{le="8.00001"} 3
agones_gameservers_node_count_bucket{le="9.00001"} 3
agones_gameservers_node_count_bucket{le="10.00001"} 3
agones_gameservers_node_count_bucket{le="11.00001"} 3
agones_gameservers_node_count_bucket{le="12.00001"} 3
agones_gameservers_node_count_bucket{le="13.00001"} 3
agones_gameservers_node_count_bucket{le="14.00001"} 3
agones_gameservers_node_count_bucket{le="15.00001"} 3
agones_gameservers_node_count_bucket{le="16.00001"} 3
agones_gameservers_node_count_bucket{le="32.00001"} 3
agones_gameservers_node_count_bucket{le="40.00001"} 3
agones_gameservers_node_count_bucket{le="50.00001"} 3
agones_gameservers_node_count_bucket{le="60.00001"} 3
agones_gameservers_node_count_bucket{le="70.00001"} 3
agones_gameservers_node_count_bucket{le="80.00001"} 3
agones_gameservers_node_count_bucket{le="90.00001"} 3
agones_gameservers_node_count_bucket{le="100.00001"} 3
agones_gameservers_node_count_bucket{le="110.00001"} 3
agones_gameservers_node_count_bucket{le="120.00001"} 3
agones_gameservers_node_count_bucket{le="+Inf"} 3
agones_gameservers_node_count_sum 3
agones_gameservers_node_count_count 3
Expand Down

0 comments on commit 1d9fd20

Please sign in to comment.