Skip to content

Commit

Permalink
[Performance] - Added a new metric inside the allocator to track the …
Browse files Browse the repository at this point in the history
…success retry rate inside the retry loop (#3927)

* metrics changes

* metrics changes

* error rate metric

* error rate metric nit

* error rate metric nit nit

* error rate metric nit nit nit

* error rate metrics update increment

* error rate metric update

* error rate update

* latency update

* test

* test

* test

* added metric for success rate inside the retry loop of the allocator

* nit

* nit

* nit

* added metric for gameserver creation duration inside the controller

* updated metric to be a histogram and exported recordwithtags function

* lint

* added documentation with feature shortcode

* nit

* nit

* comment nit

* lint

* addressed nit comments and changed function name

* addressed comments and updated documentation
  • Loading branch information
vicentefb authored Aug 22, 2024
1 parent 079ced8 commit 1a5dc06
Show file tree
Hide file tree
Showing 8 changed files with 250 additions and 50 deletions.
9 changes: 8 additions & 1 deletion pkg/gameserverallocations/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,20 @@ func (c *Allocator) loggerForGameServerAllocation(gsa *allocationv1.GameServerAl
}

// allocateFromLocalCluster allocates gameservers from the local cluster.
// Registers number of times we retried before getting a success allocation
func (c *Allocator) allocateFromLocalCluster(ctx context.Context, gsa *allocationv1.GameServerAllocation) (*allocationv1.GameServerAllocation, error) {
var gs *agonesv1.GameServer
retry := c.newMetrics(ctx)
retryCount := 0
err := Retry(allocationRetry, func() error {
var err error
gs, err = c.allocate(ctx, gsa)
retryCount++

if err != nil {
c.loggerForGameServerAllocation(gsa).WithError(err).Warn("failed to allocate. Retrying...")
c.loggerForGameServerAllocation(gsa).WithError(err).Warn("Failed to Allocated. Retrying...")
} else {
retry.recordAllocationRetrySuccess(ctx, retryCount)
}
return err
})
Expand Down
41 changes: 33 additions & 8 deletions pkg/gameserverallocations/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,23 +32,42 @@ import (
)

var (
logger = runtime.NewLoggerWithSource("metrics")

keyFleetName = mt.MustTagKey("fleet_name")
keyClusterName = mt.MustTagKey("cluster_name")
keyMultiCluster = mt.MustTagKey("is_multicluster")
keyStatus = mt.MustTagKey("status")
keySchedulingStrategy = mt.MustTagKey("scheduling_strategy")

gameServerAllocationsLatency = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s")
gameServerAllocationsLatency = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s")
gameServerAllocationsRetryTotal = stats.Int64("gameserver_allocations/errors", "The errors of gameserver allocations", "1")
)

func init() {
runtime.Must(view.Register(&view.View{
Name: "gameserver_allocations_duration_seconds",
Measure: gameServerAllocationsLatency,
Description: "The distribution of gameserver allocation requests latencies.",
Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
TagKeys: []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
}))

stateViews := []*view.View{
{
Name: "gameserver_allocations_duration_seconds",
Measure: gameServerAllocationsLatency,
Description: "The distribution of gameserver allocation requests latencies.",
Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
TagKeys: []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
},
{
Name: "gameserver_allocations_retry_total",
Measure: gameServerAllocationsRetryTotal,
Description: "The count of gameserver allocation retry until it succeeds",
Aggregation: view.Distribution(1, 2, 3, 4, 5),
TagKeys: []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
},
}

for _, v := range stateViews {
if err := view.Register(v); err != nil {
logger.WithError(err).Error("could not register view")
}
}
}

// default set of tags for latency metric
Expand Down Expand Up @@ -123,3 +142,9 @@ func (r *metrics) setResponse(o k8sruntime.Object) {
func (r *metrics) record() {
stats.Record(r.ctx, gameServerAllocationsLatency.M(time.Since(r.start).Seconds()))
}

// record the current allocation retry rate.
func (r *metrics) recordAllocationRetrySuccess(ctx context.Context, retryCount int) {
mt.RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyStatus, "Success")},
gameServerAllocationsRetryTotal.M(int64(retryCount)))
}
54 changes: 39 additions & 15 deletions pkg/gameserversets/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,24 @@ import (
"sync"
"time"

"agones.dev/agones/pkg/apis"
"agones.dev/agones/pkg/apis/agones"
agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
"agones.dev/agones/pkg/client/clientset/versioned"
getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
"agones.dev/agones/pkg/client/informers/externalversions"
listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
"agones.dev/agones/pkg/gameservers"
"agones.dev/agones/pkg/util/crd"
"agones.dev/agones/pkg/util/logfields"
"agones.dev/agones/pkg/util/runtime"
"agones.dev/agones/pkg/util/webhooks"
"agones.dev/agones/pkg/util/workerqueue"
"github.com/google/go-cmp/cmp"
"github.com/heptiolabs/healthcheck"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"go.opencensus.io/tag"
admissionv1 "k8s.io/api/admission/v1"
corev1 "k8s.io/api/core/v1"
extclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
Expand All @@ -36,20 +50,6 @@ import (
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"

"agones.dev/agones/pkg/apis"
"agones.dev/agones/pkg/apis/agones"
agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
"agones.dev/agones/pkg/client/clientset/versioned"
getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
"agones.dev/agones/pkg/client/informers/externalversions"
listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
"agones.dev/agones/pkg/gameservers"
"agones.dev/agones/pkg/util/crd"
"agones.dev/agones/pkg/util/logfields"
"agones.dev/agones/pkg/util/runtime"
"agones.dev/agones/pkg/util/webhooks"
"agones.dev/agones/pkg/util/workerqueue"
)

var (
Expand Down Expand Up @@ -506,8 +506,18 @@ func shouldDeleteErroredGameServer(gs *agonesv1.GameServer) bool {
}

// addMoreGameServers adds diff more GameServers to the set
func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) error {
func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) (err error) {
loggerForGameServerSet(c.baseLogger, gsSet).WithField("count", count).Debug("Adding more gameservers")
latency := c.newMetrics(ctx)
latency.setRequest(count)

defer func() {
if err != nil {
latency.setError("error")
}
latency.record()

}()

return parallelize(newGameServersChannel(count, gsSet), maxCreationParalellism, func(gs *agonesv1.GameServer) error {
gs, err := c.gameServerGetter.GameServers(gs.Namespace).Create(ctx, gs, metav1.CreateOptions{})
Expand Down Expand Up @@ -751,3 +761,17 @@ func aggregateLists(aggListStatus map[string]agonesv1.AggregatedListStatus,

return aggListStatus
}

// newMetrics creates a new gss latency recorder.
func (c *Controller) newMetrics(ctx context.Context) *metrics {
ctx, err := tag.New(ctx, latencyTags...)
if err != nil {
c.baseLogger.WithError(err).Warn("failed to tag latency recorder.")
}
return &metrics{
ctx: ctx,
gameServerLister: c.gameServerLister,
logger: c.baseLogger,
start: time.Now(),
}
}
99 changes: 99 additions & 0 deletions pkg/gameserversets/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2024 Google LLC All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gameserversets

import (
"context"
"fmt"
"time"

listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
mt "agones.dev/agones/pkg/metrics"
"agones.dev/agones/pkg/util/runtime"
"github.com/sirupsen/logrus"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)

var (
logger = runtime.NewLoggerWithSource("metrics")

keyName = mt.MustTagKey("name")
keyNamespace = mt.MustTagKey("namespace")
keyFleetName = mt.MustTagKey("fleet_name")
keyType = mt.MustTagKey("type")

gameServerCreationDuration = stats.Float64("gameserver_creation/duration", "The duration of gameserver creation", "s")
)

func init() {

stateViews := []*view.View{
{
Name: "gameserver_creation_duration",
Measure: gameServerCreationDuration,
Description: "The time gameserver takes to be created in seconds",
Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
TagKeys: []tag.Key{keyName, keyType, keyFleetName, keyNamespace},
},
}

// register all our state views to OpenCensus
for _, v := range stateViews {
if err := view.Register(v); err != nil {
logger.WithError(err).Error("could not register view")
}
}

}

// default set of tags for latency metric
var latencyTags = []tag.Mutator{
tag.Insert(keyName, "none"),
tag.Insert(keyFleetName, "none"),
tag.Insert(keyType, "none"),
}

type metrics struct {
ctx context.Context
gameServerLister listerv1.GameServerLister
logger *logrus.Entry
start time.Time
}

// record the current current gameserver creation latency
func (r *metrics) record() {
stats.Record(r.ctx, gameServerCreationDuration.M(time.Since(r.start).Seconds()))
}

// mutate the current set of metric tags
func (r *metrics) mutate(m ...tag.Mutator) {
var err error
r.ctx, err = tag.New(r.ctx, m...)
if err != nil {
r.logger.WithError(err).Warn("failed to mutate request context.")
}
}

// setError set the latency status tag as error.
func (r *metrics) setError(errorType string) {
r.mutate(tag.Update(keyType, errorType))
}

// setRequest set request metric tags.
func (r *metrics) setRequest(count int) {
r.mutate(tag.Update(keyName, fmt.Sprint(count)))
}
46 changes: 23 additions & 23 deletions pkg/metrics/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,9 @@ func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) {
// recording buffer policy
if fas.Spec.Policy.Buffer != nil {
// recording limits
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")},
fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MaxReplicas)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")},
fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MinReplicas)))

// recording size
Expand All @@ -203,13 +203,13 @@ func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) {
sizeString := fas.Spec.Policy.Buffer.BufferSize.StrVal
if sizeString != "" {
if size, err := strconv.Atoi(sizeString[:len(sizeString)-1]); err == nil {
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")},
fasBufferSizeStats.M(int64(size)))
}
}
} else {
// as count
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")},
fasBufferSizeStats.M(int64(fas.Spec.Policy.Buffer.BufferSize.IntVal)))
}
}
Expand Down Expand Up @@ -312,15 +312,15 @@ func (c *Controller) recordFleetReplicas(fleetName, fleetNamespace string, total

ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))

recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")},
fleetsReplicasCountStats.M(int64(total)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")},
fleetsReplicasCountStats.M(int64(allocated)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")},
fleetsReplicasCountStats.M(int64(ready)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")},
fleetsReplicasCountStats.M(int64(desired)))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")},
fleetsReplicasCountStats.M(int64(reserved)))
}

Expand All @@ -330,13 +330,13 @@ func (c *Controller) recordCounters(fleetName, fleetNamespace string, counters m
ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))

for counter, counterStatus := range counters {
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)},
fleetCountersStats.M(counterStatus.AllocatedCount))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)},
fleetCountersStats.M(counterStatus.AllocatedCapacity))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyCounter, counter)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyCounter, counter)},
fleetCountersStats.M(counterStatus.Count))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)},
fleetCountersStats.M(counterStatus.Capacity))
}
}
Expand All @@ -347,13 +347,13 @@ func (c *Controller) recordLists(fleetName, fleetNamespace string, lists map[str
ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))

for list, listStatus := range lists {
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)},
fleetListsStats.M(listStatus.AllocatedCount))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)},
fleetListsStats.M(listStatus.AllocatedCapacity))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)},
fleetListsStats.M(listStatus.Count))
recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)},
RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)},
fleetListsStats.M(listStatus.Capacity))
}
}
Expand Down Expand Up @@ -386,27 +386,27 @@ func (c *Controller) recordGameServerStatusChanges(old, next interface{}) {
oldGs.Status.Players != nil {

if newGs.Status.Players.Count != oldGs.Status.Players.Count {
recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerConnectedTotal.M(newGs.Status.Players.Count))
}

if newGs.Status.Players.Capacity-newGs.Status.Players.Count != oldGs.Status.Players.Capacity-oldGs.Status.Players.Count {
recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerCapacityTotal.M(newGs.Status.Players.Capacity-newGs.Status.Players.Count))
}

}

if newGs.Status.State != oldGs.Status.State {
recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)),
RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)),
tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerTotalStats.M(1))

// Calculate the duration of the current state
duration, err := c.calcDuration(oldGs, newGs)
if err != nil {
c.logger.Warn(err.Error())
} else {
recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)),
RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)),
tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gsStateDurationSec.M(duration))
}
}
Expand Down Expand Up @@ -516,9 +516,9 @@ func (c *Controller) collectNodeCounts() {
}

nodes = removeSystemNodes(nodes)
recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")},
RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")},
nodesCountStats.M(int64(len(nodes)-len(gsPerNodes))))
recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")},
RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")},
nodesCountStats.M(int64(len(gsPerNodes))))

for _, node := range nodes {
Expand Down
Loading

0 comments on commit 1a5dc06

Please sign in to comment.