From 1a5dc0683c4d183f4f26f23d854380dcf8b96efd Mon Sep 17 00:00:00 2001
From: Vicente Ferrara <47219931+vicentefb@users.noreply.github.com>
Date: Wed, 21 Aug 2024 18:12:59 -0700
Subject: [PATCH] [Performance] - Added a new metric inside the allocator to track the success retry rate inside the retry loop (#3927)

* metrics changes
* metrics changes
* error rate metric
* error rate metric nit
* error rate metric nit nit
* error rate metric nit nit nit
* error rate metrics update increment
* error rate metric update
* error rate update
* latency update
* test
* test
* test
* added metric for success rate inside the retry loop of the allocator
* nit
* nit
* nit
* added metric for gameserver creation duration inside the controller
* updated metric to be a histogram and exported recordwithtags function
* lint
* added documentation with feature shortcode
* nit
* nit
* comment nit
* lint
* addressed nit comments and changed function name
* addressed comments and updated documentation
---
 pkg/gameserverallocations/allocator.go |  9 ++-
 pkg/gameserverallocations/metrics.go   | 41 ++++++++---
 pkg/gameserversets/controller.go       | 54 ++++++++----
 pkg/gameserversets/metrics.go          | 99 ++++++++++++++++++++++++++
 pkg/metrics/controller.go              | 46 ++++++------
 pkg/metrics/kubernetes_client.go       |  4 +-
 pkg/metrics/util.go                    |  3 +-
 site/content/en/docs/Guides/metrics.md | 44 ++++++++++++
 8 files changed, 250 insertions(+), 50 deletions(-)
 create mode 100644 pkg/gameserversets/metrics.go

diff --git a/pkg/gameserverallocations/allocator.go b/pkg/gameserverallocations/allocator.go
index c8208e3063..08f17f175a 100644
--- a/pkg/gameserverallocations/allocator.go
+++ b/pkg/gameserverallocations/allocator.go
@@ -250,13 +250,20 @@ func (c *Allocator) loggerForGameServerAllocation(gsa *allocationv1.GameServerAl
 }
 
 // allocateFromLocalCluster allocates gameservers from the local cluster.
+// It also records the number of times the allocation was retried before it succeeded.
 func (c *Allocator) allocateFromLocalCluster(ctx context.Context, gsa *allocationv1.GameServerAllocation) (*allocationv1.GameServerAllocation, error) {
 	var gs *agonesv1.GameServer
+	retry := c.newMetrics(ctx)
+	retryCount := 0
 	err := Retry(allocationRetry, func() error {
 		var err error
 		gs, err = c.allocate(ctx, gsa)
+		retryCount++
+
 		if err != nil {
-			c.loggerForGameServerAllocation(gsa).WithError(err).Warn("failed to allocate. Retrying...")
+			c.loggerForGameServerAllocation(gsa).WithError(err).Warn("Failed to allocate. Retrying...")
+		} else {
+			retry.recordAllocationRetrySuccess(ctx, retryCount)
 		}
 		return err
 	})
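The hunk above counts every attempt made inside the allocation retry loop and records the total only once the allocation finally succeeds. A minimal, self-contained sketch of that pattern is shown below; `retryWithCount` is a hypothetical stand-in for the Agones `Retry` helper, used purely for illustration:

```go
package main

import (
	"errors"
	"fmt"
)

// retryWithCount is a hypothetical, simplified stand-in for the allocator's
// Retry helper: it calls fn until it succeeds (or attempts run out) and
// reports how many attempts were made, mirroring retryCount in the hunk above.
func retryWithCount(maxAttempts int, fn func() error) (attempts int, err error) {
	for attempts = 1; attempts <= maxAttempts; attempts++ {
		if err = fn(); err == nil {
			return attempts, nil
		}
	}
	return maxAttempts, err
}

func main() {
	calls := 0
	attempts, err := retryWithCount(5, func() error {
		calls++
		if calls < 3 {
			return errors.New("no gameserver ready yet")
		}
		return nil
	})
	if err == nil {
		// This is the point where the patch records the histogram sample via
		// retry.recordAllocationRetrySuccess(ctx, retryCount).
		fmt.Printf("allocation succeeded after %d attempts\n", attempts)
	}
}
```

Running the sketch prints `allocation succeeded after 3 attempts`, which is the kind of value the patch feeds into the `gameserver_allocations_retry_total` histogram.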
diff --git a/pkg/gameserverallocations/metrics.go b/pkg/gameserverallocations/metrics.go
index ecfc889061..64fb4d50ec 100644
--- a/pkg/gameserverallocations/metrics.go
+++ b/pkg/gameserverallocations/metrics.go
@@ -32,23 +32,42 @@ import (
 )
 
 var (
+	logger = runtime.NewLoggerWithSource("metrics")
+
 	keyFleetName          = mt.MustTagKey("fleet_name")
 	keyClusterName        = mt.MustTagKey("cluster_name")
 	keyMultiCluster       = mt.MustTagKey("is_multicluster")
 	keyStatus             = mt.MustTagKey("status")
 	keySchedulingStrategy = mt.MustTagKey("scheduling_strategy")
-	gameServerAllocationsLatency = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s")
+	gameServerAllocationsLatency    = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s")
+	gameServerAllocationsRetryTotal = stats.Int64("gameserver_allocations/errors", "The number of allocation retries before success", "1")
 )
 
 func init() {
-	runtime.Must(view.Register(&view.View{
-		Name:        "gameserver_allocations_duration_seconds",
-		Measure:     gameServerAllocationsLatency,
-		Description: "The distribution of gameserver allocation requests latencies.",
-		Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
-		TagKeys:     []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
-	}))
+
+	stateViews := []*view.View{
+		{
+			Name:        "gameserver_allocations_duration_seconds",
+			Measure:     gameServerAllocationsLatency,
+			Description: "The distribution of gameserver allocation requests latencies.",
+			Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
+			TagKeys:     []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
+		},
+		{
+			Name:        "gameserver_allocations_retry_total",
+			Measure:     gameServerAllocationsRetryTotal,
+			Description: "The number of retries before a successful gameserver allocation",
+			Aggregation: view.Distribution(1, 2, 3, 4, 5),
+			TagKeys:     []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
+		},
+	}
+
+	for _, v := range stateViews {
+		if err := view.Register(v); err != nil {
+			logger.WithError(err).Error("could not register view")
+		}
+	}
 }
 
 // default set of tags for latency metric
@@ -123,3 +142,9 @@ func (r *metrics) setResponse(o k8sruntime.Object) {
 func (r *metrics) record() {
 	stats.Record(r.ctx, gameServerAllocationsLatency.M(time.Since(r.start).Seconds()))
 }
+
+// recordAllocationRetrySuccess records how many attempts an allocation took before it succeeded.
+func (r *metrics) recordAllocationRetrySuccess(ctx context.Context, retryCount int) { + mt.RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyStatus, "Success")}, + gameServerAllocationsRetryTotal.M(int64(retryCount))) +} diff --git a/pkg/gameserversets/controller.go b/pkg/gameserversets/controller.go index 20a0a5f71f..dc32defb77 100644 --- a/pkg/gameserversets/controller.go +++ b/pkg/gameserversets/controller.go @@ -20,10 +20,24 @@ import ( "sync" "time" + "agones.dev/agones/pkg/apis" + "agones.dev/agones/pkg/apis/agones" + agonesv1 "agones.dev/agones/pkg/apis/agones/v1" + "agones.dev/agones/pkg/client/clientset/versioned" + getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" + "agones.dev/agones/pkg/client/informers/externalversions" + listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" + "agones.dev/agones/pkg/gameservers" + "agones.dev/agones/pkg/util/crd" + "agones.dev/agones/pkg/util/logfields" + "agones.dev/agones/pkg/util/runtime" + "agones.dev/agones/pkg/util/webhooks" + "agones.dev/agones/pkg/util/workerqueue" "github.com/google/go-cmp/cmp" "github.com/heptiolabs/healthcheck" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "go.opencensus.io/tag" admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" extclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" @@ -36,20 +50,6 @@ import ( typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" - - "agones.dev/agones/pkg/apis" - "agones.dev/agones/pkg/apis/agones" - agonesv1 "agones.dev/agones/pkg/apis/agones/v1" - "agones.dev/agones/pkg/client/clientset/versioned" - getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" - "agones.dev/agones/pkg/client/informers/externalversions" - listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" - "agones.dev/agones/pkg/gameservers" - "agones.dev/agones/pkg/util/crd" - "agones.dev/agones/pkg/util/logfields" - "agones.dev/agones/pkg/util/runtime" - "agones.dev/agones/pkg/util/webhooks" - "agones.dev/agones/pkg/util/workerqueue" ) var ( @@ -506,8 +506,18 @@ func shouldDeleteErroredGameServer(gs *agonesv1.GameServer) bool { } // addMoreGameServers adds diff more GameServers to the set -func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) error { +func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) (err error) { loggerForGameServerSet(c.baseLogger, gsSet).WithField("count", count).Debug("Adding more gameservers") + latency := c.newMetrics(ctx) + latency.setRequest(count) + + defer func() { + if err != nil { + latency.setError("error") + } + latency.record() + + }() return parallelize(newGameServersChannel(count, gsSet), maxCreationParalellism, func(gs *agonesv1.GameServer) error { gs, err := c.gameServerGetter.GameServers(gs.Namespace).Create(ctx, gs, metav1.CreateOptions{}) @@ -751,3 +761,17 @@ func aggregateLists(aggListStatus map[string]agonesv1.AggregatedListStatus, return aggListStatus } + +// newMetrics creates a new gss latency recorder. +func (c *Controller) newMetrics(ctx context.Context) *metrics { + ctx, err := tag.New(ctx, latencyTags...) 
+	if err != nil {
+		c.baseLogger.WithError(err).Warn("failed to tag latency recorder.")
+	}
+	return &metrics{
+		ctx:              ctx,
+		gameServerLister: c.gameServerLister,
+		logger:           c.baseLogger,
+		start:            time.Now(),
+	}
+}
diff --git a/pkg/gameserversets/metrics.go b/pkg/gameserversets/metrics.go
new file mode 100644
index 0000000000..ae741342a7
--- /dev/null
+++ b/pkg/gameserversets/metrics.go
@@ -0,0 +1,99 @@
+// Copyright 2024 Google LLC All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gameserversets
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
+	mt "agones.dev/agones/pkg/metrics"
+	"agones.dev/agones/pkg/util/runtime"
+	"github.com/sirupsen/logrus"
+	"go.opencensus.io/stats"
+	"go.opencensus.io/stats/view"
+	"go.opencensus.io/tag"
+)
+
+var (
+	logger = runtime.NewLoggerWithSource("metrics")
+
+	keyName      = mt.MustTagKey("name")
+	keyNamespace = mt.MustTagKey("namespace")
+	keyFleetName = mt.MustTagKey("fleet_name")
+	keyType      = mt.MustTagKey("type")
+
+	gameServerCreationDuration = stats.Float64("gameserver_creation/duration", "The duration of gameserver creation", "s")
+)
+
+func init() {
+
+	stateViews := []*view.View{
+		{
+			Name:        "gameserver_creation_duration",
+			Measure:     gameServerCreationDuration,
+			Description: "The time a gameserver takes to be created, in seconds",
+			Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
+			TagKeys:     []tag.Key{keyName, keyType, keyFleetName, keyNamespace},
+		},
+	}
+
+	// register all our state views to OpenCensus
+	for _, v := range stateViews {
+		if err := view.Register(v); err != nil {
+			logger.WithError(err).Error("could not register view")
+		}
+	}
+
+}
+
+// default set of tags for latency metric
+var latencyTags = []tag.Mutator{
+	tag.Insert(keyName, "none"),
+	tag.Insert(keyFleetName, "none"),
+	tag.Insert(keyType, "none"),
+}
+
+type metrics struct {
+	ctx              context.Context
+	gameServerLister listerv1.GameServerLister
+	logger           *logrus.Entry
+	start            time.Time
+}
+
+// record the current gameserver creation latency
+func (r *metrics) record() {
+	stats.Record(r.ctx, gameServerCreationDuration.M(time.Since(r.start).Seconds()))
+}
+
+// mutate the current set of metric tags
+func (r *metrics) mutate(m ...tag.Mutator) {
+	var err error
+	r.ctx, err = tag.New(r.ctx, m...)
+	if err != nil {
+		r.logger.WithError(err).Warn("failed to mutate request context.")
+	}
+}
+
+// setError sets the latency status tag as error.
+func (r *metrics) setError(errorType string) {
+	r.mutate(tag.Update(keyType, errorType))
+}
+
+// setRequest sets the request metric tags.
+func (r *metrics) setRequest(count int) { + r.mutate(tag.Update(keyName, fmt.Sprint(count))) +} diff --git a/pkg/metrics/controller.go b/pkg/metrics/controller.go index 11f5e718ba..3232c76860 100644 --- a/pkg/metrics/controller.go +++ b/pkg/metrics/controller.go @@ -192,9 +192,9 @@ func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) { // recording buffer policy if fas.Spec.Policy.Buffer != nil { // recording limits - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")}, fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MaxReplicas))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")}, fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MinReplicas))) // recording size @@ -203,13 +203,13 @@ func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) { sizeString := fas.Spec.Policy.Buffer.BufferSize.StrVal if sizeString != "" { if size, err := strconv.Atoi(sizeString[:len(sizeString)-1]); err == nil { - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")}, fasBufferSizeStats.M(int64(size))) } } } else { // as count - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")}, fasBufferSizeStats.M(int64(fas.Spec.Policy.Buffer.BufferSize.IntVal))) } } @@ -312,15 +312,15 @@ func (c *Controller) recordFleetReplicas(fleetName, fleetNamespace string, total ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")}, fleetsReplicasCountStats.M(int64(total))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")}, fleetsReplicasCountStats.M(int64(allocated))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")}, fleetsReplicasCountStats.M(int64(ready))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")}, fleetsReplicasCountStats.M(int64(desired))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")}, fleetsReplicasCountStats.M(int64(reserved))) } @@ -330,13 +330,13 @@ func (c *Controller) recordCounters(fleetName, fleetNamespace string, counters m ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) for counter, counterStatus := range counters { - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)}, fleetCountersStats.M(counterStatus.AllocatedCount)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)}, fleetCountersStats.M(counterStatus.AllocatedCapacity)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), 
tag.Upsert(keyCounter, counter)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyCounter, counter)}, fleetCountersStats.M(counterStatus.Count)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)}, fleetCountersStats.M(counterStatus.Capacity)) } } @@ -347,13 +347,13 @@ func (c *Controller) recordLists(fleetName, fleetNamespace string, lists map[str ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) for list, listStatus := range lists { - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)}, fleetListsStats.M(listStatus.AllocatedCount)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)}, fleetListsStats.M(listStatus.AllocatedCapacity)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)}, fleetListsStats.M(listStatus.Count)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)}, fleetListsStats.M(listStatus.Capacity)) } } @@ -386,19 +386,19 @@ func (c *Controller) recordGameServerStatusChanges(old, next interface{}) { oldGs.Status.Players != nil { if newGs.Status.Players.Count != oldGs.Status.Players.Count { - recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerConnectedTotal.M(newGs.Status.Players.Count)) } if newGs.Status.Players.Capacity-newGs.Status.Players.Count != oldGs.Status.Players.Capacity-oldGs.Status.Players.Count { - recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerCapacityTotal.M(newGs.Status.Players.Capacity-newGs.Status.Players.Count)) } } if newGs.Status.State != oldGs.Status.State { - recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)), + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)), tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerTotalStats.M(1)) // Calculate the duration of the current state @@ -406,7 +406,7 @@ func (c *Controller) recordGameServerStatusChanges(old, next interface{}) { if err != nil { c.logger.Warn(err.Error()) } else { - recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)), + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)), tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, 
newGs.GetNamespace())}, gsStateDurationSec.M(duration)) } } @@ -516,9 +516,9 @@ func (c *Controller) collectNodeCounts() { } nodes = removeSystemNodes(nodes) - recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")}, + RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")}, nodesCountStats.M(int64(len(nodes)-len(gsPerNodes)))) - recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")}, + RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")}, nodesCountStats.M(int64(len(gsPerNodes)))) for _, node := range nodes { diff --git a/pkg/metrics/kubernetes_client.go b/pkg/metrics/kubernetes_client.go index a003306b72..bc0f9be690 100644 --- a/pkg/metrics/kubernetes_client.go +++ b/pkg/metrics/kubernetes_client.go @@ -201,13 +201,13 @@ func (c *clientGoMetricAdapter) Register() { } func (clientGoMetricAdapter) Increment(ctx context.Context, code string, method string, host string) { - recordWithTags(ctx, []tag.Mutator{tag.Insert(keyStatusCode, code), + RecordWithTags(ctx, []tag.Mutator{tag.Insert(keyStatusCode, code), tag.Insert(keyVerb, method)}, httpRequestTotalStats.M(int64(1))) } func (clientGoMetricAdapter) Observe(ctx context.Context, verb string, u url.URL, latency time.Duration) { // url is without {namespace} and {name}, so cardinality of resulting metrics is low. - recordWithTags(ctx, []tag.Mutator{tag.Insert(keyVerb, verb), + RecordWithTags(ctx, []tag.Mutator{tag.Insert(keyVerb, verb), tag.Insert(keyEndpoint, u.Path)}, httpRequestLatencyStats.M(latency.Seconds())) } diff --git a/pkg/metrics/util.go b/pkg/metrics/util.go index 43fa524c24..bc637e384d 100644 --- a/pkg/metrics/util.go +++ b/pkg/metrics/util.go @@ -42,7 +42,8 @@ var ( keyList = MustTagKey("list") ) -func recordWithTags(ctx context.Context, mutators []tag.Mutator, ms ...stats.Measurement) { +// RecordWithTags records a metric value and tags +func RecordWithTags(ctx context.Context, mutators []tag.Mutator, ms ...stats.Measurement) { if err := stats.RecordWithTags(ctx, mutators, ms...); err != nil { logger.WithError(err).Warn("error while recoding stats") } diff --git a/site/content/en/docs/Guides/metrics.md b/site/content/en/docs/Guides/metrics.md index 00ceebede5..e472c168c0 100644 --- a/site/content/en/docs/Guides/metrics.md +++ b/site/content/en/docs/Guides/metrics.md @@ -43,6 +43,7 @@ Follow the [Google Cloud Monitoring installation steps](#google-cloud-monitoring ## Metrics available +{{% feature expiryVersion="1.43.0" %}} | Name | Description | Type | |-------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------| | agones_gameservers_count | The number of gameservers per fleet and status | gauge | @@ -77,6 +78,49 @@ Follow the [Google Cloud Monitoring installation steps](#google-cloud-monitoring | agones_k8s_client_workqueue_longest_running_processor | How long the longest running workqueue processor has been running in microseconds | gauge | | agones_k8s_client_workqueue_unfinished_work_seconds | How long unfinished work has been sitting in the workqueue in seconds | gauge | +{{% /feature %}} + +{{% feature publishVersion="1.43.0" %}} + +| Name | Description | Type | 
+|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|-----------|
+| agones_gameservers_count | The number of gameservers per fleet and status | gauge |
+| agones_gameserver_allocations_duration_seconds | The distribution of gameserver allocation requests latencies | histogram |
+| agones_gameserver_allocations_retry_total | The number of retries before a successful gameserver allocation | histogram |
+| agones_gameserver_creation_duration | The time a gameserver takes to be created, in seconds | histogram |
+| agones_gameservers_total | The total of gameservers per fleet and status | counter |
+| agones_gameserver_player_connected_total | The total number of players connected to gameservers (Only available when [player tracking]({{< relref "player-tracking.md" >}}) is enabled) | gauge |
+| agones_gameserver_player_capacity_total | The available capacity for players on gameservers (Only available when [player tracking]({{< relref "player-tracking.md" >}}) is enabled) | gauge |
+| agones_fleets_replicas_count | The number of replicas per fleet (total, desired, ready, reserved, allocated) | gauge |
+| agones_fleet_counters | Aggregate Metrics for Counters within a Fleet, including total capacity and count values (Only available when [Counters and Lists]({{< relref "counters-and-lists.md" >}}) are enabled) | gauge |
+| agones_fleet_lists | Aggregate Metrics for Lists within a Fleet, including total capacity and List lengths (Only available when [Counters and Lists]({{< relref "counters-and-lists.md" >}}) are enabled) | gauge |
+| agones_fleet_autoscalers_able_to_scale | The fleet autoscaler can access the fleet to scale | gauge |
+| agones_fleet_autoscalers_buffer_limits | The limits of buffer based fleet autoscalers (min, max) | gauge |
+| agones_fleet_autoscalers_buffer_size | The buffer size of fleet autoscalers (count or percentage) | gauge |
+| agones_fleet_autoscalers_current_replicas_count | The current replicas count as seen by autoscalers | gauge |
+| agones_fleet_autoscalers_desired_replicas_count | The desired replicas count as seen by autoscalers | gauge |
+| agones_fleet_autoscalers_limited | The fleet autoscaler is outside the limits set by MinReplicas and MaxReplicas. | gauge |
+| agones_gameservers_node_count | The distribution of gameservers per node | histogram |
+| agones_nodes_count | The count of nodes empty and with gameservers | gauge |
+| agones_gameservers_state_duration | The distribution of gameserver state duration in seconds. Note: this metric could have some missing samples by design. Do not use the `_total` counter as the real value for state changes. | histogram |
+| agones_k8s_client_http_request_total | The total of HTTP requests to the Kubernetes API by status code | counter |
+| agones_k8s_client_http_request_duration_seconds | The distribution of HTTP requests latencies to the Kubernetes API by status code | histogram |
+| agones_k8s_client_cache_list_total | The total number of list operations for client-go caches | counter |
+| agones_k8s_client_cache_list_duration_seconds | Duration of a Kubernetes list API call in seconds | histogram |
+| agones_k8s_client_cache_list_items | Count of items in a list from the Kubernetes API | histogram |
+| agones_k8s_client_cache_watches_total | The total number of watch operations for client-go caches | counter |
+| agones_k8s_client_cache_last_resource_version | Last resource version from the Kubernetes API | gauge |
+| agones_k8s_client_workqueue_depth | Current depth of the work queue | gauge |
+| agones_k8s_client_workqueue_latency_seconds | How long an item stays in the work queue | histogram |
+| agones_k8s_client_workqueue_items_total | Total number of items added to the work queue | counter |
+| agones_k8s_client_workqueue_work_duration_seconds | How long processing an item from the work queue takes | histogram |
+| agones_k8s_client_workqueue_retries_total | Total number of items retried to the work queue | counter |
+| agones_k8s_client_workqueue_longest_running_processor | How long the longest running workqueue processor has been running in microseconds | gauge |
+| agones_k8s_client_workqueue_unfinished_work_seconds | How long unfinished work has been sitting in the workqueue in seconds | gauge |
+
+
+{{% /feature %}}
+
 ### Dropping Metric Labels
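To show how samples end up in the new `agones_gameserver_creation_duration` histogram documented above, here is a minimal, self-contained OpenCensus sketch of the defer-based recording pattern the patch adds to `addMoreGameServers`. The function and view names, tag values, and bucket boundaries are simplified for illustration and are not the exact Agones implementation:

```go
package main

import (
	"context"
	"errors"
	"log"
	"time"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

var (
	keyType          = tag.MustNewKey("type")
	creationDuration = stats.Float64("example/gameserver_creation/duration",
		"The duration of gameserver creation", "s")
)

// createGameServers mirrors the defer-based pattern added to addMoreGameServers:
// start a timer, switch the "type" tag to an error value if the call fails, and
// always record the elapsed time when the function returns.
func createGameServers(ctx context.Context, count int) (err error) {
	ctx, _ = tag.New(ctx, tag.Upsert(keyType, "none"))
	start := time.Now()
	defer func() {
		if err != nil {
			ctx, _ = tag.New(ctx, tag.Upsert(keyType, "error"))
		}
		stats.Record(ctx, creationDuration.M(time.Since(start).Seconds()))
	}()

	// Stand-in for the real parallelized creation loop.
	if count <= 0 {
		return errors.New("nothing to create")
	}
	time.Sleep(10 * time.Millisecond)
	return nil
}

func main() {
	if err := view.Register(&view.View{
		Name:        "gameserver_creation_duration",
		Measure:     creationDuration,
		Description: "The time a gameserver takes to be created, in seconds",
		Aggregation: view.Distribution(0, 0.01, 0.1, 1, 3),
		TagKeys:     []tag.Key{keyType},
	}); err != nil {
		log.Fatal(err)
	}
	if err := createGameServers(context.Background(), 3); err != nil {
		log.Println(err)
	}
}
```

The design point mirrored here is that the duration is recorded in a `defer`, so both successful and failed creation attempts contribute a sample, with failures distinguished by the `type` tag.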