From 1a5dc0683c4d183f4f26f23d854380dcf8b96efd Mon Sep 17 00:00:00 2001
From: Vicente Ferrara <47219931+vicentefb@users.noreply.github.com>
Date: Wed, 21 Aug 2024 18:12:59 -0700
Subject: [PATCH] [Performance] - Added a new metric inside the allocator to track the success retry rate inside the retry loop (#3927)

* metrics changes
* metrics changes
* error rate metric
* error rate metric nit
* error rate metric nit nit
* error rate metric nit nit nit
* error rate metrics update increment
* error rate metric update
* error rate update
* latency update
* test
* test
* test
* added metric for success rate inside the retry loop of the allocator
* nit
* nit
* nit
* added metric for gameserver creation duration inside the controller
* updated metric to be a histogram and exported recordwithtags function
* lint
* added documentation with feature shortcode
* nit
* nit
* comment nit
* lint
* addressed nit comments and changed function name
* addressed comments and updated documentation
---
 pkg/gameserverallocations/allocator.go |  9 ++-
 pkg/gameserverallocations/metrics.go   | 41 ++++++++---
 pkg/gameserversets/controller.go       | 54 ++++++++----
 pkg/gameserversets/metrics.go          | 99 ++++++++++++++++++++++++++
 pkg/metrics/controller.go              | 46 ++++++------
 pkg/metrics/kubernetes_client.go       |  4 +-
 pkg/metrics/util.go                    |  3 +-
 site/content/en/docs/Guides/metrics.md | 44 ++++++++++++
 8 files changed, 250 insertions(+), 50 deletions(-)
 create mode 100644 pkg/gameserversets/metrics.go

diff --git a/pkg/gameserverallocations/allocator.go b/pkg/gameserverallocations/allocator.go
index c8208e3063..08f17f175a 100644
--- a/pkg/gameserverallocations/allocator.go
+++ b/pkg/gameserverallocations/allocator.go
@@ -250,13 +250,20 @@ func (c *Allocator) loggerForGameServerAllocation(gsa *allocationv1.GameServerAl
 }
 
 // allocateFromLocalCluster allocates gameservers from the local cluster.
+// It also records the number of times the allocation was retried before it succeeded.
 func (c *Allocator) allocateFromLocalCluster(ctx context.Context, gsa *allocationv1.GameServerAllocation) (*allocationv1.GameServerAllocation, error) {
 	var gs *agonesv1.GameServer
+	retry := c.newMetrics(ctx)
+	retryCount := 0
 	err := Retry(allocationRetry, func() error {
 		var err error
 		gs, err = c.allocate(ctx, gsa)
+		retryCount++
+
 		if err != nil {
-			c.loggerForGameServerAllocation(gsa).WithError(err).Warn("failed to allocate. Retrying...")
+			c.loggerForGameServerAllocation(gsa).WithError(err).Warn("Failed to allocate. Retrying...")
+		} else {
+			retry.recordAllocationRetrySuccess(ctx, retryCount)
 		}
 		return err
 	})
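The hunk above counts every attempt made inside the allocation retry loop and records the total only once the allocation finally succeeds. A minimal, self-contained sketch of that pattern is shown below; `retryWithCount` is a hypothetical stand-in for the Agones `Retry` helper, used purely for illustration:

```go
package main

import (
	"errors"
	"fmt"
)

// retryWithCount is a hypothetical, simplified stand-in for the allocator's
// Retry helper: it calls fn until it succeeds (or attempts run out) and
// reports how many attempts were made, mirroring retryCount in the hunk above.
func retryWithCount(maxAttempts int, fn func() error) (attempts int, err error) {
	for attempts = 1; attempts <= maxAttempts; attempts++ {
		if err = fn(); err == nil {
			return attempts, nil
		}
	}
	return maxAttempts, err
}

func main() {
	calls := 0
	attempts, err := retryWithCount(5, func() error {
		calls++
		if calls < 3 {
			return errors.New("no gameserver ready yet")
		}
		return nil
	})
	if err == nil {
		// This is the point where the patch records the histogram sample via
		// retry.recordAllocationRetrySuccess(ctx, retryCount).
		fmt.Printf("allocation succeeded after %d attempts\n", attempts)
	}
}
```

Running the sketch prints `allocation succeeded after 3 attempts`, which is the kind of value the patch feeds into the `gameserver_allocations_retry_total` histogram.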
diff --git a/pkg/gameserverallocations/metrics.go b/pkg/gameserverallocations/metrics.go
index ecfc889061..64fb4d50ec 100644
--- a/pkg/gameserverallocations/metrics.go
+++ b/pkg/gameserverallocations/metrics.go
@@ -32,23 +32,42 @@ import (
 )
 
 var (
+	logger = runtime.NewLoggerWithSource("metrics")
+
 	keyFleetName          = mt.MustTagKey("fleet_name")
 	keyClusterName        = mt.MustTagKey("cluster_name")
 	keyMultiCluster       = mt.MustTagKey("is_multicluster")
 	keyStatus             = mt.MustTagKey("status")
 	keySchedulingStrategy = mt.MustTagKey("scheduling_strategy")
-	gameServerAllocationsLatency = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s")
+	gameServerAllocationsLatency    = stats.Float64("gameserver_allocations/latency", "The duration of gameserver allocations", "s")
+	gameServerAllocationsRetryTotal = stats.Int64("gameserver_allocations/errors", "The number of allocation retries before success", "1")
 )
 
 func init() {
-	runtime.Must(view.Register(&view.View{
-		Name:        "gameserver_allocations_duration_seconds",
-		Measure:     gameServerAllocationsLatency,
-		Description: "The distribution of gameserver allocation requests latencies.",
-		Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
-		TagKeys:     []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
-	}))
+
+	stateViews := []*view.View{
+		{
+			Name:        "gameserver_allocations_duration_seconds",
+			Measure:     gameServerAllocationsLatency,
+			Description: "The distribution of gameserver allocation requests latencies.",
+			Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
+			TagKeys:     []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
+		},
+		{
+			Name:        "gameserver_allocations_retry_total",
+			Measure:     gameServerAllocationsRetryTotal,
+			Description: "The number of retries before a successful gameserver allocation",
+			Aggregation: view.Distribution(1, 2, 3, 4, 5),
+			TagKeys:     []tag.Key{keyFleetName, keyClusterName, keyMultiCluster, keyStatus, keySchedulingStrategy},
+		},
+	}
+
+	for _, v := range stateViews {
+		if err := view.Register(v); err != nil {
+			logger.WithError(err).Error("could not register view")
+		}
+	}
 }
 
 // default set of tags for latency metric
@@ -123,3 +142,9 @@ func (r *metrics) setResponse(o k8sruntime.Object) {
 func (r *metrics) record() {
 	stats.Record(r.ctx, gameServerAllocationsLatency.M(time.Since(r.start).Seconds()))
 }
+
+// recordAllocationRetrySuccess records how many attempts an allocation took before it succeeded.
+func (r *metrics) recordAllocationRetrySuccess(ctx context.Context, retryCount int) { + mt.RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyStatus, "Success")}, + gameServerAllocationsRetryTotal.M(int64(retryCount))) +} diff --git a/pkg/gameserversets/controller.go b/pkg/gameserversets/controller.go index 20a0a5f71f..dc32defb77 100644 --- a/pkg/gameserversets/controller.go +++ b/pkg/gameserversets/controller.go @@ -20,10 +20,24 @@ import ( "sync" "time" + "agones.dev/agones/pkg/apis" + "agones.dev/agones/pkg/apis/agones" + agonesv1 "agones.dev/agones/pkg/apis/agones/v1" + "agones.dev/agones/pkg/client/clientset/versioned" + getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" + "agones.dev/agones/pkg/client/informers/externalversions" + listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" + "agones.dev/agones/pkg/gameservers" + "agones.dev/agones/pkg/util/crd" + "agones.dev/agones/pkg/util/logfields" + "agones.dev/agones/pkg/util/runtime" + "agones.dev/agones/pkg/util/webhooks" + "agones.dev/agones/pkg/util/workerqueue" "github.com/google/go-cmp/cmp" "github.com/heptiolabs/healthcheck" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "go.opencensus.io/tag" admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" extclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" @@ -36,20 +50,6 @@ import ( typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" - - "agones.dev/agones/pkg/apis" - "agones.dev/agones/pkg/apis/agones" - agonesv1 "agones.dev/agones/pkg/apis/agones/v1" - "agones.dev/agones/pkg/client/clientset/versioned" - getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" - "agones.dev/agones/pkg/client/informers/externalversions" - listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" - "agones.dev/agones/pkg/gameservers" - "agones.dev/agones/pkg/util/crd" - "agones.dev/agones/pkg/util/logfields" - "agones.dev/agones/pkg/util/runtime" - "agones.dev/agones/pkg/util/webhooks" - "agones.dev/agones/pkg/util/workerqueue" ) var ( @@ -506,8 +506,18 @@ func shouldDeleteErroredGameServer(gs *agonesv1.GameServer) bool { } // addMoreGameServers adds diff more GameServers to the set -func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) error { +func (c *Controller) addMoreGameServers(ctx context.Context, gsSet *agonesv1.GameServerSet, count int) (err error) { loggerForGameServerSet(c.baseLogger, gsSet).WithField("count", count).Debug("Adding more gameservers") + latency := c.newMetrics(ctx) + latency.setRequest(count) + + defer func() { + if err != nil { + latency.setError("error") + } + latency.record() + + }() return parallelize(newGameServersChannel(count, gsSet), maxCreationParalellism, func(gs *agonesv1.GameServer) error { gs, err := c.gameServerGetter.GameServers(gs.Namespace).Create(ctx, gs, metav1.CreateOptions{}) @@ -751,3 +761,17 @@ func aggregateLists(aggListStatus map[string]agonesv1.AggregatedListStatus, return aggListStatus } + +// newMetrics creates a new gss latency recorder. +func (c *Controller) newMetrics(ctx context.Context) *metrics { + ctx, err := tag.New(ctx, latencyTags...) 
+	if err != nil {
+		c.baseLogger.WithError(err).Warn("failed to tag latency recorder.")
+	}
+	return &metrics{
+		ctx:              ctx,
+		gameServerLister: c.gameServerLister,
+		logger:           c.baseLogger,
+		start:            time.Now(),
+	}
+}
diff --git a/pkg/gameserversets/metrics.go b/pkg/gameserversets/metrics.go
new file mode 100644
index 0000000000..ae741342a7
--- /dev/null
+++ b/pkg/gameserversets/metrics.go
@@ -0,0 +1,99 @@
+// Copyright 2024 Google LLC All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gameserversets
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
+	mt "agones.dev/agones/pkg/metrics"
+	"agones.dev/agones/pkg/util/runtime"
+	"github.com/sirupsen/logrus"
+	"go.opencensus.io/stats"
+	"go.opencensus.io/stats/view"
+	"go.opencensus.io/tag"
+)
+
+var (
+	logger = runtime.NewLoggerWithSource("metrics")
+
+	keyName      = mt.MustTagKey("name")
+	keyNamespace = mt.MustTagKey("namespace")
+	keyFleetName = mt.MustTagKey("fleet_name")
+	keyType      = mt.MustTagKey("type")
+
+	gameServerCreationDuration = stats.Float64("gameserver_creation/duration", "The duration of gameserver creation", "s")
+)
+
+func init() {
+
+	stateViews := []*view.View{
+		{
+			Name:        "gameserver_creation_duration",
+			Measure:     gameServerCreationDuration,
+			Description: "The time a gameserver takes to be created, in seconds",
+			Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3),
+			TagKeys:     []tag.Key{keyName, keyType, keyFleetName, keyNamespace},
+		},
+	}
+
+	// register all our state views to OpenCensus
+	for _, v := range stateViews {
+		if err := view.Register(v); err != nil {
+			logger.WithError(err).Error("could not register view")
+		}
+	}
+
+}
+
+// default set of tags for latency metric
+var latencyTags = []tag.Mutator{
+	tag.Insert(keyName, "none"),
+	tag.Insert(keyFleetName, "none"),
+	tag.Insert(keyType, "none"),
+}
+
+type metrics struct {
+	ctx              context.Context
+	gameServerLister listerv1.GameServerLister
+	logger           *logrus.Entry
+	start            time.Time
+}
+
+// record the current gameserver creation latency
+func (r *metrics) record() {
+	stats.Record(r.ctx, gameServerCreationDuration.M(time.Since(r.start).Seconds()))
+}
+
+// mutate the current set of metric tags
+func (r *metrics) mutate(m ...tag.Mutator) {
+	var err error
+	r.ctx, err = tag.New(r.ctx, m...)
+	if err != nil {
+		r.logger.WithError(err).Warn("failed to mutate request context.")
+	}
+}
+
+// setError sets the latency status tag as error.
+func (r *metrics) setError(errorType string) {
+	r.mutate(tag.Update(keyType, errorType))
+}
+
+// setRequest sets the request metric tags.
+func (r *metrics) setRequest(count int) { + r.mutate(tag.Update(keyName, fmt.Sprint(count))) +} diff --git a/pkg/metrics/controller.go b/pkg/metrics/controller.go index 11f5e718ba..3232c76860 100644 --- a/pkg/metrics/controller.go +++ b/pkg/metrics/controller.go @@ -192,9 +192,9 @@ func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) { // recording buffer policy if fas.Spec.Policy.Buffer != nil { // recording limits - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")}, fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MaxReplicas))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")}, fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MinReplicas))) // recording size @@ -203,13 +203,13 @@ func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) { sizeString := fas.Spec.Policy.Buffer.BufferSize.StrVal if sizeString != "" { if size, err := strconv.Atoi(sizeString[:len(sizeString)-1]); err == nil { - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")}, fasBufferSizeStats.M(int64(size))) } } } else { // as count - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")}, fasBufferSizeStats.M(int64(fas.Spec.Policy.Buffer.BufferSize.IntVal))) } } @@ -312,15 +312,15 @@ func (c *Controller) recordFleetReplicas(fleetName, fleetNamespace string, total ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")}, fleetsReplicasCountStats.M(int64(total))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")}, fleetsReplicasCountStats.M(int64(allocated))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")}, fleetsReplicasCountStats.M(int64(ready))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")}, fleetsReplicasCountStats.M(int64(desired))) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")}, fleetsReplicasCountStats.M(int64(reserved))) } @@ -330,13 +330,13 @@ func (c *Controller) recordCounters(fleetName, fleetNamespace string, counters m ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) for counter, counterStatus := range counters { - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)}, fleetCountersStats.M(counterStatus.AllocatedCount)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)}, fleetCountersStats.M(counterStatus.AllocatedCapacity)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), 
tag.Upsert(keyCounter, counter)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyCounter, counter)}, fleetCountersStats.M(counterStatus.Count)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)}, fleetCountersStats.M(counterStatus.Capacity)) } } @@ -347,13 +347,13 @@ func (c *Controller) recordLists(fleetName, fleetNamespace string, lists map[str ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) for list, listStatus := range lists { - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)}, fleetListsStats.M(listStatus.AllocatedCount)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)}, fleetListsStats.M(listStatus.AllocatedCapacity)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)}, fleetListsStats.M(listStatus.Count)) - recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)}, + RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)}, fleetListsStats.M(listStatus.Capacity)) } } @@ -386,19 +386,19 @@ func (c *Controller) recordGameServerStatusChanges(old, next interface{}) { oldGs.Status.Players != nil { if newGs.Status.Players.Count != oldGs.Status.Players.Count { - recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerConnectedTotal.M(newGs.Status.Players.Count)) } if newGs.Status.Players.Capacity-newGs.Status.Players.Count != oldGs.Status.Players.Capacity-oldGs.Status.Players.Count { - recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerCapacityTotal.M(newGs.Status.Players.Capacity-newGs.Status.Players.Count)) } } if newGs.Status.State != oldGs.Status.State { - recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)), + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)), tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerTotalStats.M(1)) // Calculate the duration of the current state @@ -406,7 +406,7 @@ func (c *Controller) recordGameServerStatusChanges(old, next interface{}) { if err != nil { c.logger.Warn(err.Error()) } else { - recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)), + RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)), tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, 
newGs.GetNamespace())}, gsStateDurationSec.M(duration)) } } @@ -516,9 +516,9 @@ func (c *Controller) collectNodeCounts() { } nodes = removeSystemNodes(nodes) - recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")}, + RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")}, nodesCountStats.M(int64(len(nodes)-len(gsPerNodes)))) - recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")}, + RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")}, nodesCountStats.M(int64(len(gsPerNodes)))) for _, node := range nodes { diff --git a/pkg/metrics/kubernetes_client.go b/pkg/metrics/kubernetes_client.go index a003306b72..bc0f9be690 100644 --- a/pkg/metrics/kubernetes_client.go +++ b/pkg/metrics/kubernetes_client.go @@ -201,13 +201,13 @@ func (c *clientGoMetricAdapter) Register() { } func (clientGoMetricAdapter) Increment(ctx context.Context, code string, method string, host string) { - recordWithTags(ctx, []tag.Mutator{tag.Insert(keyStatusCode, code), + RecordWithTags(ctx, []tag.Mutator{tag.Insert(keyStatusCode, code), tag.Insert(keyVerb, method)}, httpRequestTotalStats.M(int64(1))) } func (clientGoMetricAdapter) Observe(ctx context.Context, verb string, u url.URL, latency time.Duration) { // url is without {namespace} and {name}, so cardinality of resulting metrics is low. - recordWithTags(ctx, []tag.Mutator{tag.Insert(keyVerb, verb), + RecordWithTags(ctx, []tag.Mutator{tag.Insert(keyVerb, verb), tag.Insert(keyEndpoint, u.Path)}, httpRequestLatencyStats.M(latency.Seconds())) } diff --git a/pkg/metrics/util.go b/pkg/metrics/util.go index 43fa524c24..bc637e384d 100644 --- a/pkg/metrics/util.go +++ b/pkg/metrics/util.go @@ -42,7 +42,8 @@ var ( keyList = MustTagKey("list") ) -func recordWithTags(ctx context.Context, mutators []tag.Mutator, ms ...stats.Measurement) { +// RecordWithTags records a metric value and tags +func RecordWithTags(ctx context.Context, mutators []tag.Mutator, ms ...stats.Measurement) { if err := stats.RecordWithTags(ctx, mutators, ms...); err != nil { logger.WithError(err).Warn("error while recoding stats") } diff --git a/site/content/en/docs/Guides/metrics.md b/site/content/en/docs/Guides/metrics.md index 00ceebede5..e472c168c0 100644 --- a/site/content/en/docs/Guides/metrics.md +++ b/site/content/en/docs/Guides/metrics.md @@ -43,6 +43,7 @@ Follow the [Google Cloud Monitoring installation steps](#google-cloud-monitoring ## Metrics available +{{% feature expiryVersion="1.43.0" %}} | Name | Description | Type | |-------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------| | agones_gameservers_count | The number of gameservers per fleet and status | gauge | @@ -77,6 +78,49 @@ Follow the [Google Cloud Monitoring installation steps](#google-cloud-monitoring | agones_k8s_client_workqueue_longest_running_processor | How long the longest running workqueue processor has been running in microseconds | gauge | | agones_k8s_client_workqueue_unfinished_work_seconds | How long unfinished work has been sitting in the workqueue in seconds | gauge | +{{% /feature %}} + +{{% feature publishVersion="1.43.0" %}} + +| Name | Description | Type | 
+|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|-----------|
+| agones_gameservers_count | The number of gameservers per fleet and status | gauge |
+| agones_gameserver_allocations_duration_seconds | The distribution of gameserver allocation requests latencies | histogram |
+| agones_gameserver_allocations_retry_total | The number of retries before a successful gameserver allocation | histogram |
+| agones_gameserver_creation_duration | The time a gameserver takes to be created, in seconds | histogram |
+| agones_gameservers_total | The total of gameservers per fleet and status | counter |
+| agones_gameserver_player_connected_total | The total number of players connected to gameservers (Only available when [player tracking]({{< relref "player-tracking.md" >}}) is enabled) | gauge |
+| agones_gameserver_player_capacity_total | The available capacity for players on gameservers (Only available when [player tracking]({{< relref "player-tracking.md" >}}) is enabled) | gauge |
+| agones_fleets_replicas_count | The number of replicas per fleet (total, desired, ready, reserved, allocated) | gauge |
+| agones_fleet_counters | Aggregate Metrics for Counters within a Fleet, including total capacity and count values (Only available when [Counters and Lists]({{< relref "counters-and-lists.md" >}}) are enabled) | gauge |
+| agones_fleet_lists | Aggregate Metrics for Lists within a Fleet, including total capacity and List lengths (Only available when [Counters and Lists]({{< relref "counters-and-lists.md" >}}) are enabled) | gauge |
+| agones_fleet_autoscalers_able_to_scale | The fleet autoscaler can access the fleet to scale | gauge |
+| agones_fleet_autoscalers_buffer_limits | The limits of buffer based fleet autoscalers (min, max) | gauge |
+| agones_fleet_autoscalers_buffer_size | The buffer size of fleet autoscalers (count or percentage) | gauge |
+| agones_fleet_autoscalers_current_replicas_count | The current replicas count as seen by autoscalers | gauge |
+| agones_fleet_autoscalers_desired_replicas_count | The desired replicas count as seen by autoscalers | gauge |
+| agones_fleet_autoscalers_limited | The fleet autoscaler is outside the limits set by MinReplicas and MaxReplicas. | gauge |
+| agones_gameservers_node_count | The distribution of gameservers per node | histogram |
+| agones_nodes_count | The count of nodes empty and with gameservers | gauge |
+| agones_gameservers_state_duration | The distribution of gameserver state duration in seconds. Note: this metric could have some missing samples by design. Do not use the `_total` counter as the real value for state changes. | histogram |
+| agones_k8s_client_http_request_total | The total of HTTP requests to the Kubernetes API by status code | counter |
+| agones_k8s_client_http_request_duration_seconds | The distribution of HTTP requests latencies to the Kubernetes API by status code | histogram |
+| agones_k8s_client_cache_list_total | The total number of list operations for client-go caches | counter |
+| agones_k8s_client_cache_list_duration_seconds | Duration of a Kubernetes list API call in seconds | histogram |
+| agones_k8s_client_cache_list_items | Count of items in a list from the Kubernetes API | histogram |
+| agones_k8s_client_cache_watches_total | The total number of watch operations for client-go caches | counter |
+| agones_k8s_client_cache_last_resource_version | Last resource version from the Kubernetes API | gauge |
+| agones_k8s_client_workqueue_depth | Current depth of the work queue | gauge |
+| agones_k8s_client_workqueue_latency_seconds | How long an item stays in the work queue | histogram |
+| agones_k8s_client_workqueue_items_total | Total number of items added to the work queue | counter |
+| agones_k8s_client_workqueue_work_duration_seconds | How long processing an item from the work queue takes | histogram |
+| agones_k8s_client_workqueue_retries_total | Total number of items retried to the work queue | counter |
+| agones_k8s_client_workqueue_longest_running_processor | How long the longest running workqueue processor has been running in microseconds | gauge |
+| agones_k8s_client_workqueue_unfinished_work_seconds | How long unfinished work has been sitting in the workqueue in seconds | gauge |
+
+
+{{% /feature %}}
+
 ### Dropping Metric Labels
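To show how samples end up in the new `agones_gameserver_creation_duration` histogram documented above, here is a minimal, self-contained OpenCensus sketch of the defer-based recording pattern the patch adds to `addMoreGameServers`. The function and view names, tag values, and bucket boundaries are simplified for illustration and are not the exact Agones implementation:

```go
package main

import (
	"context"
	"errors"
	"log"
	"time"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

var (
	keyType          = tag.MustNewKey("type")
	creationDuration = stats.Float64("example/gameserver_creation/duration",
		"The duration of gameserver creation", "s")
)

// createGameServers mirrors the defer-based pattern added to addMoreGameServers:
// start a timer, switch the "type" tag to an error value if the call fails, and
// always record the elapsed time when the function returns.
func createGameServers(ctx context.Context, count int) (err error) {
	ctx, _ = tag.New(ctx, tag.Upsert(keyType, "none"))
	start := time.Now()
	defer func() {
		if err != nil {
			ctx, _ = tag.New(ctx, tag.Upsert(keyType, "error"))
		}
		stats.Record(ctx, creationDuration.M(time.Since(start).Seconds()))
	}()

	// Stand-in for the real parallelized creation loop.
	if count <= 0 {
		return errors.New("nothing to create")
	}
	time.Sleep(10 * time.Millisecond)
	return nil
}

func main() {
	if err := view.Register(&view.View{
		Name:        "gameserver_creation_duration",
		Measure:     creationDuration,
		Description: "The time a gameserver takes to be created, in seconds",
		Aggregation: view.Distribution(0, 0.01, 0.1, 1, 3),
		TagKeys:     []tag.Key{keyType},
	}); err != nil {
		log.Fatal(err)
	}
	if err := createGameServers(context.Background(), 3); err != nil {
		log.Println(err)
	}
}
```

The design point mirrored here is that the duration is recorded in a `defer`, so both successful and failed creation attempts contribute a sample, with failures distinguished by the `type` tag.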