Skip to content

Commit

Permalink
Cleanup Patch Sidecar Logging
Browse files Browse the repository at this point in the history
Missed this little nuance in code review when moving to Patch with
the SDK Sidecar - in that a Patch operation doesn't return a Conflict
error type/status value when it fails, so every time it fails, the Agones
sidecar will log it as an error on production installs.

So this cleaned up a few things:

* renamed "debugError" type to "traceError" type, since we only log it
under Trace, not Debug.
* if `patchGameServer` fails under the error `IsInvalid`, wrap it in a
TraceError so it doesn't show up in prod.
* Wrapped the error with `errors` context, such that if the patch does
fail, it's far easier now to work out where it failed.

Closes googleforgames#3967
  • Loading branch information
markmandel committed Sep 12, 2024
1 parent 632a866 commit 12ec087
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 25 deletions.
12 changes: 6 additions & 6 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -878,19 +878,19 @@ func (c *Controller) syncGameServerStartingState(ctx context.Context, gs *agones
if err != nil {
// expected to happen, so don't log it.
if k8serrors.IsNotFound(err) {
return nil, workerqueue.NewDebugError(err)
return nil, workerqueue.NewTraceError(err)
}

// do log if it's something other than NotFound, since that's weird.
return nil, err
}
if pod.Spec.NodeName == "" {
return gs, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
return gs, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
}

// Ensure the pod IPs are populated
if pod.Status.PodIPs == nil || len(pod.Status.PodIPs) == 0 {
return gs, workerqueue.NewDebugError(errors.Errorf("pod IPs not yet populated for Pod %s", pod.ObjectMeta.Name))
return gs, workerqueue.NewTraceError(errors.Errorf("pod IPs not yet populated for Pod %s", pod.ObjectMeta.Name))
}

node, err := c.nodeLister.Get(pod.Spec.NodeName)
Expand Down Expand Up @@ -943,7 +943,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag
if gs.Status.NodeName == "" {
addressPopulated = true
if pod.Spec.NodeName == "" {
return gs, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
return gs, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
}
node, err := c.nodeLister.Get(pod.Spec.NodeName)
if err != nil {
Expand All @@ -963,7 +963,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag
// check to make sure this container is actually running. If there was a recent crash, the cache may
// not yet have the newer, running container.
if cs.State.Running == nil {
return nil, workerqueue.NewDebugError(fmt.Errorf("game server container for GameServer %s in namespace %s is not currently running, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
return nil, workerqueue.NewTraceError(fmt.Errorf("game server container for GameServer %s in namespace %s is not currently running, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
}
gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] = cs.ContainerID
}
Expand All @@ -972,7 +972,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag
}
// Verify that we found the game server container - we may have a stale cache where pod is missing ContainerStatuses.
if _, ok := gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]; !ok {
return nil, workerqueue.NewDebugError(fmt.Errorf("game server container for GameServer %s in namespace %s not present in pod status, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
return nil, workerqueue.NewTraceError(fmt.Errorf("game server container for GameServer %s in namespace %s not present in pod status, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
}

// Also update the pod with the same annotation, so we can check if the Pod data is up-to-date, now and also in the HealthController.
Expand Down
2 changes: 1 addition & 1 deletion pkg/gameservers/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ func (hc *HealthController) skipUnhealthyGameContainer(gs *agonesv1.GameServer,
// in which case, send it back to the queue to try again.
gsReadyContainerID := gs.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]
if pod.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] != gsReadyContainerID {
return false, workerqueue.NewDebugError(errors.Errorf("pod and gameserver %s data are out of sync, retrying", gs.ObjectMeta.Name))
return false, workerqueue.NewTraceError(errors.Errorf("pod and gameserver %s data are out of sync, retrying", gs.ObjectMeta.Name))
}

if gs.IsBeforeReady() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/gameservers/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func (mc *MigrationController) isMigratingGameServerPod(pod *k8sv1.Pod) (*agones
}

if pod.Spec.NodeName == "" {
return nil, nil, false, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
return nil, nil, false, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
}

node, err := mc.nodeLister.Get(pod.Spec.NodeName)
Expand Down
8 changes: 7 additions & 1 deletion pkg/sdkserver/sdkserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -459,7 +460,12 @@ func (s *SDKServer) patchGameServer(ctx context.Context, gs, gsCopy *agonesv1.Ga
return nil, err
}

return s.gameServerGetter.GameServers(s.namespace).Patch(ctx, gs.GetObjectMeta().GetName(), types.JSONPatchType, patch, metav1.PatchOptions{})
gs, err = s.gameServerGetter.GameServers(s.namespace).Patch(ctx, gs.GetObjectMeta().GetName(), types.JSONPatchType, patch, metav1.PatchOptions{})
// if the test operation fails, no reason to error log
if err != nil && k8serrors.IsInvalid(err) {
err = workerqueue.NewTraceError(err)
}
return gs, errors.Wrapf(err, "error attempting to patch gameserver: %s/%s", gsCopy.ObjectMeta.Namespace, gsCopy.ObjectMeta.Name)
}

// updateLabels updates the labels on this GameServer to the ones persisted in SDKServer,
Expand Down
22 changes: 11 additions & 11 deletions pkg/util/workerqueue/workerqueue.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,29 +37,29 @@ const (
workFx = time.Second
)

// traceError is a marker type for errors that should only be logged at a
// Trace level. Useful if you want a Handler to be retried, but not logged
// at an Error level.
type traceError struct {
	err error
}

// NewDebugError returns a debugError wrapper around an error.
func NewDebugError(err error) error {
return &debugError{err: err}
// NewTraceError returns a traceError wrapper around an error.
func NewTraceError(err error) error {
return &traceError{err: err}
}

// Error returns the error string
func (l *debugError) Error() string {
func (l *traceError) Error() string {
if l.err == nil {
return "<nil>"
}
return l.err.Error()
}

// isDebugError returns if the error is a debug error or not
func isDebugError(err error) bool {
// isTraceError returns if the error is a trace error or not
func isTraceError(err error) bool {
cause := errors.Cause(err)
_, ok := cause.(*debugError)
_, ok := cause.(*traceError)
return ok
}

Expand Down Expand Up @@ -181,8 +181,8 @@ func (wq *WorkerQueue) processNextWorkItem(ctx context.Context) bool {

if err := wq.SyncHandler(ctx, key); err != nil {
// Conflicts are expected, so only show them in debug operations.
// Also check is debugError for other expected errors.
if k8serror.IsConflict(errors.Cause(err)) || isDebugError(err) {
// Also check is traceError for other expected errors.
if k8serror.IsConflict(errors.Cause(err)) || isTraceError(err) {
wq.logger.WithField(wq.keyName, obj).Trace(err)
} else {
runtime.HandleError(wq.logger.WithField(wq.keyName, obj), err)
Expand Down
10 changes: 5 additions & 5 deletions pkg/util/workerqueue/workerqueue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,13 @@ func TestWorkerQueueEnqueueAfter(t *testing.T) {

// TestDebugError covers the traceError marker wrapper: plain errors are not
// trace errors, wrapped errors are (preserving their message), and wrapping
// nil yields a trace error that reports "<nil>".
func TestDebugError(t *testing.T) {
	// A plain error is not a trace error.
	err := errors.New("not a debug error")
	assert.False(t, isTraceError(err))

	// Wrapping marks it as a trace error without changing its message.
	err = NewTraceError(err)
	assert.True(t, isTraceError(err))
	assert.EqualError(t, err, "not a debug error")

	// A nil wrapped error is still a trace error, reporting "<nil>".
	err = NewTraceError(nil)
	assert.True(t, isTraceError(err))
	assert.EqualError(t, err, "<nil>")
}

0 comments on commit 12ec087

Please sign in to comment.