Skip to content

Commit

Permalink
RSDK-8595 Unhealthy resource state (#4257)
Browse files Browse the repository at this point in the history
  • Loading branch information
maximpertsov authored Aug 20, 2024
1 parent dc25a20 commit 4c6b388
Show file tree
Hide file tree
Showing 12 changed files with 151 additions and 34 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ jobs:
GEN_DIFF=$(git status -s)
if [ -n "$GEN_DIFF" ]; then
echo '"make build lint" resulted in the following untracked changes:' 1>&2
echo '"make build-go lint-go generate-go" resulted in the following untracked changes:' 1>&2
git diff
echo '"make build lint" resulted in changes not in git' 1>&2
echo '"make build-go lint-go generate-go" resulted in changes not in git' 1>&2
git status
exit 1
fi
Expand Down Expand Up @@ -137,7 +137,7 @@ jobs:
GEN_DIFF=$(git status -s)
if [ -n "$GEN_DIFF" ]; then
echo '"make build lint" resulted in changes not in git' 1>&2
echo '"make build-go lint-go generate-go" resulted in changes not in git' 1>&2
git status
exit 1
fi
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ require (
go.uber.org/atomic v1.10.0
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.24.0
go.viam.com/api v0.1.330
go.viam.com/api v0.1.331
go.viam.com/test v1.1.1-0.20220913152726-5da9916c08a2
go.viam.com/utils v0.1.94
goji.io v2.0.2+incompatible
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1539,8 +1539,8 @@ go.uber.org/zap v1.18.1/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI=
go.uber.org/zap v1.23.0/go.mod h1:D+nX8jyLsMHMYrln8A0rJjFt/T/9/bGgIhAqxv5URuY=
go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60=
go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg=
go.viam.com/api v0.1.330 h1:U0GAFU6SNOtwEYnpONDAhSyeyrQ/4Z2LUeTc0+Kzjp0=
go.viam.com/api v0.1.330/go.mod h1:msa4TPrMVeRDcG4YzKA/S6wLEUC7GyHQE973JklrQ10=
go.viam.com/api v0.1.331 h1:79ggmTwRuKn92yi/fhTrw+ehHGb7yw5rrrZRtZ50Ih4=
go.viam.com/api v0.1.331/go.mod h1:msa4TPrMVeRDcG4YzKA/S6wLEUC7GyHQE973JklrQ10=
go.viam.com/test v1.1.1-0.20220913152726-5da9916c08a2 h1:oBiK580EnEIzgFLU4lHOXmGAE3MxnVbeR7s1wp/F3Ps=
go.viam.com/test v1.1.1-0.20220913152726-5da9916c08a2/go.mod h1:XM0tej6riszsiNLT16uoyq1YjuYPWlRBweTPRDanIts=
go.viam.com/utils v0.1.94 h1:fq/AosNoYd1MW4Avc7OS7gKJb03p5AGDHB+faiPswu4=
Expand Down
72 changes: 64 additions & 8 deletions resource/graph_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@ package resource

import (
"context"
"errors"
"sync"
"sync/atomic"
"time"

"github.com/pkg/errors"

"go.viam.com/rdk/logging"
)

//go:generate stringer -type NodeState -trimprefix NodeState
//
// The directive above configures `go generate ./...` to automatically generate a
// `String` method for the NodeState type.

// NodeState captures the configuration lifecycle state of a resource node.
type NodeState uint8
Expand All @@ -26,12 +28,14 @@ const (
// NodeStateConfiguring denotes a resource is being configured.
NodeStateConfiguring

// NodeStateReady denotes a resource that has been initialized and is not being
// configured or removed. A resource in this state may be unhealthy.
// NodeStateReady denotes a resource that has been configured and is healthy.
NodeStateReady

// NodeStateRemoving denotes a resource is being removed from the resource graph.
NodeStateRemoving

// NodeStateUnhealthy denotes a resource is unhealthy.
NodeStateUnhealthy
)

// A GraphNode contains the current state of a resource.
Expand Down Expand Up @@ -276,8 +280,8 @@ func (w *GraphNode) MarkedForRemoval() bool {
// The additional `args` should come in key/value pairs for structured logging.
func (w *GraphNode) LogAndSetLastError(err error, args ...any) {
w.mu.Lock()
w.lastErr = err
// TODO(RSDK-7903): transition to an "unhealthy" state.
w.lastErr = errors.Join(w.lastErr, err)
w.transitionTo(NodeStateUnhealthy)
w.mu.Unlock()

if w.logger != nil {
Expand All @@ -299,7 +303,10 @@ func (w *GraphNode) Config() Config {
func (w *GraphNode) NeedsReconfigure() bool {
w.mu.RLock()
defer w.mu.RUnlock()
return w.state == NodeStateConfiguring

// A resource can only become unhealthy during (re)configuration, so we can
// assume that an unhealthy node always need to be reconfigured.
return w.state == NodeStateConfiguring || w.state == NodeStateUnhealthy
}

// hasUnresolvedDependencies returns whether or not this node has any
Expand Down Expand Up @@ -471,7 +478,7 @@ func (w *GraphNode) canTransitionTo(state NodeState) bool {
case NodeStateConfiguring:
//nolint
switch state {
case NodeStateReady:
case NodeStateReady, NodeStateUnhealthy:
return true
}
case NodeStateReady:
Expand All @@ -481,6 +488,28 @@ func (w *GraphNode) canTransitionTo(state NodeState) bool {
return true
}
case NodeStateRemoving:
case NodeStateUnhealthy:
//nolint
switch state {
case NodeStateRemoving, NodeStateUnhealthy:
return true
case NodeStateReady:
// TODO(NEEDS TICKET): ideally, a node should "recover" by the following transition
// steps: "unhealthy" -> "configuring" -> "ready".
//
// However, once a node becomes "unhealthy" it is filtered out from most
// reconfiguration operations, and is not available to be transitioned to
// "configuring" when we want it to be.
//
// We eventually want to change the reconfiguration system to show unhealthy
// nodes instead of filtering them out at each step of the process.
//
// In the meantime, we allow nodes to "recover" by transitioning directly
// from "unhealthy" -> "ready".
//
// See this discussion for more details: https://github.com/viamrobotics/rdk/pull/4257#discussion_r1712173743
return true
}
}
return false
}
Expand Down Expand Up @@ -510,6 +539,15 @@ func (w *GraphNode) ResourceStatus() Status {
return w.resourceStatus()
}

func (w *GraphNode) getLoggerOrGlobal() logging.Logger {
if w.logger == nil {
// This node has not yet been configured with a logger - use the global logger as
// a fall-back.
return logging.Global()
}
return w.logger
}

func (w *GraphNode) resourceStatus() Status {
var resName Name
if w.current == nil {
Expand All @@ -518,11 +556,25 @@ func (w *GraphNode) resourceStatus() Status {
resName = w.current.Name()
}

err := w.lastErr
logger := w.getLoggerOrGlobal()

// check invariants between state and error
switch {
case w.state == NodeStateUnhealthy && w.lastErr == nil:
logger.Warnw("an unhealthy node doesn't have an error", "resource", resName)
case w.state == NodeStateReady && w.lastErr != nil:
logger.Warnw("a ready node still has an error", "resource", resName, "error", err)
// do not return leftover error in status if the node is ready
err = nil
}

return Status{
Name: resName,
State: w.state,
LastUpdated: w.transitionedAt,
Revision: w.revision,
Error: err,
}
}

Expand All @@ -532,4 +584,8 @@ type Status struct {
State NodeState
LastUpdated time.Time
Revision string

// Error contains any errors on the resource if it currently unhealthy.
// This field will be nil if the resource is not in the [NodeStateUnhealthy] state.
Error error
}
8 changes: 4 additions & 4 deletions resource/graph_node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,9 @@ func lifecycleTest(t *testing.T, node *resource.GraphNode, initialDeps []string)
node.LogAndSetLastError(ourErr)
_, err = node.Resource()
test.That(t, err, test.ShouldNotBeNil)
test.That(t, err.Error(), test.ShouldContainSubstring, "pending removal")
test.That(t, err.Error(), test.ShouldContainSubstring, "whoops")

verifySameState(t, node)
verifyStateTransition(t, node, resource.NodeStateUnhealthy)

test.That(t, node.UnresolvedDependencies(), test.ShouldResemble, initialDeps)

Expand Down Expand Up @@ -185,7 +185,7 @@ func lifecycleTest(t *testing.T, node *resource.GraphNode, initialDeps []string)
test.That(t, res, test.ShouldEqual, ourRes)
test.That(t, node.IsUninitialized(), test.ShouldBeFalse)

verifySameState(t, node)
verifyStateTransition(t, node, resource.NodeStateUnhealthy)

// it reconfigured
ourRes2 := &someResource{Resource: testutils.NewUnimplementedResource(generic.Named("foo"))}
Expand Down Expand Up @@ -226,7 +226,7 @@ func lifecycleTest(t *testing.T, node *resource.GraphNode, initialDeps []string)
res, err = node.UnsafeResource()
test.That(t, err, test.ShouldBeNil)
test.That(t, res, test.ShouldEqual, ourRes2)
verifySameState(t, node)
verifyStateTransition(t, node, resource.NodeStateUnhealthy)

// it reconfigured
ourRes3 := &someResource{Resource: testutils.NewUnimplementedResource(generic.Named("fooa"))}
Expand Down
5 changes: 3 additions & 2 deletions resource/nodestate_string.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 8 additions & 3 deletions robot/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package client

import (
"context"
"errors"
"flag"
"fmt"
"io"
Expand All @@ -15,7 +16,6 @@ import (
grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
"github.com/jhump/protoreflect/desc"
"github.com/jhump/protoreflect/grpcreflect"
"github.com/pkg/errors"
"go.uber.org/multierr"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
Expand Down Expand Up @@ -132,7 +132,7 @@ func isClosedPipeError(err error) bool {
}

func (rc *RobotClient) notConnectedToRemoteError() error {
return errors.Errorf("not connected to remote robot at %s", rc.address)
return fmt.Errorf("not connected to remote robot at %s", rc.address)
}

func (rc *RobotClient) handleUnaryDisconnect(
Expand Down Expand Up @@ -628,7 +628,7 @@ func (rc *RobotClient) resources(ctx context.Context) ([]resource.Name, []resour
}
svcDesc, ok := symDesc.(*desc.ServiceDescriptor)
if !ok {
return nil, nil, errors.Errorf("expected descriptor to be service descriptor but got %T", symDesc)
return nil, nil, fmt.Errorf("expected descriptor to be service descriptor but got %T", symDesc)
}
resTypes = append(resTypes, resource.RPCAPI{
API: rprotoutils.ResourceNameFromProto(resAPI.Subtype).API,
Expand Down Expand Up @@ -1067,6 +1067,11 @@ func (rc *RobotClient) MachineStatus(ctx context.Context) (robot.MachineStatus,
resStatus.State = resource.NodeStateReady
case pb.ResourceStatus_STATE_REMOVING:
resStatus.State = resource.NodeStateRemoving
case pb.ResourceStatus_STATE_UNHEALTHY:
resStatus.State = resource.NodeStateUnhealthy
if pbResStatus.Error != "" {
resStatus.Error = errors.New(pbResStatus.Error)
}
}

mStatus.Resources = append(mStatus.Resources, resStatus)
Expand Down
21 changes: 18 additions & 3 deletions robot/client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package client
import (
"bytes"
"context"
"errors"
"fmt"
"image"
"image/png"
Expand All @@ -19,7 +20,6 @@ import (
"github.com/golang/geo/r3"
"github.com/google/uuid"
"github.com/jhump/protoreflect/grpcreflect"
"github.com/pkg/errors"
"go.uber.org/zap/zapcore"
commonpb "go.viam.com/api/common/v1"
armpb "go.viam.com/api/component/arm/v1"
Expand Down Expand Up @@ -1325,12 +1325,12 @@ func TestClientDiscovery(t *testing.T) {

func ensurePartsAreEqual(part, otherPart *referenceframe.FrameSystemPart) error {
if part.FrameConfig.Name() != otherPart.FrameConfig.Name() {
return errors.Errorf("part had name %s while other part had name %s", part.FrameConfig.Name(), otherPart.FrameConfig.Name())
return fmt.Errorf("part had name %s while other part had name %s", part.FrameConfig.Name(), otherPart.FrameConfig.Name())
}
frameConfig := part.FrameConfig
otherFrameConfig := otherPart.FrameConfig
if frameConfig.Parent() != otherFrameConfig.Parent() {
return errors.Errorf("part had parent %s while other part had parent %s", frameConfig.Parent(), otherFrameConfig.Parent())
return fmt.Errorf("part had parent %s while other part had parent %s", frameConfig.Parent(), otherFrameConfig.Parent())
}
if !spatialmath.R3VectorAlmostEqual(frameConfig.Pose().Point(), otherFrameConfig.Pose().Point(), 1e-8) {
return errors.New("translations of parts not equal")
Expand Down Expand Up @@ -2158,6 +2158,21 @@ func TestMachineStatus(t *testing.T) {
},
2,
},
{
"unhealthy status",
robot.MachineStatus{
Config: config.Revision{Revision: "rev1"},
Resources: []resource.Status{
{
Name: arm.Named("brokenArm"),
State: resource.NodeStateUnhealthy,
Error: errors.New("bad configuration"),
Revision: "rev1",
},
},
},
0,
},
} {
t.Run(tc.name, func(t *testing.T) {
logger, logs := logging.NewObservedTestLogger(t)
Expand Down
Loading

0 comments on commit 4c6b388

Please sign in to comment.