diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f6087fec7d8..3802fe45bc5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -64,9 +64,9 @@ jobs: GEN_DIFF=$(git status -s) if [ -n "$GEN_DIFF" ]; then - echo '"make build lint" resulted in the following untracked changes:' 1>&2 + echo '"make build-go lint-go generate-go" resulted in the following untracked changes:' 1>&2 git diff - echo '"make build lint" resulted in changes not in git' 1>&2 + echo '"make build-go lint-go generate-go" resulted in changes not in git' 1>&2 git status exit 1 fi @@ -137,7 +137,7 @@ jobs: GEN_DIFF=$(git status -s) if [ -n "$GEN_DIFF" ]; then - echo '"make build lint" resulted in changes not in git' 1>&2 + echo '"make build-go lint-go generate-go" resulted in changes not in git' 1>&2 git status exit 1 fi diff --git a/go.mod b/go.mod index 7bc5164ef49..1cd3d6515df 100644 --- a/go.mod +++ b/go.mod @@ -85,7 +85,7 @@ require ( go.uber.org/atomic v1.10.0 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.24.0 - go.viam.com/api v0.1.330 + go.viam.com/api v0.1.331 go.viam.com/test v1.1.1-0.20220913152726-5da9916c08a2 go.viam.com/utils v0.1.90 goji.io v2.0.2+incompatible diff --git a/go.sum b/go.sum index eacb7e2c790..2c21b2f32e7 100644 --- a/go.sum +++ b/go.sum @@ -1539,8 +1539,8 @@ go.uber.org/zap v1.18.1/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= go.uber.org/zap v1.23.0/go.mod h1:D+nX8jyLsMHMYrln8A0rJjFt/T/9/bGgIhAqxv5URuY= go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= -go.viam.com/api v0.1.330 h1:U0GAFU6SNOtwEYnpONDAhSyeyrQ/4Z2LUeTc0+Kzjp0= -go.viam.com/api v0.1.330/go.mod h1:msa4TPrMVeRDcG4YzKA/S6wLEUC7GyHQE973JklrQ10= +go.viam.com/api v0.1.331 h1:79ggmTwRuKn92yi/fhTrw+ehHGb7yw5rrrZRtZ50Ih4= +go.viam.com/api v0.1.331/go.mod h1:msa4TPrMVeRDcG4YzKA/S6wLEUC7GyHQE973JklrQ10= go.viam.com/test v1.1.1-0.20220913152726-5da9916c08a2 h1:oBiK580EnEIzgFLU4lHOXmGAE3MxnVbeR7s1wp/F3Ps= go.viam.com/test v1.1.1-0.20220913152726-5da9916c08a2/go.mod h1:XM0tej6riszsiNLT16uoyq1YjuYPWlRBweTPRDanIts= go.viam.com/utils v0.1.90 h1:Bja+mtI3Cneu3lFb/7V98YBW828/5+d1oChulQC0FZY= diff --git a/resource/graph_node.go b/resource/graph_node.go index eb92593f4fa..501a1411767 100644 --- a/resource/graph_node.go +++ b/resource/graph_node.go @@ -2,16 +2,18 @@ package resource import ( "context" + "errors" "sync" "sync/atomic" "time" - "github.com/pkg/errors" - "go.viam.com/rdk/logging" ) //go:generate stringer -type NodeState -trimprefix NodeState +// +// The directive above configures `go generate ./...` to automatically generate a +// `String` method for the NodeState type. // NodeState captures the configuration lifecycle state of a resource node. type NodeState uint8 @@ -26,12 +28,14 @@ const ( // NodeStateConfiguring denotes a resource is being configured. NodeStateConfiguring - // NodeStateReady denotes a resource that has been initialized and is not being - // configured or removed. A resource in this state may be unhealthy. + // NodeStateReady denotes a resource that has been configured and is healthy. NodeStateReady // NodeStateRemoving denotes a resource is being removed from the resource graph. NodeStateRemoving + + // NodeStateUnhealthy denotes a resource is unhealthy. + NodeStateUnhealthy ) // A GraphNode contains the current state of a resource. @@ -276,8 +280,8 @@ func (w *GraphNode) MarkedForRemoval() bool { // The additional `args` should come in key/value pairs for structured logging. func (w *GraphNode) LogAndSetLastError(err error, args ...any) { w.mu.Lock() - w.lastErr = err - // TODO(RSDK-7903): transition to an "unhealthy" state. + w.lastErr = errors.Join(w.lastErr, err) + w.transitionTo(NodeStateUnhealthy) w.mu.Unlock() if w.logger != nil { @@ -299,7 +303,10 @@ func (w *GraphNode) Config() Config { func (w *GraphNode) NeedsReconfigure() bool { w.mu.RLock() defer w.mu.RUnlock() - return w.state == NodeStateConfiguring + + // A resource can only become unhealthy during (re)configuration, so we can + // assume that an unhealthy node always need to be reconfigured. + return w.state == NodeStateConfiguring || w.state == NodeStateUnhealthy } // hasUnresolvedDependencies returns whether or not this node has any @@ -471,7 +478,7 @@ func (w *GraphNode) canTransitionTo(state NodeState) bool { case NodeStateConfiguring: //nolint switch state { - case NodeStateReady: + case NodeStateReady, NodeStateUnhealthy: return true } case NodeStateReady: @@ -481,6 +488,28 @@ func (w *GraphNode) canTransitionTo(state NodeState) bool { return true } case NodeStateRemoving: + case NodeStateUnhealthy: + //nolint + switch state { + case NodeStateRemoving, NodeStateUnhealthy: + return true + case NodeStateReady: + // TODO(NEEDS TICKET): ideally, a node should "recover" by the following transition + // steps: "unhealthy" -> "configuring" -> "ready". + // + // However, once a node becomes "unhealthy" it is filtered out from most + // reconfiguration operations, and is not available to be transitioned to + // "configuring" when we want it to be. + // + // We eventually want to change the reconfiguration system to show unhealthy + // nodes instead of filtering them out at each step of the process. + // + // In the meantime, we allow nodes to "recover" by transitioning directly + // from "unhealthy" -> "ready". + // + // See this discussion for more details: https://github.com/viamrobotics/rdk/pull/4257#discussion_r1712173743 + return true + } } return false } @@ -510,6 +539,15 @@ func (w *GraphNode) ResourceStatus() Status { return w.resourceStatus() } +func (w *GraphNode) getLoggerOrGlobal() logging.Logger { + if w.logger == nil { + // This node has not yet been configured with a logger - use the global logger as + // a fall-back. + return logging.Global() + } + return w.logger +} + func (w *GraphNode) resourceStatus() Status { var resName Name if w.current == nil { @@ -518,11 +556,25 @@ func (w *GraphNode) resourceStatus() Status { resName = w.current.Name() } + err := w.lastErr + logger := w.getLoggerOrGlobal() + + // check invariants between state and error + switch { + case w.state == NodeStateUnhealthy && w.lastErr == nil: + logger.Warnw("an unhealthy node doesn't have an error", "resource", resName) + case w.state == NodeStateReady && w.lastErr != nil: + logger.Warnw("a ready node still has an error", "resource", resName, "error", err) + // do not return leftover error in status if the node is ready + err = nil + } + return Status{ Name: resName, State: w.state, LastUpdated: w.transitionedAt, Revision: w.revision, + Error: err, } } @@ -532,4 +584,8 @@ type Status struct { State NodeState LastUpdated time.Time Revision string + + // Error contains any errors on the resource if it currently unhealthy. + // This field will be nil if the resource is not in the [NodeStateUnhealthy] state. + Error error } diff --git a/resource/graph_node_test.go b/resource/graph_node_test.go index 6d16632b109..af6b150f4e6 100644 --- a/resource/graph_node_test.go +++ b/resource/graph_node_test.go @@ -147,9 +147,9 @@ func lifecycleTest(t *testing.T, node *resource.GraphNode, initialDeps []string) node.LogAndSetLastError(ourErr) _, err = node.Resource() test.That(t, err, test.ShouldNotBeNil) - test.That(t, err.Error(), test.ShouldContainSubstring, "pending removal") + test.That(t, err.Error(), test.ShouldContainSubstring, "whoops") - verifySameState(t, node) + verifyStateTransition(t, node, resource.NodeStateUnhealthy) test.That(t, node.UnresolvedDependencies(), test.ShouldResemble, initialDeps) @@ -185,7 +185,7 @@ func lifecycleTest(t *testing.T, node *resource.GraphNode, initialDeps []string) test.That(t, res, test.ShouldEqual, ourRes) test.That(t, node.IsUninitialized(), test.ShouldBeFalse) - verifySameState(t, node) + verifyStateTransition(t, node, resource.NodeStateUnhealthy) // it reconfigured ourRes2 := &someResource{Resource: testutils.NewUnimplementedResource(generic.Named("foo"))} @@ -226,7 +226,7 @@ func lifecycleTest(t *testing.T, node *resource.GraphNode, initialDeps []string) res, err = node.UnsafeResource() test.That(t, err, test.ShouldBeNil) test.That(t, res, test.ShouldEqual, ourRes2) - verifySameState(t, node) + verifyStateTransition(t, node, resource.NodeStateUnhealthy) // it reconfigured ourRes3 := &someResource{Resource: testutils.NewUnimplementedResource(generic.Named("fooa"))} diff --git a/resource/nodestate_string.go b/resource/nodestate_string.go index 4955d45e628..24502c6aa9c 100644 --- a/resource/nodestate_string.go +++ b/resource/nodestate_string.go @@ -13,11 +13,12 @@ func _() { _ = x[NodeStateConfiguring-2] _ = x[NodeStateReady-3] _ = x[NodeStateRemoving-4] + _ = x[NodeStateUnhealthy-5] } -const _NodeState_name = "UnknownUnconfiguredConfiguringReadyRemoving" +const _NodeState_name = "UnknownUnconfiguredConfiguringReadyRemovingUnhealthy" -var _NodeState_index = [...]uint8{0, 7, 19, 30, 35, 43} +var _NodeState_index = [...]uint8{0, 7, 19, 30, 35, 43, 52} func (i NodeState) String() string { if i >= NodeState(len(_NodeState_index)-1) { diff --git a/robot/client/client.go b/robot/client/client.go index 29327028b5d..e9848af5e13 100644 --- a/robot/client/client.go +++ b/robot/client/client.go @@ -3,6 +3,7 @@ package client import ( "context" + "errors" "flag" "fmt" "io" @@ -15,7 +16,6 @@ import ( grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry" "github.com/jhump/protoreflect/desc" "github.com/jhump/protoreflect/grpcreflect" - "github.com/pkg/errors" "go.uber.org/multierr" "go.uber.org/zap" "go.uber.org/zap/zapcore" @@ -131,7 +131,7 @@ func isClosedPipeError(err error) bool { } func (rc *RobotClient) notConnectedToRemoteError() error { - return errors.Errorf("not connected to remote robot at %s", rc.address) + return fmt.Errorf("not connected to remote robot at %s", rc.address) } func (rc *RobotClient) handleUnaryDisconnect( @@ -624,7 +624,7 @@ func (rc *RobotClient) resources(ctx context.Context) ([]resource.Name, []resour } svcDesc, ok := symDesc.(*desc.ServiceDescriptor) if !ok { - return nil, nil, errors.Errorf("expected descriptor to be service descriptor but got %T", symDesc) + return nil, nil, fmt.Errorf("expected descriptor to be service descriptor but got %T", symDesc) } resTypes = append(resTypes, resource.RPCAPI{ API: rprotoutils.ResourceNameFromProto(resAPI.Subtype).API, @@ -1063,6 +1063,11 @@ func (rc *RobotClient) MachineStatus(ctx context.Context) (robot.MachineStatus, resStatus.State = resource.NodeStateReady case pb.ResourceStatus_STATE_REMOVING: resStatus.State = resource.NodeStateRemoving + case pb.ResourceStatus_STATE_UNHEALTHY: + resStatus.State = resource.NodeStateUnhealthy + if pbResStatus.Error != "" { + resStatus.Error = errors.New(pbResStatus.Error) + } } mStatus.Resources = append(mStatus.Resources, resStatus) diff --git a/robot/client/client_test.go b/robot/client/client_test.go index 8a18c898f81..99f025b2be4 100644 --- a/robot/client/client_test.go +++ b/robot/client/client_test.go @@ -3,6 +3,7 @@ package client import ( "bytes" "context" + "errors" "fmt" "image" "image/png" @@ -19,7 +20,6 @@ import ( "github.com/golang/geo/r3" "github.com/google/uuid" "github.com/jhump/protoreflect/grpcreflect" - "github.com/pkg/errors" "go.uber.org/zap/zapcore" commonpb "go.viam.com/api/common/v1" armpb "go.viam.com/api/component/arm/v1" @@ -1325,12 +1325,12 @@ func TestClientDiscovery(t *testing.T) { func ensurePartsAreEqual(part, otherPart *referenceframe.FrameSystemPart) error { if part.FrameConfig.Name() != otherPart.FrameConfig.Name() { - return errors.Errorf("part had name %s while other part had name %s", part.FrameConfig.Name(), otherPart.FrameConfig.Name()) + return fmt.Errorf("part had name %s while other part had name %s", part.FrameConfig.Name(), otherPart.FrameConfig.Name()) } frameConfig := part.FrameConfig otherFrameConfig := otherPart.FrameConfig if frameConfig.Parent() != otherFrameConfig.Parent() { - return errors.Errorf("part had parent %s while other part had parent %s", frameConfig.Parent(), otherFrameConfig.Parent()) + return fmt.Errorf("part had parent %s while other part had parent %s", frameConfig.Parent(), otherFrameConfig.Parent()) } if !spatialmath.R3VectorAlmostEqual(frameConfig.Pose().Point(), otherFrameConfig.Pose().Point(), 1e-8) { return errors.New("translations of parts not equal") @@ -2158,6 +2158,21 @@ func TestMachineStatus(t *testing.T) { }, 2, }, + { + "unhealthy status", + robot.MachineStatus{ + Config: config.Revision{Revision: "rev1"}, + Resources: []resource.Status{ + { + Name: arm.Named("brokenArm"), + State: resource.NodeStateUnhealthy, + Error: errors.New("bad configuration"), + Revision: "rev1", + }, + }, + }, + 0, + }, } { t.Run(tc.name, func(t *testing.T) { logger, logs := logging.NewObservedTestLogger(t) diff --git a/robot/impl/local_robot_test.go b/robot/impl/local_robot_test.go index 42ea460fe23..29ab1c10a69 100644 --- a/robot/impl/local_robot_test.go +++ b/robot/impl/local_robot_test.go @@ -4,6 +4,7 @@ import ( "context" "crypto/tls" "crypto/x509" + "errors" "fmt" "math" "net" @@ -16,7 +17,6 @@ import ( "github.com/go-viper/mapstructure/v2" "github.com/golang/geo/r3" - "github.com/pkg/errors" "go.mongodb.org/mongo-driver/bson/primitive" "go.uber.org/zap" // registers all components. @@ -986,7 +986,10 @@ func TestStatus(t *testing.T) { test.That(t, err, test.ShouldBeNil) _, err = r.Status(context.Background(), []resource.Name{fail1}) - test.That(t, err, test.ShouldBeError, errors.Wrapf(errFailed, "failed to get status from %q", fail1)) + // TODO(RSDK-6875): compare errors directly instead by string after + // `github.com/pkg/errors` is removed entirely. + expectedErr := fmt.Errorf("failed to get status from %q: %w", fail1, errFailed) + test.That(t, err.Error(), test.ShouldEqual, expectedErr.Error()) }) t.Run("many status", func(t *testing.T) { @@ -1024,7 +1027,10 @@ func TestStatus(t *testing.T) { test.That(t, resp[1].Status, test.ShouldResemble, expected[resp[1].Name]) _, err = r.Status(context.Background(), resourceNames) - test.That(t, err, test.ShouldBeError, errors.Wrapf(errFailed, "failed to get status from %q", fail1)) + // TODO(RSDK-6875): compare errors directly instead by string after + // `github.com/pkg/errors` is removed entirely. + expectedErr := fmt.Errorf("failed to get status from %q: %w", fail1, errFailed) + test.That(t, err.Error(), test.ShouldEqual, expectedErr.Error()) }) t.Run("get all status", func(t *testing.T) { @@ -2639,13 +2645,13 @@ func TestDependentAndOrphanedResources(t *testing.T) { if rName.API == gizmoapi.API { gizmo, ok := res.(gizmoapi.Gizmo) if !ok { - return nil, errors.Errorf("resource %s is not a gizmo", rName.Name) + return nil, fmt.Errorf("resource %s is not a gizmo", rName.Name) } newDoodad.gizmo = gizmo } } if newDoodad.gizmo == nil { - return nil, errors.Errorf("doodad %s must depend on a gizmo", conf.Name) + return nil, fmt.Errorf("doodad %s must depend on a gizmo", conf.Name) } return newDoodad, nil }, @@ -3555,9 +3561,11 @@ type mockConfig struct { Fail bool `json:"fail"` } +var errMockValidation = errors.New("whoops") + func (cfg *mockConfig) Validate(path string) ([]string, error) { if cfg.Fail { - return nil, errors.New("whoops") + return nil, errMockValidation } return []string{}, nil } @@ -3669,6 +3677,8 @@ func TestMachineStatus(t *testing.T) { t.Run("reconfigure", func(t *testing.T) { lr := setupLocalRobot(t, ctx, &config.Config{Revision: rev1}, logger) + expectedConfigError := fmt.Errorf("resource config validation error: %w", errMockValidation) + // Add a fake resource to the robot. rev2 := "rev2" builtinRev = rev2 @@ -3724,8 +3734,9 @@ func TestMachineStatus(t *testing.T) { []resource.Status{ { Name: mockNamed("m"), - State: resource.NodeStateConfiguring, + State: resource.NodeStateUnhealthy, Revision: rev2, + Error: errors.Join(expectedConfigError), }, }, ) diff --git a/robot/server/server.go b/robot/server/server.go index ae86ce6ca6d..411c2da6d9b 100644 --- a/robot/server/server.go +++ b/robot/server/server.go @@ -523,6 +523,11 @@ func (s *Server) GetMachineStatus(ctx context.Context, _ *pb.GetMachineStatusReq pbResStatus.State = pb.ResourceStatus_STATE_READY case resource.NodeStateRemoving: pbResStatus.State = pb.ResourceStatus_STATE_REMOVING + case resource.NodeStateUnhealthy: + pbResStatus.State = pb.ResourceStatus_STATE_UNHEALTHY + if resStatus.Error != nil { + pbResStatus.Error = resStatus.Error.Error() + } } result.Resources = append(result.Resources, pbResStatus) diff --git a/robot/server/server_test.go b/robot/server/server_test.go index 565778fe4fd..6e464a2f97d 100644 --- a/robot/server/server_test.go +++ b/robot/server/server_test.go @@ -177,6 +177,30 @@ func TestServer(t *testing.T) { }, 2, }, + { + "unhealthy status", + robot.MachineStatus{ + Config: config.Revision{Revision: "rev1"}, + Resources: []resource.Status{ + { + Name: arm.Named("brokenArm"), + Revision: "rev1", + State: resource.NodeStateUnhealthy, + Error: errors.New("bad configuration"), + }, + }, + }, + &pb.ConfigStatus{Revision: "rev1"}, + []*pb.ResourceStatus{ + { + Name: protoutils.ResourceNameToProto(arm.Named("brokenArm")), + State: pb.ResourceStatus_STATE_UNHEALTHY, + Revision: "rev1", + Error: "bad configuration", + }, + }, + 0, + }, } { logger, logs := logging.NewObservedTestLogger(t) injectRobot := &inject.Robot{} diff --git a/tools/tools.go b/tools/tools.go index bdfea358e73..a2a40d1c478 100644 --- a/tools/tools.go +++ b/tools/tools.go @@ -13,7 +13,7 @@ import ( _ "github.com/golangci/golangci-lint/cmd/golangci-lint" _ "github.com/rhysd/actionlint" _ "golang.org/x/mobile/cmd/gomobile" - _ "golang.org/x/tools/cmd/stringer" + _ "golang.org/x/tools/cmd/stringer" // generates `String` methods for enums _ "gotest.tools/gotestsum" // only needed for proto building in examples/customresources/apis/proto