Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RSDK-3329, RSDK-4070 - Do not restart modules that crash before ready and respect resource configuration timeout #2645

Merged
merged 7 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions module/modmanager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (

var (
validateConfigTimeout = 5 * time.Second
readyTimeout = 30 * time.Second
errMessageExitStatus143 = "exit status 143"
logLevelArgumentTemplate = "--log-level=%s"
errModularResourcesDisabled = errors.New("modular resources disabled in untrusted environment")
Expand Down Expand Up @@ -444,6 +445,36 @@ func (mgr *Manager) newOnUnexpectedExitHandler(mod *module) func(exitCode int) b
if mod.inRecovery.Load() {
return false
}

// If mod.handles was never set, the module was never actually ready in the
// first place before crashing. Log an error and do not attempt a restart;
// something is likely wrong with the module implemenation.
mgr.mu.Lock()
if mod.handles == nil {
Otterverse marked this conversation as resolved.
Show resolved Hide resolved
mgr.logger.Errorw(
"module has unexpectedly exited without responding to a ready request ",
"module", mod.name,
"exit_code", exitCode,
)
// Remove module and close connection. Process will already be stopped.
benjirewis marked this conversation as resolved.
Show resolved Hide resolved
for r, m := range mgr.rMap {
if m == mod {
delete(mgr.rMap, r)
}
}
delete(mgr.modules, mod.name)
if mod.conn != nil {
if err := mod.conn.Close(); err != nil {
mgr.logger.Errorw("error while closing connection from crashed module",
"module", mod.name,
"error", err)
}
}
mgr.mu.Unlock()
return false
}
mgr.mu.Unlock()

mod.inRecovery.Store(true)
defer mod.inRecovery.Store(false)

Expand Down Expand Up @@ -604,7 +635,7 @@ func (m *module) dial(conn *grpc.ClientConn) error {
}

func (m *module) checkReady(ctx context.Context, parentAddr string) error {
ctxTimeout, cancelFunc := context.WithTimeout(ctx, time.Second*30)
ctxTimeout, cancelFunc := context.WithTimeout(ctx, readyTimeout)
defer cancelFunc()

for {
Expand Down Expand Up @@ -655,7 +686,7 @@ func (m *module) startProcess(
return errors.WithMessage(err, "module startup failed")
}

ctxTimeout, cancel := context.WithTimeout(ctx, time.Second*30)
ctxTimeout, cancel := context.WithTimeout(ctx, readyTimeout)
defer cancel()
for {
select {
Expand Down
49 changes: 49 additions & 0 deletions module/modmanager/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
modmanageroptions "go.viam.com/rdk/module/modmanager/options"
"go.viam.com/rdk/resource"
rtestutils "go.viam.com/rdk/testutils"
rutils "go.viam.com/rdk/utils"
)

func TestModManagerFunctions(t *testing.T) {
Expand Down Expand Up @@ -459,6 +460,54 @@ func TestModuleReloading(t *testing.T) {
// Assert that RemoveOrphanedResources was called once.
test.That(t, dummyRemoveOrphanedResourcesCallCount.Load(), test.ShouldEqual, 1)
})
t.Run("immediate crash is not restarted", func(t *testing.T) {
benjirewis marked this conversation as resolved.
Show resolved Hide resolved
logger, logs := golog.NewObservedTestLogger(t)

modCfg.ExePath = rutils.ResolveFile("module/testmodule/fakemodule.sh")

// Lower global timeout early to avoid race with actual restart code, and
// lower readyTimeout to avoid waiting for 30 seconds for manager.Add to
// time out waiting for module to start listening.
defer func(oriOrigVal, rtOrigVal time.Duration) {
oueRestartInterval = oriOrigVal
readyTimeout = rtOrigVal
}(oueRestartInterval, readyTimeout)
oueRestartInterval = 10 * time.Millisecond
readyTimeout = 1 * time.Second

// This test neither uses a resource manager nor asserts anything about
// the existence of resources in the graph. Use a dummy
// RemoveOrphanedResources function so orphaned resource logic does not
// panic.
dummyRemoveOrphanedResources := func(context.Context, []resource.Name) {}
mgr := NewManager(parentAddr, logger, modmanageroptions.Options{
UntrustedEnv: false,
RemoveOrphanedResources: dummyRemoveOrphanedResources,
})
err = mgr.Add(ctx, modCfg)
test.That(t, err, test.ShouldNotBeNil)
test.That(t, err.Error(), test.ShouldContainSubstring,
"timed out waiting for module test-module to start listening")

// Assert that manager removes module after immediate crash.
testutils.WaitForAssertion(t, func(tb testing.TB) {
test.That(tb, len(mgr.Configs()), test.ShouldEqual, 0)
})

err = mgr.Close(ctx)
test.That(t, err, test.ShouldBeNil)

// Assert that logs reflect that fakemodule exited without responding to a
// ready request, and the manager did not try to nor succeed in restarting
// it.
test.That(t, logs.FilterMessageSnippet(
"module has unexpectedly exited without responding to a ready request").Len(),
test.ShouldEqual, 1)
test.That(t, logs.FilterMessageSnippet("attempting to restart it").Len(),
test.ShouldEqual, 0)
test.That(t, logs.FilterMessageSnippet("module successfully restarted").Len(),
test.ShouldEqual, 0)
})
}

func TestDebugModule(t *testing.T) {
Expand Down
7 changes: 7 additions & 0 deletions module/testmodule/fakemodule.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash

# fakemodule is a completely fake module that echos a message and exits. Used
# to test that modules that never respond to ready requests will not be
# restarted.

echo "this is a fake module; exiting now"