roachtest: report timeout failures accordingly

Timeout failures are recorded at actual timeout, with subsequent failures secondary. `addFailure` accepts a depth parameter and no longer includes context cancellation, which is done separately. Epic: none Fixes: cockroachdb#91237 Release note: None
smg260 · Mar 7, 2023 · 76c356a · 76c356a
1 parent 006f8a9
commit 76c356a
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 29 deletions.
diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go
@@ -272,52 +272,47 @@ func collectErrors(args []interface{}) []error {
 // ATTENTION: Since this calls panic(errTestFatal), it should only be called
 // from a test's closure. The test runner itself should never call this.
 func (t *testImpl) Fatal(args ...interface{}) {
-	t.addFailure("", args...)
+	t.addFailureAndCancel(1, "", args...)
 	panic(errTestFatal)
 }
 
 // Fatalf is like Fatal, but takes a format string.
 func (t *testImpl) Fatalf(format string, args ...interface{}) {
-	t.addFailure(format, args...)
+	t.addFailureAndCancel(1, format, args...)
 	panic(errTestFatal)
 }
 
 // FailNow implements the TestingT interface.
 func (t *testImpl) FailNow() {
-	t.addFailure("FailNow called")
+	t.addFailureAndCancel(1, "FailNow called")
 	panic(errTestFatal)
 }
 
 // Error implements the TestingT interface
 func (t *testImpl) Error(args ...interface{}) {
-	t.addFailure("", args...)
+	t.addFailureAndCancel(1, "", args...)
 }
 
 // Errorf implements the TestingT interface.
 func (t *testImpl) Errorf(format string, args ...interface{}) {
-	t.addFailure(format, args...)
+	t.addFailureAndCancel(1, format, args...)
 }
 
-// We take the first error from each failure which is the
-// "squashed" error that contains all information of a failure
-func formatFailure(b *strings.Builder, reportFailures ...failure) {
-	for i, failure := range reportFailures {
-		if i > 0 {
-			fmt.Fprintln(b)
-		}
-		file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr)
-		if !ok {
-			file, line, fn = "<unknown>", 0, "unknown"
-		}
-		fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr)
+func (t *testImpl) addFailureAndCancel(depth int, format string, args ...interface{}) {
+	t.addFailure(depth+1, format, args...)
+	if t.mu.cancel != nil {
+		t.mu.cancel()
 	}
 }
 
-func (t *testImpl) addFailure(format string, args ...interface{}) {
+// addFailure depth indicates how many stack frames to skip when reporting the
+// site of the failure in logs. `0` will report the caller of addFailure, `1` the
+// caller of the caller of addFailure, etc.
+func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	if format == "" {
 		format = strings.Repeat(" %v", len(args))[1:]
 	}
-	reportFailure := newFailure(errors.NewWithDepthf(2, format, args...), collectErrors(args))
+	reportFailure := newFailure(errors.NewWithDepthf(depth+1, format, args...), collectErrors(args))
 
 	t.mu.Lock()
 	defer t.mu.Unlock()
@@ -348,8 +343,20 @@ func (t *testImpl) addFailure(format string, args ...interface{}) {
 
 	t.mu.output = append(t.mu.output, msg...)
 	t.mu.output = append(t.mu.output, '\n')
-	if t.mu.cancel != nil {
-		t.mu.cancel()
+}
+
+// We take the first error from each failure which is the
+// "squashed" error that contains all information of a failure
+func formatFailure(b *strings.Builder, reportFailures ...failure) {
+	for i, failure := range reportFailures {
+		if i > 0 {
+			fmt.Fprintln(b)
+		}
+		file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr)
+		if !ok {
+			file, line, fn = "<unknown>", 0, "unknown"
+		}
+		fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr)
 	}
 }
 

diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
@@ -909,10 +909,9 @@ func (r *testRunner) runTest(
 		}
 		t.L().Printf("tearing down after %s; see teardown.log", s)
 	case <-time.After(timeout):
-		// NB: we're intentionally not failing the test if it hasn't
-		// already. This will be done at the very end of this method,
-		// after we've collected artifacts.
-		t.L().Printf("test timed out after %s; check __stacks.log and CRDB logs for goroutine dumps", timeout)
+		// NB: We're adding the timeout failure intentionally without cancelling the context
+		// to capture as much state as possible during artifact collection.
+		t.addFailure(0, "test timed out (%s)", timeout)
 		timedOut = true
 	}
 
@@ -1024,10 +1023,11 @@ func (r *testRunner) teardownTest(
 		// around so someone can poke at it.
 		_ = c.StopE(ctx, t.L(), option.DefaultStopOpts(), c.All())
 
-		// The hung test may, against all odds, still not have reported an error.
-		// We delayed it to improve artifacts collection, and now we ensure the test
-		// is marked as failing.
-		t.Errorf("test timed out (%s)", t.Spec().(*registry.TestSpec).Timeout)
+		// We previously added a timeout failure without cancellation, so we cancel here.
+		if t.mu.cancel != nil {
+			t.mu.cancel()
+		}
+		t.L().Printf("test timed out; check __stacks.log and CRDB logs for goroutine dumps")
 	}
 	return nil
 }

diff --git a/pkg/testutils/lint/passes/fmtsafe/functions.go b/pkg/testutils/lint/passes/fmtsafe/functions.go
@@ -94,6 +94,9 @@ var requireConstFmt = map[string]bool{
 	"(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).addFailure": true,
 	"(*main.testImpl).addFailure": true,
 
+	"(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).addFailureAndCancel": true,
+	"(*main.testImpl).addFailureAndCancel":                                               true,
+
 	"(*main.testImpl).Fatalf": true,
 	"(*github.com/cockroachdb/cockroach/pkg/cmd/roachtest.testImpl).Fatalf": true,