Merge pull request #96903 from smg260/backport22.2-88514-88716-92667-…

…95762-96743 release-22.2: roachtest: set default cluster settings, report timeout failures correctly, roachprod: capture ssh logs
cockroachdb · Mar 8, 2023 · c71d4c2 · c71d4c2
2 parents febfc5e + 54871f4
commit c71d4c2
Show file tree

Hide file tree

Showing 10 changed files with 436 additions and 397 deletions.
diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go
@@ -2112,29 +2112,13 @@ func (c *clusterImpl) Install(
 	return errors.Wrap(roachprod.Install(ctx, l, c.MakeNodes(nodes), software), "cluster.Install")
 }
 
-var reOnlyAlphanumeric = regexp.MustCompile(`[^a-zA-Z0-9]+`)
-
 // cmdLogFileName comes up with a log file to use for the given argument string.
 func cmdLogFileName(t time.Time, nodes option.NodeListOption, args ...string) string {
-	// Make sure we treat {"./cockroach start"} like {"./cockroach", "start"}.
-	args = strings.Split(strings.Join(args, " "), " ")
-	prefix := []string{reOnlyAlphanumeric.ReplaceAllString(args[0], "")}
-	for _, arg := range args[1:] {
-		if s := reOnlyAlphanumeric.ReplaceAllString(arg, ""); s != arg {
-			break
-		}
-		prefix = append(prefix, arg)
-	}
-	s := strings.Join(prefix, "_")
-	const maxLen = 70
-	if len(s) > maxLen {
-		s = s[:maxLen]
-	}
 	logFile := fmt.Sprintf(
 		"run_%s_n%s_%s",
 		t.Format(`150405.000000000`),
 		nodes.String()[1:],
-		s,
+		install.GenFilenameFromArgs(20, args...),
 	)
 	return logFile
 }

diff --git a/pkg/cmd/roachtest/cluster_test.go b/pkg/cmd/roachtest/cluster_test.go
@@ -168,7 +168,7 @@ func TestClusterMachineType(t *testing.T) {
 func TestCmdLogFileName(t *testing.T) {
 	ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local)
 
-	const exp = `run_150412.000000000_n1,3-4,9_cockroach_bla`
+	const exp = `run_150412.000000000_n1,3-4,9_cockroach-bla-foo-ba`
 	nodes := option.NodeListOption{1, 3, 4, 9}
 	assert.Equal(t,
 		exp,

diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go
@@ -283,52 +283,47 @@ func collectErrors(args []interface{}) []error {
 // ATTENTION: Since this calls panic(errTestFatal), it should only be called
 // from a test's closure. The test runner itself should never call this.
 func (t *testImpl) Fatal(args ...interface{}) {
-	t.addFailure("", args...)
+	t.addFailureAndCancel(1, "", args...)
 	panic(errTestFatal)
 }
 
 // Fatalf is like Fatal, but takes a format string.
 func (t *testImpl) Fatalf(format string, args ...interface{}) {
-	t.addFailure(format, args...)
+	t.addFailureAndCancel(1, format, args...)
 	panic(errTestFatal)
 }
 
 // FailNow implements the TestingT interface.
 func (t *testImpl) FailNow() {
-	t.addFailure("FailNow called")
+	t.addFailureAndCancel(1, "FailNow called")
 	panic(errTestFatal)
 }
 
 // Error implements the TestingT interface
 func (t *testImpl) Error(args ...interface{}) {
-	t.addFailure("", args...)
+	t.addFailureAndCancel(1, "", args...)
 }
 
 // Errorf implements the TestingT interface.
 func (t *testImpl) Errorf(format string, args ...interface{}) {
-	t.addFailure(format, args...)
+	t.addFailureAndCancel(1, format, args...)
 }
 
-// We take the first error from each failure which is the
-// "squashed" error that contains all information of a failure
-func formatFailure(b *strings.Builder, reportFailures ...failure) {
-	for i, failure := range reportFailures {
-		if i > 0 {
-			fmt.Fprintln(b)
-		}
-		file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr)
-		if !ok {
-			file, line, fn = "<unknown>", 0, "unknown"
-		}
-		fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr)
+func (t *testImpl) addFailureAndCancel(depth int, format string, args ...interface{}) {
+	t.addFailure(depth+1, format, args...)
+	if t.mu.cancel != nil {
+		t.mu.cancel()
 	}
 }
 
-func (t *testImpl) addFailure(format string, args ...interface{}) {
+// addFailure depth indicates how many stack frames to skip when reporting the
+// site of the failure in logs. `0` will report the caller of addFailure, `1` the
+// caller of the caller of addFailure, etc.
+func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
 	if format == "" {
 		format = strings.Repeat(" %v", len(args))[1:]
 	}
-	reportFailure := newFailure(errors.NewWithDepthf(1, format, args...), collectErrors(args))
+	reportFailure := newFailure(errors.NewWithDepthf(depth+1, format, args...), collectErrors(args))
 
 	t.mu.Lock()
 	defer t.mu.Unlock()
@@ -339,29 +334,44 @@ func (t *testImpl) addFailure(format string, args ...interface{}) {
 	formatFailure(&b, reportFailure)
 	msg := b.String()
 
-	t.L().Printf("test failure #%d: %s", len(t.mu.failures), msg)
+	failureNum := len(t.mu.failures)
+	failureLog := fmt.Sprintf("failure_%d", failureNum)
+	t.L().Printf("test failure #%d: full stack retained in %s.log: %s", failureNum, failureLog, msg)
 	// Also dump the verbose error (incl. all stack traces) to a log file, in case
 	// we need it. The stacks are sometimes helpful, but we don't want them in the
 	// main log as they are highly verbose.
 	{
-		cl, err := t.L().ChildLogger(
-			fmt.Sprintf("failure_%d", len(t.mu.failures)),
-			logger.QuietStderr, logger.QuietStdout,
-		)
+		cl, err := t.L().ChildLogger(failureLog, logger.QuietStderr, logger.QuietStdout)
 		if err == nil {
 			// We don't actually log through this logger since it adds an unrelated
 			// file:line caller (namely ours). The error already has stack traces
 			// so it's better to write only it to the file to avoid confusion.
-			path := cl.File.Name()
+			if cl.File != nil {
+				path := cl.File.Name()
+				if len(path) > 0 {
+					_ = os.WriteFile(path, []byte(fmt.Sprintf("%+v", reportFailure.squashedErr)), 0644)
+				}
+			}
 			cl.Close() // we just wanted the filename
-			_ = os.WriteFile(path, []byte(fmt.Sprintf("%+v", reportFailure.squashedErr)), 0644)
 		}
 	}
 
 	t.mu.output = append(t.mu.output, msg...)
 	t.mu.output = append(t.mu.output, '\n')
-	if t.mu.cancel != nil {
-		t.mu.cancel()
+}
+
+// We take the first error from each failure which is the
+// "squashed" error that contains all information of a failure
+func formatFailure(b *strings.Builder, reportFailures ...failure) {
+	for i, failure := range reportFailures {
+		if i > 0 {
+			fmt.Fprintln(b)
+		}
+		file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr)
+		if !ok {
+			file, line, fn = "<unknown>", 0, "unknown"
+		}
+		fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr)
 	}
 }
 

diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go
@@ -933,10 +933,9 @@ func (r *testRunner) runTest(
 		}
 		t.L().Printf("tearing down after %s; see teardown.log", s)
 	case <-time.After(timeout):
-		// NB: we're intentionally not failing the test if it hasn't
-		// already. This will be done at the very end of this method,
-		// after we've collected artifacts.
-		t.L().Printf("test timed out after %s; check __stacks.log and CRDB logs for goroutine dumps", timeout)
+		// NB: We're adding the timeout failure intentionally without cancelling the context
+		// to capture as much state as possible during artifact collection.
+		t.addFailure(0, "test timed out (%s)", timeout)
 		timedOut = true
 	}
 
@@ -1048,10 +1047,11 @@ func (r *testRunner) teardownTest(
 		// around so someone can poke at it.
 		_ = c.StopE(ctx, t.L(), option.DefaultStopOpts(), c.All())
 
-		// The hung test may, against all odds, still not have reported an error.
-		// We delayed it to improve artifacts collection, and now we ensure the test
-		// is marked as failing.
-		t.Errorf("test timed out (%s)", t.Spec().(*registry.TestSpec).Timeout)
+		// We previously added a timeout failure without cancellation, so we cancel here.
+		if t.mu.cancel != nil {
+			t.mu.cancel()
+		}
+		t.L().Printf("test timed out; check __stacks.log and CRDB logs for goroutine dumps")
 	}
 	return nil
 }