Skip to content

Commit

Permalink
Merge pull request #96903 from smg260/backport22.2-88514-88716-92667-…
Browse files Browse the repository at this point in the history
…95762-96743

release-22.2: roachtest: set default cluster settings, report timeout failures correctly, roachprod: capture ssh logs
  • Loading branch information
smg260 authored Mar 8, 2023
2 parents febfc5e + 54871f4 commit c71d4c2
Show file tree
Hide file tree
Showing 10 changed files with 436 additions and 397 deletions.
18 changes: 1 addition & 17 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -2112,29 +2112,13 @@ func (c *clusterImpl) Install(
return errors.Wrap(roachprod.Install(ctx, l, c.MakeNodes(nodes), software), "cluster.Install")
}

var reOnlyAlphanumeric = regexp.MustCompile(`[^a-zA-Z0-9]+`)

// cmdLogFileName comes up with a log file to use for the given argument string.
func cmdLogFileName(t time.Time, nodes option.NodeListOption, args ...string) string {
// Make sure we treat {"./cockroach start"} like {"./cockroach", "start"}.
args = strings.Split(strings.Join(args, " "), " ")
prefix := []string{reOnlyAlphanumeric.ReplaceAllString(args[0], "")}
for _, arg := range args[1:] {
if s := reOnlyAlphanumeric.ReplaceAllString(arg, ""); s != arg {
break
}
prefix = append(prefix, arg)
}
s := strings.Join(prefix, "_")
const maxLen = 70
if len(s) > maxLen {
s = s[:maxLen]
}
logFile := fmt.Sprintf(
"run_%s_n%s_%s",
t.Format(`150405.000000000`),
nodes.String()[1:],
s,
install.GenFilenameFromArgs(20, args...),
)
return logFile
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ func TestClusterMachineType(t *testing.T) {
func TestCmdLogFileName(t *testing.T) {
ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local)

const exp = `run_150412.000000000_n1,3-4,9_cockroach_bla`
const exp = `run_150412.000000000_n1,3-4,9_cockroach-bla-foo-ba`
nodes := option.NodeListOption{1, 3, 4, 9}
assert.Equal(t,
exp,
Expand Down
66 changes: 38 additions & 28 deletions pkg/cmd/roachtest/test_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,52 +283,47 @@ func collectErrors(args []interface{}) []error {
// ATTENTION: Since this calls panic(errTestFatal), it should only be called
// from a test's closure. The test runner itself should never call this.
func (t *testImpl) Fatal(args ...interface{}) {
t.addFailure("", args...)
t.addFailureAndCancel(1, "", args...)
panic(errTestFatal)
}

// Fatalf is like Fatal, but takes a format string.
func (t *testImpl) Fatalf(format string, args ...interface{}) {
t.addFailure(format, args...)
t.addFailureAndCancel(1, format, args...)
panic(errTestFatal)
}

// FailNow implements the TestingT interface.
func (t *testImpl) FailNow() {
t.addFailure("FailNow called")
t.addFailureAndCancel(1, "FailNow called")
panic(errTestFatal)
}

// Error implements the TestingT interface
func (t *testImpl) Error(args ...interface{}) {
t.addFailure("", args...)
t.addFailureAndCancel(1, "", args...)
}

// Errorf implements the TestingT interface.
func (t *testImpl) Errorf(format string, args ...interface{}) {
t.addFailure(format, args...)
t.addFailureAndCancel(1, format, args...)
}

// We take the first error from each failure which is the
// "squashed" error that contains all information of a failure
func formatFailure(b *strings.Builder, reportFailures ...failure) {
for i, failure := range reportFailures {
if i > 0 {
fmt.Fprintln(b)
}
file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr)
if !ok {
file, line, fn = "<unknown>", 0, "unknown"
}
fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr)
func (t *testImpl) addFailureAndCancel(depth int, format string, args ...interface{}) {
t.addFailure(depth+1, format, args...)
if t.mu.cancel != nil {
t.mu.cancel()
}
}

func (t *testImpl) addFailure(format string, args ...interface{}) {
// addFailure depth indicates how many stack frames to skip when reporting the
// site of the failure in logs. `0` will report the caller of addFailure, `1` the
// caller of the caller of addFailure, etc.
func (t *testImpl) addFailure(depth int, format string, args ...interface{}) {
if format == "" {
format = strings.Repeat(" %v", len(args))[1:]
}
reportFailure := newFailure(errors.NewWithDepthf(1, format, args...), collectErrors(args))
reportFailure := newFailure(errors.NewWithDepthf(depth+1, format, args...), collectErrors(args))

t.mu.Lock()
defer t.mu.Unlock()
Expand All @@ -339,29 +334,44 @@ func (t *testImpl) addFailure(format string, args ...interface{}) {
formatFailure(&b, reportFailure)
msg := b.String()

t.L().Printf("test failure #%d: %s", len(t.mu.failures), msg)
failureNum := len(t.mu.failures)
failureLog := fmt.Sprintf("failure_%d", failureNum)
t.L().Printf("test failure #%d: full stack retained in %s.log: %s", failureNum, failureLog, msg)
// Also dump the verbose error (incl. all stack traces) to a log file, in case
// we need it. The stacks are sometimes helpful, but we don't want them in the
// main log as they are highly verbose.
{
cl, err := t.L().ChildLogger(
fmt.Sprintf("failure_%d", len(t.mu.failures)),
logger.QuietStderr, logger.QuietStdout,
)
cl, err := t.L().ChildLogger(failureLog, logger.QuietStderr, logger.QuietStdout)
if err == nil {
// We don't actually log through this logger since it adds an unrelated
// file:line caller (namely ours). The error already has stack traces
// so it's better to write only it to the file to avoid confusion.
path := cl.File.Name()
if cl.File != nil {
path := cl.File.Name()
if len(path) > 0 {
_ = os.WriteFile(path, []byte(fmt.Sprintf("%+v", reportFailure.squashedErr)), 0644)
}
}
cl.Close() // we just wanted the filename
_ = os.WriteFile(path, []byte(fmt.Sprintf("%+v", reportFailure.squashedErr)), 0644)
}
}

t.mu.output = append(t.mu.output, msg...)
t.mu.output = append(t.mu.output, '\n')
if t.mu.cancel != nil {
t.mu.cancel()
}

// We take the first error from each failure which is the
// "squashed" error that contains all information of a failure
func formatFailure(b *strings.Builder, reportFailures ...failure) {
for i, failure := range reportFailures {
if i > 0 {
fmt.Fprintln(b)
}
file, line, fn, ok := errors.GetOneLineSource(failure.squashedErr)
if !ok {
file, line, fn = "<unknown>", 0, "unknown"
}
fmt.Fprintf(b, "(%s:%d).%s: %v", file, line, fn, failure.squashedErr)
}
}

Expand Down
16 changes: 8 additions & 8 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -933,10 +933,9 @@ func (r *testRunner) runTest(
}
t.L().Printf("tearing down after %s; see teardown.log", s)
case <-time.After(timeout):
// NB: we're intentionally not failing the test if it hasn't
// already. This will be done at the very end of this method,
// after we've collected artifacts.
t.L().Printf("test timed out after %s; check __stacks.log and CRDB logs for goroutine dumps", timeout)
// NB: We're adding the timeout failure intentionally without cancelling the context
// to capture as much state as possible during artifact collection.
t.addFailure(0, "test timed out (%s)", timeout)
timedOut = true
}

Expand Down Expand Up @@ -1048,10 +1047,11 @@ func (r *testRunner) teardownTest(
// around so someone can poke at it.
_ = c.StopE(ctx, t.L(), option.DefaultStopOpts(), c.All())

// The hung test may, against all odds, still not have reported an error.
// We delayed it to improve artifacts collection, and now we ensure the test
// is marked as failing.
t.Errorf("test timed out (%s)", t.Spec().(*registry.TestSpec).Timeout)
// We previously added a timeout failure without cancellation, so we cancel here.
if t.mu.cancel != nil {
t.mu.cancel()
}
t.L().Printf("test timed out; check __stacks.log and CRDB logs for goroutine dumps")
}
return nil
}
Expand Down
Loading

0 comments on commit c71d4c2

Please sign in to comment.