diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go
index f106c7fadaa..081e697c4b1 100644
--- a/pkg/cmd/roachtest/tpcc.go
+++ b/pkg/cmd/roachtest/tpcc.go
@@ -804,7 +804,36 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 		iteration++
 		t.l.Printf("initializing cluster for %d warehouses (search attempt: %d)", warehouses, iteration)
 		m := newMonitor(ctx, c, roachNodes)
-		c.Stop(ctx, roachNodes)
+
+		// We overload the clusters in tpccbench, which can lead to transient infra
+		// failures. These are a) really annoying to debug and b) hide the actual
+		// passing warehouse count, making the line search sensitive to the choice
+		// of starting warehouses. Do a best-effort at waiting for the cloud VM(s)
+		// to recover without failing the line search.
+		var ok bool
+		for i := 0; i < 10; i++ {
+			if err := ctx.Err(); err != nil {
+				t.Fatal(err)
+			}
+			if err := c.StopE(ctx, roachNodes); err != nil {
+				t.l.Printf("unable to stop cluster; retrying to allow vm to recover: %s", err)
+				// We usually spend a long time blocking in StopE anyway, but just in case
+				// of a fast-failure mode, we still want to spend a little bit of time over
+				// the course of 10 retries to maximize the chances of things going back to
+				// working.
+				select {
+				case <-time.After(30 * time.Second):
+				case <-ctx.Done():
+				}
+				continue
+			}
+			ok = true
+			break
+		}
+		if !ok {
+			t.Fatalf("VM is hosed; giving up")
+		}
 		c.Start(ctx, t, append(b.startOpts(), roachNodes)...)
 		time.Sleep(restartWait)
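
For readers outside the roachtest codebase, here is a minimal standalone sketch of the best-effort retry shape the patch uses: a bounded number of attempts, an early exit when the context is canceled, and a fixed pause between failed attempts. The helper name retryWithContext, the attempt count, and the delay values are illustrative assumptions and are not part of the patch.

	// Sketch only: a generic bounded, context-aware retry helper modeled on
	// the loop added in the diff above. All names and values are hypothetical.
	package main

	import (
		"context"
		"errors"
		"fmt"
		"time"
	)

	// retryWithContext calls op up to attempts times, sleeping delay between
	// failures, and gives up early if the context is canceled.
	func retryWithContext(ctx context.Context, attempts int, delay time.Duration, op func(context.Context) error) error {
		var lastErr error
		for i := 0; i < attempts; i++ {
			if err := ctx.Err(); err != nil {
				return err
			}
			if lastErr = op(ctx); lastErr == nil {
				return nil
			}
			// Wait a bit before retrying, but respect context cancellation.
			select {
			case <-time.After(delay):
			case <-ctx.Done():
				return ctx.Err()
			}
		}
		return fmt.Errorf("giving up after %d attempts: %w", attempts, lastErr)
	}

	func main() {
		calls := 0
		// Succeeds on the third attempt; a short delay keeps the demo fast.
		err := retryWithContext(context.Background(), 10, 10*time.Millisecond, func(context.Context) error {
			calls++
			if calls < 3 {
				return errors.New("transient infra failure")
			}
			return nil
		})
		fmt.Println(err, calls)
	}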