diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go
index f106c7fadaa..081e697c4b1 100644
--- a/pkg/cmd/roachtest/tpcc.go
+++ b/pkg/cmd/roachtest/tpcc.go
@@ -804,7 +804,36 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
 		iteration++
 		t.l.Printf("initializing cluster for %d warehouses (search attempt: %d)", warehouses, iteration)
 		m := newMonitor(ctx, c, roachNodes)
-		c.Stop(ctx, roachNodes)
+
+		// We overload the clusters in tpccbench, which can lead to transient infra
+		// failures. These are a) really annoying to debug and b) hide the actual
+		// passing warehouse count, making the line search sensitive to the choice
+		// of starting warehouses. Do a best-effort at waiting for the cloud VM(s)
+		// to recover without failing the line search.
+		var ok bool
+		for i := 0; i < 10; i++ {
+			if err := ctx.Err(); err != nil {
+				t.Fatal(err)
+			}
+			if err := c.StopE(ctx, roachNodes); err != nil {
+				t.l.Printf("unable to stop cluster; retrying to allow vm to recover: %s", err)
+				// We usually spend a long time blocking in StopE anyway, but just in case
+				// of a fast-failure mode, we still want to spend a little bit of time over
+				// the course of 10 retries to maximize the chances of things going back to
+				// working.
+				select {
+				case <-time.After(30 * time.Second):
+				case <-ctx.Done():
+				}
+				continue
+			}
+			ok = true
+			break
+		}
+		if !ok {
+			t.Fatalf("VM is hosed; giving up")
+		}
 		c.Start(ctx, t, append(b.startOpts(), roachNodes)...)
 		time.Sleep(restartWait)
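
For readers outside the roachtest codebase, here is a minimal standalone sketch of the best-effort retry shape the patch uses: a bounded number of attempts, an early exit when the context is canceled, and a fixed pause between failed attempts. The helper name retryWithContext, the attempt count, and the delay values are illustrative assumptions and are not part of the patch.

	// Sketch only: a generic bounded, context-aware retry helper modeled on
	// the loop added in the diff above. All names and values are hypothetical.
	package main

	import (
		"context"
		"errors"
		"fmt"
		"time"
	)

	// retryWithContext calls op up to attempts times, sleeping delay between
	// failures, and gives up early if the context is canceled.
	func retryWithContext(ctx context.Context, attempts int, delay time.Duration, op func(context.Context) error) error {
		var lastErr error
		for i := 0; i < attempts; i++ {
			if err := ctx.Err(); err != nil {
				return err
			}
			if lastErr = op(ctx); lastErr == nil {
				return nil
			}
			// Wait a bit before retrying, but respect context cancellation.
			select {
			case <-time.After(delay):
			case <-ctx.Done():
				return ctx.Err()
			}
		}
		return fmt.Errorf("giving up after %d attempts: %w", attempts, lastErr)
	}

	func main() {
		calls := 0
		// Succeeds on the third attempt; a short delay keeps the demo fast.
		err := retryWithContext(context.Background(), 10, 10*time.Millisecond, func(context.Context) error {
			calls++
			if calls < 3 {
				return errors.New("transient infra failure")
			}
			return nil
		})
		fmt.Println(err, calls)
	}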