From 608c9491b5f3d9ec52380a6f851758b026fab54f Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Tue, 25 Jul 2023 14:18:26 +0200 Subject: [PATCH 1/2] kvserver: fail gracefully in TestLeaseTransferRejectedIfTargetNeedsSnapshot We saw this test hang in CI. What likely happened (according to the stacks) is that a lease transfer that was supposed to be caught by an interceptor never showed up in the interceptor. The most likely explanation is that it errored out before it got to evaluation. It then signaled a channel the test was only prepared to check later, so the test hung (waiting for a channel that was now never to be touched). This test is hard to maintain. It would be great (though, for now, out of reach) to write tests like it in a deterministic framework[^1] [^1]: see https://github.com/cockroachdb/cockroach/issues/105177. For now, fix the test so that when the (so far unknown) error rears its head again, it will fail properly, so we get to see the error and can take another pass at fixing the test (separately). Stressing this commit[^2], we get: > transferErrC unexpectedly signaled: /Table/Max: transfer lease unexpected > error: refusing to transfer lease to (n3,s3):3 because target may need a Raft > snapshot: replica in StateProbe This makes sense. The test wants to exercise the below-raft mechanism, but the above-raft mechanism also exists and while we didn't want to interact with it, we sometimes do[^1] [^1]: somewhat related to https://github.com/cockroachdb/cockroach/issues/107524 [^2]: `./dev test --filter TestLeaseTransferRejectedIfTargetNeedsSnapshot --stress ./pkg/kv/kvserver/` on gceworker, 285s Touches #106383. Epic: None Release note: None --- pkg/kv/kvserver/client_replica_test.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go index 4737cd86676f..d1d27272d8da 100644 --- a/pkg/kv/kvserver/client_replica_test.go +++ b/pkg/kv/kvserver/client_replica_test.go @@ -3003,10 +3003,17 @@ func TestLeaseTransferRejectedIfTargetNeedsSnapshot(t *testing.T) { // Replica.AdminTransferLease. transferErrC := make(chan error, 1) if rejectAfterRevoke { - _ = tc.Stopper().RunAsyncTask(ctx, "transfer lease", func(ctx context.Context) { + require.NoError(t, tc.Stopper().RunAsyncTask(ctx, "transfer lease", func(ctx context.Context) { transferErrC <- tc.TransferRangeLease(*repl0.Desc(), tc.Target(2)) - }) - <-transferLeaseReqBlockedC + })) + select { + case <-transferLeaseReqBlockedC: + // Expected case: lease transfer triggered our interceptor and is now + // waiting there for transferLeaseReqUnblockedCh. + case err := <-transferErrC: + // Unexpected case: lease transfer errored out before making it into the filter. + t.Fatalf("transferErrC unexpectedly signaled: %v", err) + } } // Truncate the log at index+1 (log entries < N are removed, so this From 1c8c503d69fd0d1bd70ee6040acfc3e5fec840e7 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Tue, 25 Jul 2023 14:45:48 +0200 Subject: [PATCH 2/2] kvserver: deflake TestLeaseTransferRejectedIfTargetNeedsSnapshot See previous commit. We sometimes hit the above-raft check when we wanted to hit only the below-raft one. This commit fixes this by selectively disabling the above-raft check in this test. Note that not all tests using lease transfers are susceptible to this problem in the way that this test is. This is because this test also lowers the `LeaseTransferRejectedRetryLoopCount`[^1] because it is intentionally manufacturing failed lease transfers and doesn't want to sit out the retry loop. It is that time-saving optimization that also allows the spurious error to bubble up. [^1]: https://github.com/cockroachdb/cockroach/blob/66c9f93ae86bddd7ba4c5f6a6b8b6cb700ca23ce/pkg/kv/kvserver/testing_knobs.go#L375 Epic: none Release note: None --- pkg/kv/kvserver/client_replica_test.go | 3 +++ pkg/kv/kvserver/replica_range_lease.go | 2 +- pkg/kv/kvserver/testing_knobs.go | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go index d1d27272d8da..befceeb18488 100644 --- a/pkg/kv/kvserver/client_replica_test.go +++ b/pkg/kv/kvserver/client_replica_test.go @@ -2933,6 +2933,9 @@ func TestLeaseTransferRejectedIfTargetNeedsSnapshot(t *testing.T) { ServerArgs: base.TestServerArgs{ Knobs: base.TestingKnobs{ Store: &kvserver.StoreTestingKnobs{ + // If we're testing the below-raft check, disable the above-raft check. + // See: https://github.com/cockroachdb/cockroach/pull/107526 + DisableAboveRaftLeaseTransferSafetyChecks: rejectAfterRevoke, TestingRequestFilter: func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error { if rejectAfterRevoke && ba.IsSingleTransferLeaseRequest() { transferLeaseReqBlockOnce.Do(func() { diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index bd98eca5b655..b3615eac3244 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -949,7 +949,7 @@ func (r *Replica) AdminTransferLease( raftStatus := r.raftStatusRLocked() raftFirstIndex := r.raftFirstIndexRLocked() snapStatus := raftutil.ReplicaMayNeedSnapshot(raftStatus, raftFirstIndex, nextLeaseHolder.ReplicaID) - if snapStatus != raftutil.NoSnapshotNeeded && !bypassSafetyChecks { + if snapStatus != raftutil.NoSnapshotNeeded && !bypassSafetyChecks && !r.store.cfg.TestingKnobs.DisableAboveRaftLeaseTransferSafetyChecks { r.store.metrics.LeaseTransferErrorCount.Inc(1) log.VEventf(ctx, 2, "not initiating lease transfer because the target %s may "+ "need a snapshot: %s", nextLeaseHolder, snapStatus) diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index d8462940073b..7cb72b992956 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -225,6 +225,10 @@ type StoreTestingKnobs struct { // leadership when it diverges from the range's leaseholder. This can // also be set via COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER. DisableLeaderFollowsLeaseholder bool + // If set, the above-raft lease transfer safety checks (that verify that + // we don't transfer leases to followers that need a snapshot, etc) are + // disabled. The proposal-time checks are not affected by this knob. + DisableAboveRaftLeaseTransferSafetyChecks bool // DisableRefreshReasonNewLeader disables refreshing pending commands when a new // leader is discovered. DisableRefreshReasonNewLeader bool