kvserver: force-campaign leaseholder on leader removal

Previously, when the leader was removed from the range via a conf change, the first voter in the range descriptor would campaign to avoid waiting for an election timeout. This had a few drawbacks: * If the first voter is unavailable or lags, noone will campaign. * If the first voter isn't the leaseholder, it has to immediately transfer leadership to the leaseholder. * It used Raft PreVote, so it wouldn't be able to win when CheckQuorum is enabled, since followers won't grant votes when they've recently heard from the leader. This patch instead campaigns on the current leaseholder. We know there can only be one, avoiding election ties. The conf change is typically proposed by the leaseholder anyway so it's likely to be up-to-date. And we want it to be colocated with the leader. If there is no leaseholder then waiting out the election timeout is less problematic, since either we'll have to wait out the lease interval anyway, or the range is idle. It also forces an election by transitioning directly to candidate, bypassing PreVote. This is ok, since we know the current leader is dead. To avoid election ties in mixed 23.1/23.2 clusters, we retain the old voter designation until the upgrade is finalized, but always force an election instead of using pre-vote. Epic: none Release note: None
cockroachdb · Jun 16, 2023 · 2959dda · 2959dda
1 parent 99c8d4e
commit 2959dda
Show file tree

Hide file tree

Showing 2 changed files with 155 additions and 29 deletions.
diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go
@@ -6479,3 +6479,106 @@ func TestRaftCheckQuorum(t *testing.T) {
 		})
 	})
 }
+
+// TestRaftLeaderRemovesItself tests that when a raft leader removes itself via
+// a conf change the leaseholder campaigns for leadership, ignoring
+// PreVote+CheckQuorum and transitioning directly to candidate.
+//
+// We set up three replicas:
+//
+// n1: Raft leader
+// n2: follower
+// n3: follower + leaseholder
+//
+// We disable leader following the leaseholder (which would otherwise transfer
+// leadership if we happened to tick during the conf change), and then remove n1
+// from the range. n3 should acquire leadership.
+//
+// We disable election timeouts, such that the only way n3 can become leader is
+// by campaigning explicitly. Furthermore, it must skip pre-votes, since with
+// PreVote+CheckQuorum n2 wouldn't vote for it (it would think n1 was still the
+// leader).
+func TestRaftLeaderRemovesItself(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	// Timing-sensitive, so skip under deadlock detector and stressrace.
+	skip.UnderDeadlock(t)
+	skip.UnderStressRace(t)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
+	defer cancel()
+
+	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
+		ReplicationMode: base.ReplicationManual,
+		ServerArgs: base.TestServerArgs{
+			RaftConfig: base.RaftConfig{
+				RaftEnableCheckQuorum: true,
+				RaftTickInterval:      100 * time.Millisecond, // speed up test
+				// Set a large election timeout. We don't want replicas to call
+				// elections due to timeouts, we want them to campaign and obtain
+				// votes despite PreVote+CheckQuorum.
+				RaftElectionTimeoutTicks: 200,
+			},
+			Knobs: base.TestingKnobs{
+				Store: &kvserver.StoreTestingKnobs{
+					DisableLeaderFollowsLeaseholder: true, // the leader should stay put
+				},
+			},
+		},
+	})
+	defer tc.Stopper().Stop(ctx)
+
+	logStatus := func(s *raft.Status) {
+		t.Helper()
+		require.NotNil(t, s)
+		t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit)
+	}
+
+	send1 := tc.GetFirstStoreFromServer(t, 0).TestSender()
+	send3 := tc.GetFirstStoreFromServer(t, 2).TestSender()
+
+	// Create a range, upreplicate it, and replicate a write.
+	key := tc.ScratchRange(t)
+	desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
+	_, pErr := kv.SendWrapped(ctx, send1, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{1, 1, 1})
+
+	repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+	repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID)
+	require.NoError(t, err)
+
+	// Move the lease to n3, and make sure everyone has applied it.
+	tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(2))
+	require.Eventually(t, func() bool {
+		lease, _ := repl3.GetLease()
+		return lease.Replica.ReplicaID == repl3.ReplicaID()
+	}, 10*time.Second, 500*time.Millisecond)
+	_, pErr = kv.SendWrapped(ctx, send3, incrementArgs(key, 1))
+	require.NoError(t, pErr.GoError())
+	tc.WaitForValues(t, key, []int64{2, 2, 2})
+	t.Logf("n3 has lease")
+
+	// Make sure n1 is still leader.
+	st := repl1.RaftStatus()
+	require.Equal(t, raft.StateLeader, st.RaftState)
+	logStatus(st)
+
+	// Remove n1 and wait for n3 to become leader.
+	tc.RemoveVotersOrFatal(t, key, tc.Target(0))
+	t.Logf("n1 removed from range")
+
+	require.Eventually(t, func() bool {
+		logStatus(repl2.RaftStatus())
+		logStatus(repl3.RaftStatus())
+		if repl3.RaftStatus().RaftState == raft.StateLeader {
+			t.Logf("n3 is leader")
+			return true
+		}
+		return false
+	}, 10*time.Second, 500*time.Millisecond)
+}
diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go
@@ -17,6 +17,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/cockroachdb/cockroach/pkg/clusterversion"
 	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply"
@@ -33,6 +34,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/uncertainty"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/storage"
 	"github.com/cockroachdb/cockroach/pkg/util"
 	"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
@@ -1166,12 +1168,15 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
 		r.deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked(ctx, raftGroup)
 
 		if stats.apply.numConfChangeEntries > 0 {
-			// If the raft leader got removed, campaign the first remaining voter.
-			//
-			// NB: this must be called after Advance() above since campaigning is
-			// a no-op in the presence of unapplied conf changes.
-			if shouldCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) {
-				r.campaignLocked(ctx)
+			// If the raft leader got removed, campaign on the leaseholder. Uses
+			// forceCampaignLocked() to bypass PreVote+CheckQuorum, since we otherwise
+			// wouldn't get prevotes from other followers who recently heard from the
+			// old leader. We know the leader isn't around anymore anyway.
+			leaseStatus := r.leaseStatusAtRLocked(ctx, r.store.Clock().NowAsClockTimestamp())
+			raftStatus := raftGroup.BasicStatus()
+			if shouldCampaignAfterConfChange(ctx, r.store.ClusterSettings(), r.store.StoreID(),
+				r.descRLocked(), raftStatus, leaseStatus) {
+				r.forceCampaignLocked(ctx)
 			}
 		}
 
@@ -2602,40 +2607,58 @@ func ComputeRaftLogSize(
 	return ms.SysBytes + totalSideloaded, nil
 }
 
+// shouldCampaignAfterConfChange returns true if the current replica should
+// campaign after a conf change. If the leader replica is removed, the
+// leaseholder should campaign. We don't want to campaign on multiple replicas,
+// since that would cause ties.
+//
+// If there is no current leaseholder we'll have to wait out the election
+// timeout before someone campaigns, but that's ok -- either we'll have to wait
+// for the lease to expire anyway, or the range is presumably idle.
+//
+// The caller should campaign using forceCampaignLocked(), transitioning
+// directly to candidate and bypassing PreVote+CheckQuorum. Otherwise it won't
+// receive prevotes since other replicas have heard from the leader recently.
 func shouldCampaignAfterConfChange(
 	ctx context.Context,
+	st *cluster.Settings,
 	storeID roachpb.StoreID,
 	desc *roachpb.RangeDescriptor,
-	raftGroup *raft.RawNode,
+	raftStatus raft.BasicStatus,
+	leaseStatus kvserverpb.LeaseStatus,
 ) bool {
-	// If a config change was carried out, it's possible that the Raft
-	// leader was removed. Verify that, and if so, campaign if we are
-	// the first remaining voter replica. Without this, the range will
-	// be leaderless (and thus unavailable) for a few seconds.
-	//
-	// We can't (or rather shouldn't) campaign on all remaining voters
-	// because that can lead to a stalemate. For example, three voters
-	// may all make it through PreVote and then reject each other.
-	st := raftGroup.BasicStatus()
-	if st.Lead == 0 {
-		// Leader unknown. This isn't what we expect in steady state, so we
-		// don't do anything.
+	if raftStatus.Lead == 0 {
+		// Leader unknown. We can't know if it was removed by the conf change, and
+		// because we force an election without prevote we don't want to risk
+		// throwing spurious elections.
+		return false
+	}
+	if raftStatus.RaftState == raft.StateLeader {
+		// We're already the leader, no point in campaigning.
 		return false
 	}
 	if !desc.IsInitialized() {
-		// We don't have an initialized, so we can't figure out who is supposed
-		// to campaign. It's possible that it's us and we're waiting for the
-		// initial snapshot, but it's hard to tell. Don't do anything.
+		// No descriptor, so we don't know if the leader has been removed. We
+		// don't expect to hit this, but let's be defensive.
 		return false
 	}
-	// If the leader is no longer in the descriptor but we are the first voter,
-	// campaign.
-	_, leaderStillThere := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(st.Lead))
-	if !leaderStillThere && storeID == desc.Replicas().VoterDescriptors()[0].StoreID {
-		log.VEventf(ctx, 3, "leader got removed by conf change")
-		return true
+	if _, ok := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(raftStatus.Lead)); ok {
+		// The leader is still in the descriptor.
+		return false
 	}
-	return false
+	// Prior to 23.2, the first voter in the descriptor campaigned, so we do
+	// the same in mixed-version clusters to avoid ties.
+	if !st.Version.IsActive(ctx, clusterversion.V23_2) {
+		if storeID != desc.Replicas().VoterDescriptors()[0].StoreID {
+			// We're not the designated campaigner.
+			return false
+		}
+	} else if !leaseStatus.OwnedBy(storeID) || !leaseStatus.IsValid() {
+		// We're not the leaseholder.
+		return false
+	}
+	log.VEventf(ctx, 3, "leader got removed by conf change, campaigning")
+	return true
 }
 
 // printRaftTail pretty-prints the tail of the log and returns it as a string,