Skip to content

Commit

Permalink
kvserver: force-campaign leaseholder on leader removal
Browse files Browse the repository at this point in the history
Previously, when the leader was removed from the range via a conf
change, the first voter in the range descriptor would campaign to avoid
waiting for an election timeout. This had a few drawbacks:

* If the first voter is unavailable or lags, noone will campaign.

* If the first voter isn't the leaseholder, it has to immediately
  transfer leadership to the leaseholder.

* It used Raft PreVote, so it wouldn't be able to win when CheckQuorum
  is enabled, since followers won't grant votes when they've recently
  heard from the leader.

This patch instead campaigns on the current leaseholder. We know there
can only be one, avoiding election ties. The conf change is typically
proposed by the leaseholder anyway so it's likely to be up-to-date. And
we want it to be colocated with the leader. If there is no leaseholder
then waiting out the election timeout is less problematic, since either
we'll have to wait out the lease interval anyway, or the range is idle.

It also forces an election by transitioning directly to candidate,
bypassing PreVote. This is ok, since we know the current leader is dead.

To avoid election ties in mixed 23.1/23.2 clusters, we retain the old
voter designation until the upgrade is finalized, but always force an
election instead of using pre-vote.

Epic: none
Release note: None
  • Loading branch information
erikgrinaker committed Jun 16, 2023
1 parent 99c8d4e commit 2959dda
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 29 deletions.
103 changes: 103 additions & 0 deletions pkg/kv/kvserver/client_raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6479,3 +6479,106 @@ func TestRaftCheckQuorum(t *testing.T) {
})
})
}

// TestRaftLeaderRemovesItself tests that when a raft leader removes itself via
// a conf change the leaseholder campaigns for leadership, ignoring
// PreVote+CheckQuorum and transitioning directly to candidate.
//
// We set up three replicas:
//
// n1: Raft leader
// n2: follower
// n3: follower + leaseholder
//
// We disable leader following the leaseholder (which would otherwise transfer
// leadership if we happened to tick during the conf change), and then remove n1
// from the range. n3 should acquire leadership.
//
// We disable election timeouts, such that the only way n3 can become leader is
// by campaigning explicitly. Furthermore, it must skip pre-votes, since with
// PreVote+CheckQuorum n2 wouldn't vote for it (it would think n1 was still the
// leader).
func TestRaftLeaderRemovesItself(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

// Timing-sensitive, so skip under deadlock detector and stressrace.
skip.UnderDeadlock(t)
skip.UnderStressRace(t)

ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
defer cancel()

tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
RaftConfig: base.RaftConfig{
RaftEnableCheckQuorum: true,
RaftTickInterval: 100 * time.Millisecond, // speed up test
// Set a large election timeout. We don't want replicas to call
// elections due to timeouts, we want them to campaign and obtain
// votes despite PreVote+CheckQuorum.
RaftElectionTimeoutTicks: 200,
},
Knobs: base.TestingKnobs{
Store: &kvserver.StoreTestingKnobs{
DisableLeaderFollowsLeaseholder: true, // the leader should stay put
},
},
},
})
defer tc.Stopper().Stop(ctx)

logStatus := func(s *raft.Status) {
t.Helper()
require.NotNil(t, s)
t.Logf("n%d %s at term=%d commit=%d", s.ID, s.RaftState, s.Term, s.Commit)
}

send1 := tc.GetFirstStoreFromServer(t, 0).TestSender()
send3 := tc.GetFirstStoreFromServer(t, 2).TestSender()

// Create a range, upreplicate it, and replicate a write.
key := tc.ScratchRange(t)
desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
_, pErr := kv.SendWrapped(ctx, send1, incrementArgs(key, 1))
require.NoError(t, pErr.GoError())
tc.WaitForValues(t, key, []int64{1, 1, 1})

repl1, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID)
require.NoError(t, err)
repl2, err := tc.GetFirstStoreFromServer(t, 1).GetReplica(desc.RangeID)
require.NoError(t, err)
repl3, err := tc.GetFirstStoreFromServer(t, 2).GetReplica(desc.RangeID)
require.NoError(t, err)

// Move the lease to n3, and make sure everyone has applied it.
tc.TransferRangeLeaseOrFatal(t, desc, tc.Target(2))
require.Eventually(t, func() bool {
lease, _ := repl3.GetLease()
return lease.Replica.ReplicaID == repl3.ReplicaID()
}, 10*time.Second, 500*time.Millisecond)
_, pErr = kv.SendWrapped(ctx, send3, incrementArgs(key, 1))
require.NoError(t, pErr.GoError())
tc.WaitForValues(t, key, []int64{2, 2, 2})
t.Logf("n3 has lease")

// Make sure n1 is still leader.
st := repl1.RaftStatus()
require.Equal(t, raft.StateLeader, st.RaftState)
logStatus(st)

// Remove n1 and wait for n3 to become leader.
tc.RemoveVotersOrFatal(t, key, tc.Target(0))
t.Logf("n1 removed from range")

require.Eventually(t, func() bool {
logStatus(repl2.RaftStatus())
logStatus(repl3.RaftStatus())
if repl3.RaftStatus().RaftState == raft.StateLeader {
t.Logf("n3 is leader")
return true
}
return false
}, 10*time.Second, 500*time.Millisecond)
}
81 changes: 52 additions & 29 deletions pkg/kv/kvserver/replica_raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply"
Expand All @@ -33,6 +34,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/uncertainty"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
Expand Down Expand Up @@ -1166,12 +1168,15 @@ func (r *Replica) handleRaftReadyRaftMuLocked(
r.deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked(ctx, raftGroup)

if stats.apply.numConfChangeEntries > 0 {
// If the raft leader got removed, campaign the first remaining voter.
//
// NB: this must be called after Advance() above since campaigning is
// a no-op in the presence of unapplied conf changes.
if shouldCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) {
r.campaignLocked(ctx)
// If the raft leader got removed, campaign on the leaseholder. Uses
// forceCampaignLocked() to bypass PreVote+CheckQuorum, since we otherwise
// wouldn't get prevotes from other followers who recently heard from the
// old leader. We know the leader isn't around anymore anyway.
leaseStatus := r.leaseStatusAtRLocked(ctx, r.store.Clock().NowAsClockTimestamp())
raftStatus := raftGroup.BasicStatus()
if shouldCampaignAfterConfChange(ctx, r.store.ClusterSettings(), r.store.StoreID(),
r.descRLocked(), raftStatus, leaseStatus) {
r.forceCampaignLocked(ctx)
}
}

Expand Down Expand Up @@ -2602,40 +2607,58 @@ func ComputeRaftLogSize(
return ms.SysBytes + totalSideloaded, nil
}

// shouldCampaignAfterConfChange returns true if the current replica should
// campaign after a conf change. If the leader replica is removed, the
// leaseholder should campaign. We don't want to campaign on multiple replicas,
// since that would cause ties.
//
// If there is no current leaseholder we'll have to wait out the election
// timeout before someone campaigns, but that's ok -- either we'll have to wait
// for the lease to expire anyway, or the range is presumably idle.
//
// The caller should campaign using forceCampaignLocked(), transitioning
// directly to candidate and bypassing PreVote+CheckQuorum. Otherwise it won't
// receive prevotes since other replicas have heard from the leader recently.
func shouldCampaignAfterConfChange(
ctx context.Context,
st *cluster.Settings,
storeID roachpb.StoreID,
desc *roachpb.RangeDescriptor,
raftGroup *raft.RawNode,
raftStatus raft.BasicStatus,
leaseStatus kvserverpb.LeaseStatus,
) bool {
// If a config change was carried out, it's possible that the Raft
// leader was removed. Verify that, and if so, campaign if we are
// the first remaining voter replica. Without this, the range will
// be leaderless (and thus unavailable) for a few seconds.
//
// We can't (or rather shouldn't) campaign on all remaining voters
// because that can lead to a stalemate. For example, three voters
// may all make it through PreVote and then reject each other.
st := raftGroup.BasicStatus()
if st.Lead == 0 {
// Leader unknown. This isn't what we expect in steady state, so we
// don't do anything.
if raftStatus.Lead == 0 {
// Leader unknown. We can't know if it was removed by the conf change, and
// because we force an election without prevote we don't want to risk
// throwing spurious elections.
return false
}
if raftStatus.RaftState == raft.StateLeader {
// We're already the leader, no point in campaigning.
return false
}
if !desc.IsInitialized() {
// We don't have an initialized, so we can't figure out who is supposed
// to campaign. It's possible that it's us and we're waiting for the
// initial snapshot, but it's hard to tell. Don't do anything.
// No descriptor, so we don't know if the leader has been removed. We
// don't expect to hit this, but let's be defensive.
return false
}
// If the leader is no longer in the descriptor but we are the first voter,
// campaign.
_, leaderStillThere := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(st.Lead))
if !leaderStillThere && storeID == desc.Replicas().VoterDescriptors()[0].StoreID {
log.VEventf(ctx, 3, "leader got removed by conf change")
return true
if _, ok := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(raftStatus.Lead)); ok {
// The leader is still in the descriptor.
return false
}
return false
// Prior to 23.2, the first voter in the descriptor campaigned, so we do
// the same in mixed-version clusters to avoid ties.
if !st.Version.IsActive(ctx, clusterversion.V23_2) {
if storeID != desc.Replicas().VoterDescriptors()[0].StoreID {
// We're not the designated campaigner.
return false
}
} else if !leaseStatus.OwnedBy(storeID) || !leaseStatus.IsValid() {
// We're not the leaseholder.
return false
}
log.VEventf(ctx, 3, "leader got removed by conf change, campaigning")
return true
}

// printRaftTail pretty-prints the tail of the log and returns it as a string,
Expand Down

0 comments on commit 2959dda

Please sign in to comment.