From a8b6592507e5cc98956cec96e5a3782399ac578e Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Tue, 11 Jul 2023 10:02:14 +0200 Subject: [PATCH] kvserver: prevent etcd-io/raft from dropping conf changes Relies on https://github.com/etcd-io/raft/pull/81. We don't want it to, since that causes issues due to false positives. We are taking responsibility for carrying out only valid conf changes, as we always have. See also https://github.com/etcd-io/raft/issues/80. Fixes #105797. Epic: CRDB-25287 Release note (bug fix): under rare circumstances, a replication change could get stuck when proposed near lease/leadership changes (and likely under overload), and the replica circuit breakers could trip. This problem has been addressed. Note to editors: this time it's really addressed (fingers crossed); a previous attempt with an identical release note had to be reverted. --- pkg/kv/kvserver/client_raft_test.go | 62 +++++++++++++++++++++++++++++ pkg/kv/kvserver/store.go | 25 ++++++------ 2 files changed, 75 insertions(+), 12 deletions(-) diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index 8a9265ec15af..45195602b806 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -5998,3 +5998,65 @@ func TestRaftSnapshotsWithMVCCRangeKeysEverywhere(t *testing.T) { require.Equal(t, kvpb.CheckConsistencyResponse_RANGE_CONSISTENT, result.Status, "%+v", result) } } + +// TestInvalidConfChangeRejection is a regression test for [1]. It proposes +// an (intentionally) invalid configuration change and makes sure that raft +// does not drop it. +// +// [1]: https://github.com/cockroachdb/cockroach/issues/105797 +func TestInvalidConfChangeRejection(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + // This is a regression test against a stuck command, so set a timeout to get + // a shot at a graceful failure on regression. + ctx, cancel := context.WithTimeout(context.Background(), testutils.DefaultSucceedsSoonDuration) + defer cancel() + + // When our configuration change shows up below raft, we need to apply it as a + // no-op, since the config change is intentionally invalid and assertions + // would fail if we were to try to actually apply it. + injErr := errors.New("injected error") + tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ + ReplicationMode: base.ReplicationManual, + ServerArgs: base.TestServerArgs{Knobs: base.TestingKnobs{Store: &kvserver.StoreTestingKnobs{ + TestingApplyCalledTwiceFilter: func(args kvserverbase.ApplyFilterArgs) (int, *kvpb.Error) { + if args.Req != nil && args.Req.Txn != nil && args.Req.Txn.Name == "fake" { + return 0, kvpb.NewError(injErr) + } + return 0, nil + }}}}, + }) + defer tc.Stopper().Stop(ctx) + + k := tc.ScratchRange(t) + + repl := tc.GetFirstStoreFromServer(t, 0).LookupReplica(keys.MustAddr(k)) + + // Try to leave a joint config even though we're not in one. This is something + // that will lead raft to propose an empty entry instead of our conf change. + // + // See: https://github.com/cockroachdb/cockroach/issues/105797 + var ba kvpb.BatchRequest + now := tc.Server(0).Clock().Now() + txn := roachpb.MakeTransaction("fake", k, roachpb.NormalUserPriority, now, 500*time.Millisecond.Nanoseconds(), 1) + ba.Txn = &txn + ba.Timestamp = now + ba.Add(&kvpb.EndTxnRequest{ + RequestHeader: kvpb.RequestHeader{ + Key: k, + }, + Commit: true, + InternalCommitTrigger: &roachpb.InternalCommitTrigger{ + ChangeReplicasTrigger: &roachpb.ChangeReplicasTrigger{ + Desc: repl.Desc(), + }, + }, + }) + + _, pErr := repl.Send(ctx, &ba) + // Verify that we see the configuration change below raft, where we rejected it + // (since it would've otherwise blow up the Replica: after all, we intentionally + // proposed an invalid configuration change. + require.True(t, errors.Is(pErr.GoError(), injErr), "%+v", pErr.GoError()) +} diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index adcb77503072..0bb69e148e66 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -291,18 +291,19 @@ func newRaftConfig( strg raft.Storage, id uint64, appliedIndex uint64, storeCfg StoreConfig, logger raft.Logger, ) *raft.Config { return &raft.Config{ - ID: id, - Applied: appliedIndex, - AsyncStorageWrites: true, - ElectionTick: storeCfg.RaftElectionTimeoutTicks, - HeartbeatTick: storeCfg.RaftHeartbeatIntervalTicks, - MaxUncommittedEntriesSize: storeCfg.RaftMaxUncommittedEntriesSize, - MaxCommittedSizePerReady: storeCfg.RaftMaxCommittedSizePerReady, - MaxSizePerMsg: storeCfg.RaftMaxSizePerMsg, - MaxInflightMsgs: storeCfg.RaftMaxInflightMsgs, - MaxInflightBytes: storeCfg.RaftMaxInflightBytes, - Storage: strg, - Logger: logger, + ID: id, + Applied: appliedIndex, + AsyncStorageWrites: true, + ElectionTick: storeCfg.RaftElectionTimeoutTicks, + HeartbeatTick: storeCfg.RaftHeartbeatIntervalTicks, + MaxUncommittedEntriesSize: storeCfg.RaftMaxUncommittedEntriesSize, + MaxCommittedSizePerReady: storeCfg.RaftMaxCommittedSizePerReady, + DisableConfChangeValidation: true, // see https://github.com/cockroachdb/cockroach/issues/105797 + MaxSizePerMsg: storeCfg.RaftMaxSizePerMsg, + MaxInflightMsgs: storeCfg.RaftMaxInflightMsgs, + MaxInflightBytes: storeCfg.RaftMaxInflightBytes, + Storage: strg, + Logger: logger, PreVote: true, }