From 702ff6fa87c7494b240a16303ecf3e2a37c18e48 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Thu, 22 Dec 2022 13:23:57 -0500 Subject: [PATCH 1/5] kv: integrate raft async storage writes Fixes #17500. Waiting on github.com/cockroachdb/pebble/pull/2117. This commit integrates with the `AsyncStorageWrites` functionality that we added to Raft in github.com/etcd-io/raft/pull/8. \## Approach The commit makes the minimal changes needed to integrate with async storage writes and pull fsyncs out of the raft state machine loop. It does not make an effort to extract the non-durable portion of raft log writes or raft log application onto separate goroutine pools, as was described in #17500. Those changes will also be impactful, but they're non-trivial and bump into a pipelining vs. batching trade-off, so they are left as future work items (TODO(nvanbenschoten): open new issues). With this change, asynchronous Raft log syncs are enabled by the new `DB.ApplyNoSyncWait` Pebble API introduced in github.com/cockroachdb/pebble/pull/2117. The `handleRaftReady` state machine loop continues to initiate Raft log writes, but it uses the Pebble API to offload waiting on durability to a separate goroutine. This separate goroutine then sends the corresponding `MsgStorageAppend`'s response messages where they need to go (locally and/or to the Raft leader) when the fsync completes. The async storage writes functionality in Raft makes this all safe. \## Benchmark Results This change reduces interference between Raft proposals and, as a result, reduces end-to-end commit latency. github.com/etcd-io/raft/pull/8 presented a collection of benchmark results captured from integrating async storage writes with rafttoy. When integrated into CockroachDB, we see similar improvements to average and tail latency. However, it doesn't provide the throughput improvements at the top end because log appends and state machine application have not yet been extracted into separate goroutine pools, which would facilitate increased opportunity for batching. TODO: add images ---- Release note (performance improvement): The Raft proposal pipeline has been optimized to reduce interference between Raft proposals. This improves average and tail write latency at high concurrency.
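To make the new control flow concrete, here is a minimal sketch of the "sync waiter" pattern this commit adopts: the raft handler commits a batch without waiting for the fsync and hands the durability wait to a dedicated goroutine, which runs a callback (delivery of the `MsgStorageAppend` responses) once the write is durable. This is an illustrative sketch only; `syncWaiter`, `syncBatch`, `waiterLoop`, and `chanWaiter` are simplified stand-ins invented for this example, not the actual `logstore.SyncWaiterLoop` API added by the patch.

```go
// Illustrative sketch (not CockroachDB code): a dedicated goroutine waits for
// each in-flight log write to become durable and then runs its callback,
// keeping fsync waits off the raft state machine goroutine.
package main

import (
	"fmt"
	"time"
)

// syncWaiter abstracts "a disk write whose durability can be awaited". In the
// real change this role is played by a pebble batch committed without waiting
// for the sync, whose durability is awaited off the raft goroutine.
type syncWaiter interface {
	SyncWait() error // blocks until the write is durable
	Close()
}

// syncBatch pairs an in-flight write with the callback to run once it is
// durable (e.g. sending MsgStorageAppend responses back to the RawNode).
type syncBatch struct {
	w  syncWaiter
	cb func()
}

// waiterLoop processes enqueued writes in FIFO order.
func waiterLoop(q <-chan syncBatch, stop <-chan struct{}) {
	for {
		select {
		case b := <-q:
			if err := b.w.SyncWait(); err != nil {
				panic(err) // a failed log sync is fatal in the real code as well
			}
			b.w.Close()
			b.cb()
		case <-stop:
			return
		}
	}
}

// chanWaiter is a toy syncWaiter whose write becomes "durable" when the
// channel is closed.
type chanWaiter chan struct{}

func (c chanWaiter) SyncWait() error { <-c; return nil }
func (c chanWaiter) Close()          {}

func main() {
	q := make(chan syncBatch, 16)
	stop := make(chan struct{})
	go waiterLoop(q, stop)

	w := make(chanWaiter)
	done := make(chan struct{})
	// The "raft goroutine" enqueues the durability wait and immediately moves on.
	q <- syncBatch{w: w, cb: func() { close(done) }}
	time.AfterFunc(10*time.Millisecond, func() { close(w) }) // the fsync completes
	<-done
	fmt.Println("log write durable; responses delivered")
	close(stop)
}
```

The real SyncWaiterLoop added below additionally bounds its queue to Pebble's sync concurrency and hooks into the Store's stopper; see pkg/kv/kvserver/logstore/sync_waiter.go in this patch.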
--- pkg/kv/kvserver/logstore/BUILD.bazel | 5 + pkg/kv/kvserver/logstore/logstore.go | 167 ++++-- .../kvserver/logstore/logstore_bench_test.go | 20 +- pkg/kv/kvserver/logstore/sync_waiter.go | 120 ++++ pkg/kv/kvserver/logstore/sync_waiter_test.go | 93 +++ pkg/kv/kvserver/replica.go | 23 +- pkg/kv/kvserver/replica_raft.go | 534 +++++++++++------- pkg/kv/kvserver/replica_raftstorage.go | 2 + pkg/kv/kvserver/replica_test.go | 93 --- pkg/kv/kvserver/store.go | 5 + pkg/kv/kvserver/store_raft.go | 2 + 11 files changed, 702 insertions(+), 362 deletions(-) create mode 100644 pkg/kv/kvserver/logstore/sync_waiter.go create mode 100644 pkg/kv/kvserver/logstore/sync_waiter_test.go diff --git a/pkg/kv/kvserver/logstore/BUILD.bazel b/pkg/kv/kvserver/logstore/BUILD.bazel index e3cb78453749..7e29b6803e70 100644 --- a/pkg/kv/kvserver/logstore/BUILD.bazel +++ b/pkg/kv/kvserver/logstore/BUILD.bazel @@ -8,6 +8,7 @@ go_library( "sideload.go", "sideload_disk.go", "stateloader.go", + "sync_waiter.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/logstore", visibility = ["//visibility:public"], @@ -28,9 +29,11 @@ go_library( "//pkg/util/log", "//pkg/util/metric", "//pkg/util/protoutil", + "//pkg/util/stop", "//pkg/util/timeutil", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_errors//oserror", + "@com_github_cockroachdb_pebble//record", "@com_github_cockroachdb_redact//:redact", "@io_etcd_go_raft_v3//:raft", "@io_etcd_go_raft_v3//raftpb", @@ -43,6 +46,7 @@ go_test( srcs = [ "logstore_bench_test.go", "sideload_test.go", + "sync_waiter_test.go", ], args = ["-test.timeout=295s"], embed = [":logstore"], @@ -60,6 +64,7 @@ go_test( "//pkg/util/log", "//pkg/util/metric", "//pkg/util/protoutil", + "//pkg/util/stop", "//pkg/util/tracing", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_errors//oserror", diff --git a/pkg/kv/kvserver/logstore/logstore.go b/pkg/kv/kvserver/logstore/logstore.go index f1784431b0f5..490c4bcd6f7f 100644 --- a/pkg/kv/kvserver/logstore/logstore.go +++ b/pkg/kv/kvserver/logstore/logstore.go @@ -13,6 +13,7 @@ package logstore import ( "context" + "fmt" "sync" "time" @@ -43,25 +44,24 @@ var disableSyncRaftLog = settings.RegisterBoolSetting( envutil.EnvOrDefaultBool("COCKROACH_DISABLE_RAFT_LOG_SYNCHRONIZATION_UNSAFE", false), ) -// Ready contains the log entries and state to be saved to stable storage. This -// is a subset of raft.Ready relevant to log storage. All fields are read-only. -type Ready struct { - // The current state of a replica to be saved to stable storage. Empty if - // there is no update. - raftpb.HardState +var enableNonBlockingRaftLogSync = settings.RegisterBoolSetting( + settings.TenantWritable, + "kv.raft_log.non_blocking_synchronization.enabled", + "set to true to enable non-blocking synchronization on Raft log writes to "+ + "persistent storage. Setting to true does not risk data loss or data corruption "+ + "on server crashes, but can reduce write latency.", + envutil.EnvOrDefaultBool("COCKROACH_ENABLE_RAFT_LOG_NON_BLOCKING_SYNCHRONIZATION", true), +) - // Entries specifies entries to be saved to stable storage. Empty if there is - // no update. - Entries []raftpb.Entry +// MsgStorageAppend is a raftpb.Message with type MsgStorageAppend. +type MsgStorageAppend raftpb.Message - // MustSync indicates whether the HardState and Entries must be synchronously - // written to disk, or if an asynchronous write is permissible. - MustSync bool -} - -// MakeReady constructs a Ready struct from raft.Ready. 
-func MakeReady(from raft.Ready) Ready { - return Ready{HardState: from.HardState, Entries: from.Entries, MustSync: from.MustSync} +// MakeMsgStorageAppend constructs a MsgStorageAppend from a raftpb.Message. +func MakeMsgStorageAppend(m raftpb.Message) MsgStorageAppend { + if m.Type != raftpb.MsgStorageAppend { + panic(fmt.Sprintf("unexpected message type %s", m.Type)) + } + return MsgStorageAppend(m) } // RaftState stores information about the last entry and the size of the log. @@ -86,6 +86,8 @@ type AppendStats struct { PebbleBytes int64 Sync bool + // If true, PebbleEnd-PebbleBegin does not include the sync time. + NonBlocking bool } // Metrics contains metrics specific to the log storage. @@ -99,37 +101,70 @@ type LogStore struct { Engine storage.Engine Sideload SideloadStorage StateLoader StateLoader + SyncWaiter *SyncWaiterLoop EntryCache *raftentry.Cache Settings *cluster.Settings Metrics Metrics } +// SyncCallback is a callback that is notified when a raft log write has been +// durably committed to disk. The function is handed the response messages that +// are associated with the MsgStorageAppend that triggered the fsync. +type SyncCallback interface { + OnLogSync(context.Context, []raftpb.Message) +} + func newStoreEntriesBatch(eng storage.Engine) storage.Batch { // Use an unindexed batch because we don't need to read our writes, and // it is more efficient. return eng.NewUnindexedBatch(false /* writeOnly */) } -// StoreEntries persists newly appended Raft log Entries to the log storage. +// StoreEntries persists newly appended Raft log Entries to the log storage, +// then calls the provided callback with the input's response messages (if any) +// once the entries are durable. The durable log write may or may not be +// blocking (and therefore the callback may or may not be called synchronously), +// depending on the kv.raft_log.non_blocking_synchronization.enabled cluster +// setting. Either way, the effects of the log append will be immediately +// visible to readers of the Engine. +// // Accepts the state of the log before the operation, returns the state after. // Persists HardState atomically with, or strictly after Entries. func (s *LogStore) StoreEntries( - ctx context.Context, state RaftState, rd Ready, stats *AppendStats, + ctx context.Context, state RaftState, m MsgStorageAppend, cb SyncCallback, stats *AppendStats, ) (RaftState, error) { batch := newStoreEntriesBatch(s.Engine) - defer batch.Close() - return s.storeEntriesAndCommitBatch(ctx, state, rd, stats, batch) + return s.storeEntriesAndCommitBatch(ctx, state, m, cb, stats, batch) } +// storeEntriesAndCommitBatch is like StoreEntries, but it accepts a +// storage.Batch, which it takes responsibility for committing and closing. func (s *LogStore) storeEntriesAndCommitBatch( - ctx context.Context, state RaftState, rd Ready, stats *AppendStats, batch storage.Batch, + ctx context.Context, + state RaftState, + m MsgStorageAppend, + cb SyncCallback, + stats *AppendStats, + batch storage.Batch, ) (RaftState, error) { + // Before returning, Close the batch if we haven't handed ownership of it to a + // SyncWaiterLoop. If batch == nil, SyncWaiterLoop is responsible for closing + // it once the in-progress disk writes complete. 
+ defer func() { + if batch != nil { + defer batch.Close() + } + }() + prevLastIndex := state.LastIndex - if len(rd.Entries) > 0 { + overwriting := false + if len(m.Entries) > 0 { + firstPurge := m.Entries[0].Index // first new entry written + overwriting = firstPurge <= prevLastIndex stats.Begin = timeutil.Now() // All of the entries are appended to distinct keys, returning a new // last index. - thinEntries, numSideloaded, sideLoadedEntriesSize, otherEntriesSize, err := MaybeSideloadEntries(ctx, rd.Entries, s.Sideload) + thinEntries, numSideloaded, sideLoadedEntriesSize, otherEntriesSize, err := MaybeSideloadEntries(ctx, m.Entries, s.Sideload) if err != nil { const expl = "during sideloading" return RaftState{}, errors.Wrap(err, expl) @@ -148,16 +183,21 @@ func (s *LogStore) storeEntriesAndCommitBatch( stats.End = timeutil.Now() } - if !raft.IsEmptyHardState(rd.HardState) { + hs := raftpb.HardState{ + Term: m.Term, + Vote: m.Vote, + Commit: m.Commit, + } + if !raft.IsEmptyHardState(hs) { // NB: Note that without additional safeguards, it's incorrect to write - // the HardState before appending rd.Entries. When catching up, a follower + // the HardState before appending m.Entries. When catching up, a follower // will receive Entries that are immediately Committed in the same // Ready. If we persist the HardState but happen to lose the Entries, // assertions can be tripped. // // We have both in the same batch, so there's no problem. If that ever // changes, we must write and sync the Entries before the HardState. - if err := s.StateLoader.SetHardState(ctx, batch, rd.HardState); err != nil { + if err := s.StateLoader.SetHardState(ctx, batch, hs); err != nil { const expl = "during setHardState" return RaftState{}, errors.Wrap(err, expl) } @@ -167,9 +207,9 @@ func (s *LogStore) storeEntriesAndCommitBatch( // // Note that the data is visible to other goroutines before it is synced to // disk. This is fine. The important constraints are that these syncs happen - // before Raft messages are sent and before the call to RawNode.Advance. Our - // regular locking is sufficient for this and if other goroutines can see the - // data early, that's fine. In particular, snapshots are not a problem (I + // before the MsgStorageAppend's responses are delivered back to the RawNode. + // Our regular locking is sufficient for this and if other goroutines can see + // the data early, that's fine. In particular, snapshots are not a problem (I // think they're the only thing that might access log entries or HardState // from other goroutines). Snapshots do not include either the HardState or // uncommitted log entries, and even if they did include log entries that @@ -182,25 +222,57 @@ func (s *LogStore) storeEntriesAndCommitBatch( // (Replica), so this comment might need to move. stats.PebbleBegin = timeutil.Now() stats.PebbleBytes = int64(batch.Len()) - sync := rd.MustSync && !disableSyncRaftLog.Get(&s.Settings.SV) - if err := batch.Commit(sync); err != nil { - const expl = "while committing batch" - return RaftState{}, errors.Wrap(err, expl) - } - stats.Sync = sync - stats.PebbleEnd = timeutil.Now() - if rd.MustSync { - s.Metrics.RaftLogCommitLatency.RecordValue(stats.PebbleEnd.Sub(stats.PebbleBegin).Nanoseconds()) + wantsSync := len(m.Responses) > 0 + willSync := wantsSync && !disableSyncRaftLog.Get(&s.Settings.SV) + // Use the non-blocking log sync path if we are performing a log sync, the + // cluster setting to do so is enabled, and we are not overwriting any + // previous log entries. 
If we are overwriting, we may need to purge the + sideloaded SSTables associated with overwritten entries. This must be + performed after the corresponding entries are durably replaced and it's + easier to ensure proper ordering using a blocking log sync. This is a + rare case, so it's not worth optimizing for. + nonBlockingSync := willSync && enableNonBlockingRaftLogSync.Get(&s.Settings.SV) && !overwriting + if nonBlockingSync { + // If non-blocking synchronization is enabled, apply the batched updates to + // the engine and initiate a synchronous disk write, but don't wait for the + // write to complete. Instead, enqueue that waiting on the SyncWaiterLoop, + // which will signal the callback when the write completes. + if err := batch.CommitNoSyncWait(); err != nil { + const expl = "while committing batch without sync wait" + return RaftState{}, errors.Wrap(err, expl) + } + stats.PebbleEnd = timeutil.Now() + s.SyncWaiter.enqueue(ctx, batch, func() { + // NOTE: run on the SyncWaiterLoop goroutine. + logCommitEnd := timeutil.Now() + s.Metrics.RaftLogCommitLatency.RecordValue(logCommitEnd.Sub(stats.PebbleBegin).Nanoseconds()) + cb.OnLogSync(ctx, m.Responses) + }) + // Do not Close batch on return. Will be Closed by SyncWaiterLoop. + batch = nil + } else { + if err := batch.Commit(willSync); err != nil { + const expl = "while committing batch" + return RaftState{}, errors.Wrap(err, expl) + } + stats.PebbleEnd = timeutil.Now() + if wantsSync { + logCommitEnd := stats.PebbleEnd + s.Metrics.RaftLogCommitLatency.RecordValue(logCommitEnd.Sub(stats.PebbleBegin).Nanoseconds()) + cb.OnLogSync(ctx, m.Responses) + } } + stats.Sync = wantsSync + stats.NonBlocking = nonBlockingSync - if len(rd.Entries) > 0 { + if overwriting { // We may have just overwritten parts of the log which contain // sideloaded SSTables from a previous term (and perhaps discarded some // entries that we didn't overwrite). Remove any such leftover on-disk // payloads (we can do that now because we've committed the deletion // just above). - firstPurge := rd.Entries[0].Index // first new entry written - purgeTerm := rd.Entries[0].Term - 1 + firstPurge := m.Entries[0].Index // first new entry written + purgeTerm := m.Entries[0].Term - 1 lastPurge := prevLastIndex // old end of the log, include in deletion purgedSize, err := maybePurgeSideloaded(ctx, s.Sideload, firstPurge, lastPurge, purgeTerm) if err != nil { @@ -216,7 +288,16 @@ func (s *LogStore) storeEntriesAndCommitBatch( // Update raft log entry cache. We clear any older, uncommitted log entries // and cache the latest ones. - s.EntryCache.Add(s.RangeID, rd.Entries, true /* truncate */) + // + // In the blocking log sync case, these entries are already durable. In the + // non-blocking case, these entries have been written to the pebble engine (so + // reads of the engine will see them), but they are not yet durable. This + // means that the entry cache can lead the durable log. This is allowed by + // etcd/raft, which maintains its own tracking of entry durability by + // splitting its log into an unstable portion for entries that are not known + // to be durable and a stable portion for entries that are known to be + // durable.
+ s.EntryCache.Add(s.RangeID, m.Entries, true /* truncate */) return state, nil } diff --git a/pkg/kv/kvserver/logstore/logstore_bench_test.go b/pkg/kv/kvserver/logstore/logstore_bench_test.go index 0c3f8e0473e2..ebfdfd924bf3 100644 --- a/pkg/kv/kvserver/logstore/logstore_bench_test.go +++ b/pkg/kv/kvserver/logstore/logstore_bench_test.go @@ -36,6 +36,10 @@ func (b *discardBatch) Commit(bool) error { return nil } +type noopSyncCallback struct{} + +func (noopSyncCallback) OnLogSync(context.Context, []raftpb.Message) {} + func BenchmarkLogStore_StoreEntries(b *testing.B) { defer log.Scope(b).Close(b) const kb = 1 << 10 @@ -48,23 +52,25 @@ func BenchmarkLogStore_StoreEntries(b *testing.B) { } func runBenchmarkLogStore_StoreEntries(b *testing.B, bytes int64) { + ctx := context.Background() const tenMB = 10 * 1 << 20 ec := raftentry.NewCache(tenMB) const rangeID = 1 eng := storage.NewDefaultInMemForTesting() defer eng.Close() + st := cluster.MakeTestingClusterSettings() + enableNonBlockingRaftLogSync.Override(ctx, &st.SV, false) s := LogStore{ RangeID: rangeID, Engine: eng, StateLoader: NewStateLoader(rangeID), EntryCache: ec, - Settings: cluster.MakeTestingClusterSettings(), + Settings: st, Metrics: Metrics{ RaftLogCommitLatency: metric.NewHistogram(metric.Metadata{}, 10*time.Second, metric.IOLatencyBuckets), }, } - ctx := context.Background() rs := RaftState{ LastTerm: 1, ByteSize: 0, @@ -89,17 +95,13 @@ func runBenchmarkLogStore_StoreEntries(b *testing.B, bytes int64) { batch := &discardBatch{} for i := 0; i < b.N; i++ { batch.Batch = newStoreEntriesBatch(eng) - rd := Ready{ - HardState: raftpb.HardState{}, - Entries: ents, - MustSync: true, - } + m := MsgStorageAppend{Entries: ents} + cb := noopSyncCallback{} var err error - rs, err = s.storeEntriesAndCommitBatch(ctx, rs, rd, stats, batch) + rs, err = s.storeEntriesAndCommitBatch(ctx, rs, m, cb, stats, batch) if err != nil { b.Fatal(err) } - batch.Batch.Close() ents[0].Index++ } require.EqualValues(b, b.N, rs.LastIndex) diff --git a/pkg/kv/kvserver/logstore/sync_waiter.go b/pkg/kv/kvserver/logstore/sync_waiter.go new file mode 100644 index 000000000000..e62f482b8579 --- /dev/null +++ b/pkg/kv/kvserver/logstore/sync_waiter.go @@ -0,0 +1,120 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package logstore + +import ( + "context" + "time" + + "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/cockroachdb/pebble/record" +) + +// syncWaiter is capable of waiting for a disk write to be durably committed. +type syncWaiter interface { + // SyncWait waits for the write to be durable. + SyncWait() error + // Close closes the syncWaiter and releases associated resources. + // Must be called after SyncWait returns. + Close() +} + +var _ syncWaiter = storage.Batch(nil) + +// SyncWaiterLoop waits on a sequence of in-progress disk writes, notifying +// callbacks when their corresponding disk writes have completed. +// Invariant: The callbacks are notified in the order that they were enqueued +// and without concurrency. 
+type SyncWaiterLoop struct { + q chan syncBatch + stopped chan struct{} + + logEveryEnqueueBlocked log.EveryN +} + +type syncBatch struct { + wg syncWaiter + cb func() +} + +// NewSyncWaiterLoop constructs a SyncWaiterLoop. It must be Started before use. +func NewSyncWaiterLoop() *SyncWaiterLoop { + return &SyncWaiterLoop{ + // We size the waiter loop's queue to the same size as Pebble's sync + // concurrency. This is the maximum number of pending syncWaiter's that + // pebble allows. + q: make(chan syncBatch, record.SyncConcurrency), + stopped: make(chan struct{}), + logEveryEnqueueBlocked: log.Every(1 * time.Second), + } +} + +// Start launches the loop. +func (w *SyncWaiterLoop) Start(ctx context.Context, stopper *stop.Stopper) { + _ = stopper.RunAsyncTaskEx(ctx, + stop.TaskOpts{ + TaskName: "raft-logstore-sync-waiter-loop", + // This task doesn't reference a parent because it runs for the server's + // lifetime. + SpanOpt: stop.SterileRootSpan, + }, + func(ctx context.Context) { + w.waitLoop(ctx, stopper) + }) +} + +// waitLoop pulls off the SyncWaiterLoop's queue. For each syncWaiter, it waits +// for the sync to complete and then calls the associated callback. +func (w *SyncWaiterLoop) waitLoop(ctx context.Context, stopper *stop.Stopper) { + defer close(w.stopped) + for { + select { + case w := <-w.q: + if err := w.wg.SyncWait(); err != nil { + log.Fatalf(ctx, "SyncWait error: %+v", err) + } + w.wg.Close() + w.cb() + case <-stopper.ShouldQuiesce(): + return + } + } +} + +// enqueue registers the syncWaiter with the SyncWaiterLoop. The provided +// callback will be called once the syncWaiter's associated disk write has been +// durably committed. It may never be called in case the stopper stops. +// +// The syncWaiter will be Closed after its SyncWait method completes. It must +// not be Closed by the caller. +// +// If the SyncWaiterLoop has already been stopped, the callback will never be +// called. +func (w *SyncWaiterLoop) enqueue(ctx context.Context, wg syncWaiter, cb func()) { + b := syncBatch{wg, cb} + select { + case w.q <- b: + case <-w.stopped: + default: + if w.logEveryEnqueueBlocked.ShouldLog() { + // NOTE: we don't expect to hit this because we size the enqueue channel + // with enough capacity to hold as many in-progress sync operations as + // Pebble allows (pebble/record.SyncConcurrency). + log.Warningf(ctx, "SyncWaiterLoop.enqueue blocking due to insufficient channel capacity") + } + select { + case w.q <- b: + case <-w.stopped: + } + } +} diff --git a/pkg/kv/kvserver/logstore/sync_waiter_test.go b/pkg/kv/kvserver/logstore/sync_waiter_test.go new file mode 100644 index 000000000000..ffc12492933f --- /dev/null +++ b/pkg/kv/kvserver/logstore/sync_waiter_test.go @@ -0,0 +1,93 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +package logstore + +import ( + "context" + "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/stop" +) + +func TestSyncWaiterLoop(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + stopper := stop.NewStopper() + w := NewSyncWaiterLoop() + w.Start(ctx, stopper) + + // Enqueue a waiter while the loop is running. + c := make(chan struct{}) + wg1 := make(chanSyncWaiter) + w.enqueue(ctx, wg1, func() { close(c) }) + + // Callback is not called before SyncWait completes. + select { + case <-c: + t.Fatal("callback unexpectedly called before SyncWait") + case <-time.After(5 * time.Millisecond): + } + + // Callback is called after SyncWait completes. + close(wg1) + <-c + + // Enqueue a waiter once the loop is stopped. Enqueuing should not block, + // regardless of how many times it is called. + stopper.Stop(ctx) + wg2 := make(chanSyncWaiter) + for i := 0; i < 2*cap(w.q); i++ { + w.enqueue(ctx, wg2, func() { t.Fatalf("callback unexpectedly called") }) + } + + // Callback should not be called, even after SyncWait completes. + // NB: stopper.Stop waits for the waitLoop to exit. + time.Sleep(5 * time.Millisecond) // give time to catch bugs + close(wg2) + time.Sleep(5 * time.Millisecond) // give time to catch bugs +} + +func BenchmarkSyncWaiterLoop(b *testing.B) { + ctx := context.Background() + stopper := stop.NewStopper() + defer stopper.Stop(ctx) + w := NewSyncWaiterLoop() + w.Start(ctx, stopper) + + // Pre-allocate a syncWaiter, notification channel, and callback function that + // can all be re-used across benchmark iterations so we can isolate the + // performance of operations inside the SyncWaiterLoop. + wg := make(chanSyncWaiter) + c := make(chan struct{}) + cb := func() { c <- struct{}{} } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + w.enqueue(ctx, wg, cb) + wg <- struct{}{} + <-c + } +} + +// chanSyncWaiter implements the syncWaiter interface. +type chanSyncWaiter chan struct{} + +func (c chanSyncWaiter) SyncWait() error { + <-c + return nil +} + +func (c chanSyncWaiter) Close() {} diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index 16ed5104010e..eb72b9c650e2 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -55,6 +55,7 @@ import ( "github.com/cockroachdb/redact" "github.com/kr/pretty" "go.etcd.io/raft/v3" + "go.etcd.io/raft/v3/raftpb" "go.etcd.io/raft/v3/tracker" ) @@ -260,6 +261,18 @@ type Replica struct { decoder replicaDecoder } + // localMsgs contains a collection of raftpb.Message that target the local + // RawNode. They are to be delivered on the next iteration of handleRaftReady. + // + // Locking notes: + // - Replica.localMsgs must be held to append messages to active. + // - Replica.raftMu and Replica.localMsgs must both be held to switch slices. + // - Replica.raftMu < Replica.localMsgs + localMsgs struct { + syncutil.Mutex + active, recycled []raftpb.Message + } + // The last seen replica descriptors from incoming Raft messages. These are // stored so that the replica still knows the replica descriptors for itself // and for its message recipients in the circumstances when its RangeDescriptor @@ -384,11 +397,11 @@ type Replica struct { mergeTxnID uuid.UUID // The state of the Raft state machine. state kvserverpb.ReplicaState - // Last index/term persisted to the raft log (not necessarily - // committed). 
Note that lastTerm may be 0 (and thus invalid) even when - // lastIndex is known, in which case the term will have to be retrieved - // from the Raft log entry. Use the invalidLastTerm constant for this - // case. + // Last index/term written to the raft log (not necessarily durable + // locally or committed by the group). Note that lastTerm may be 0 (and + // thus invalid) even when lastIndex is known, in which case the term + // will have to be retrieved from the Raft log entry. Use the + // invalidLastTerm constant for this case. lastIndex, lastTerm uint64 // A map of raft log index of pending snapshots to deadlines. // Used to prohibit raft log truncations that would leave a gap between diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index f09bdf33cdac..0a0b9b3b248c 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -610,7 +610,11 @@ func (s handleRaftReadyStats) SafeFormat(p redact.SafePrinter, _ rune) { { var sync redact.SafeString if s.append.Sync { - sync = "-sync" + if s.append.NonBlocking { + sync = "-non-blocking-sync" + } else { + sync = "-sync" + } } p.Printf("raft ready handling: %.2fs [append=%.2fs, apply=%.2fs, commit-batch%s=%.2fs", dTotal.Seconds(), dAppend.Seconds(), dApply.Seconds(), sync, dPebble.Seconds()) @@ -625,6 +629,9 @@ func (s handleRaftReadyStats) SafeFormat(p redact.SafePrinter, _ rune) { ) if s.append.Sync { p.SafeString(" sync") + if s.append.NonBlocking { + p.SafeString("(non-blocking)") + } } p.SafeString(" [") @@ -707,7 +714,9 @@ func (r *Replica) handleRaftReadyRaftMuLocked( } var hasReady bool - var rd raft.Ready + var softState *raft.SoftState + var outboundMsgs []raftpb.Message + var msgStorageAppend, msgStorageApply raftpb.Message r.mu.Lock() state := logstore.RaftState{ // used for append below LastIndex: r.mu.lastIndex, @@ -717,12 +726,18 @@ func (r *Replica) handleRaftReadyRaftMuLocked( leaderID := r.mu.leaderID lastLeaderID := leaderID err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { + r.deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked(ctx, raftGroup) + numFlushed, err := r.mu.proposalBuf.FlushLockedWithRaftGroup(ctx, raftGroup) if err != nil { return false, err } if hasReady = raftGroup.HasReady(); hasReady { - rd = raftGroup.Ready() + syncRd := raftGroup.Ready() + logRaftReady(ctx, syncRd) + asyncRd := makeAsyncReady(syncRd) + softState = asyncRd.SoftState + outboundMsgs, msgStorageAppend, msgStorageApply = splitLocalStorageMsgs(asyncRd.Messages) } // We unquiesce if we have a Ready (= there's work to do). We also have // to unquiesce if we just flushed some proposals but there isn't a @@ -739,7 +754,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked( unquiesceAndWakeLeader := hasReady || numFlushed > 0 || len(r.mu.proposals) > 0 return unquiesceAndWakeLeader, nil }) - r.mu.applyingEntries = len(rd.CommittedEntries) > 0 + r.mu.applyingEntries = hasMsg(msgStorageApply) pausedFollowers := r.mu.pausedFollowers r.mu.Unlock() if errors.Is(err, errRemoved) { @@ -761,10 +776,8 @@ func (r *Replica) handleRaftReadyRaftMuLocked( return stats, nil } - logRaftReady(ctx, rd) - refreshReason := noReason - if rd.SoftState != nil && leaderID != roachpb.ReplicaID(rd.SoftState.Lead) { + if softState != nil && leaderID != roachpb.ReplicaID(softState.Lead) { // Refresh pending commands if the Raft leader has changed. This is usually // the first indication we have of a new leader on a restarted node. 
// @@ -773,17 +786,68 @@ func (r *Replica) handleRaftReadyRaftMuLocked( // indicating a newly elected leader or a conf change. Replay protection // prevents any corruption, so the waste is only a performance issue. if log.V(3) { - log.Infof(ctx, "raft leader changed: %d -> %d", leaderID, rd.SoftState.Lead) + log.Infof(ctx, "raft leader changed: %d -> %d", leaderID, softState.Lead) } if !r.store.TestingKnobs().DisableRefreshReasonNewLeader { refreshReason = reasonNewLeader } - leaderID = roachpb.ReplicaID(rd.SoftState.Lead) + leaderID = roachpb.ReplicaID(softState.Lead) } - if inSnap.Desc != nil { - if !raft.IsEmptySnap(rd.Snapshot) { - snapUUID, err := uuid.FromBytes(rd.Snapshot.Data) + r.traceMessageSends(outboundMsgs, "sending messages") + r.sendRaftMessages(ctx, outboundMsgs, pausedFollowers) + + // If the ready struct includes entries that have been committed, these + // entries will be applied to the Replica's replicated state machine down + // below, after appending new entries to the raft log and sending messages + // to peers. However, the process of appending new entries to the raft log + // and then applying committed entries to the state machine can take some + // time - and these entries are already durably committed. If they have + // clients waiting on them, we'd like to acknowledge their success as soon + // as possible. To facilitate this, we take a quick pass over the committed + // entries and acknowledge as many as we can trivially prove will not be + // rejected beneath raft. + // + // Note that the CommittedEntries slice may contain entries that are also in + // the Entries slice (to be appended in this ready pass). This can happen when + // a follower is being caught up on committed commands. We could acknowledge + // these commands early even though they aren't durably in the local raft log + // yet (since they're committed via a quorum elsewhere), but we chose to be + // conservative and avoid it by passing the last Ready cycle's `lastIndex` for + // the maxIndex argument to AckCommittedEntriesBeforeApplication. + // + // TODO(nvanbenschoten): this is less important with async storage writes. + // Consider getting rid of it. + sm := r.getStateMachine() + dec := r.getDecoder() + var appTask apply.Task + if hasMsg(msgStorageApply) { + appTask = apply.MakeTask(sm, dec) + appTask.SetMaxBatchSize(r.store.TestingKnobs().MaxApplicationBatchSize) + defer appTask.Close() + if err := appTask.Decode(ctx, msgStorageApply.Entries); err != nil { + return stats, err + } + if knobs := r.store.TestingKnobs(); knobs == nil || !knobs.DisableCanAckBeforeApplication { + if err := appTask.AckCommittedEntriesBeforeApplication(ctx, state.LastIndex); err != nil { + return stats, err + } + } + } + + if hasMsg(msgStorageAppend) { + if msgStorageAppend.Snapshot != nil { + if inSnap.Desc == nil { + // If we didn't expect Raft to have a snapshot but it has one + // regardless, that is unexpected and indicates a programming + // error. 
+ return stats, errors.AssertionFailedf( + "have inSnap=nil, but raft has a snapshot %s", + raft.DescribeSnapshot(*msgStorageAppend.Snapshot), + ) + } + + snapUUID, err := uuid.FromBytes(msgStorageAppend.Snapshot.Data) if err != nil { return stats, errors.Wrap(err, "invalid snapshot id") } @@ -794,6 +858,16 @@ func (r *Replica) handleRaftReadyRaftMuLocked( log.Fatalf(ctx, "incoming snapshot id doesn't match raft snapshot id: %s != %s", snapUUID, inSnap.SnapUUID) } + snap := *msgStorageAppend.Snapshot + hs := raftpb.HardState{ + Term: msgStorageAppend.Term, + Vote: msgStorageAppend.Vote, + Commit: msgStorageAppend.Commit, + } + if len(msgStorageAppend.Entries) != 0 { + log.Fatalf(ctx, "found Entries in MsgStorageAppend with non-empty Snapshot") + } + // Applying this snapshot may require us to subsume one or more of our right // neighbors. This occurs if this replica is informed about the merges via a // Raft snapshot instead of a MsgApp containing the merge commits, e.g., @@ -803,7 +877,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked( defer releaseMergeLock() stats.tSnapBegin = timeutil.Now() - if err := r.applySnapshot(ctx, inSnap, rd.Snapshot, rd.HardState, subsumedRepls); err != nil { + if err := r.applySnapshot(ctx, inSnap, snap, hs, subsumedRepls); err != nil { return stats, errors.Wrap(err, "while applying snapshot") } stats.tSnapEnd = timeutil.Now() @@ -830,129 +904,36 @@ func (r *Replica) handleRaftReadyRaftMuLocked( refreshReason == noReason { refreshReason = reasonSnapshotApplied } - } - } else if !raft.IsEmptySnap(rd.Snapshot) { - // If we didn't expect Raft to have a snapshot but it has one - // regardless, that is unexpected and indicates a programming - // error. - return stats, errors.AssertionFailedf( - "have inSnap=nil, but raft has a snapshot %s", - raft.DescribeSnapshot(rd.Snapshot), - ) - } - - // If the ready struct includes entries that have been committed, these - // entries will be applied to the Replica's replicated state machine down - // below, after appending new entries to the raft log and sending messages - // to peers. However, the process of appending new entries to the raft log - // and then applying committed entries to the state machine can take some - // time - and these entries are already durably committed. If they have - // clients waiting on them, we'd like to acknowledge their success as soon - // as possible. To facilitate this, we take a quick pass over the committed - // entries and acknowledge as many as we can trivially prove will not be - // rejected beneath raft. - // - // Note that the CommittedEntries slice may contain entries that are also in - // the Entries slice (to be appended in this ready pass). This can happen when - // a follower is being caught up on committed commands. We could acknowledge - // these commands early even though they aren't durably in the local raft log - // yet (since they're committed via a quorum elsewhere), but we chose to be - // conservative and avoid it by passing the last Ready cycle's `lastIndex` for - // the maxIndex argument to AckCommittedEntriesBeforeApplication. 
- sm := r.getStateMachine() - dec := r.getDecoder() - appTask := apply.MakeTask(sm, dec) - appTask.SetMaxBatchSize(r.store.TestingKnobs().MaxApplicationBatchSize) - defer appTask.Close() - if err := appTask.Decode(ctx, rd.CommittedEntries); err != nil { - return stats, err - } - if knobs := r.store.TestingKnobs(); knobs == nil || !knobs.DisableCanAckBeforeApplication { - if err := appTask.AckCommittedEntriesBeforeApplication(ctx, state.LastIndex); err != nil { - return stats, err - } - } - // Separate the MsgApp messages from all other Raft message types so that we - // can take advantage of the optimization discussed in the Raft thesis under - // the section: `10.2.1 Writing to the leader’s disk in parallel`. The - // optimization suggests that instead of a leader writing new log entries to - // disk before replicating them to its followers, the leader can instead - // write the entries to disk in parallel with replicating to its followers - // and them writing to their disks. - // - // Here, we invoke this optimization by: - // 1. sending all MsgApps. - // 2. syncing all entries and Raft state to disk. - // 3. sending all other messages. - // - // Since this is all handled in handleRaftReadyRaftMuLocked, we're assured - // that even though we may sync new entries to disk after sending them in - // MsgApps to followers, we'll always have them synced to disk before we - // process followers' MsgAppResps for the corresponding entries because - // Ready processing is sequential (and because a restart of the leader would - // prevent the MsgAppResp from being handled by it). This is important - // because it makes sure that the leader always has all of the entries in - // the log for its term, which is required in etcd/raft for technical - // reasons[1]. - // - // MsgApps are also used to inform followers of committed entries through - // the Commit index that they contain. Due to the optimization described - // above, a Commit index may be sent out to a follower before it is - // persisted on the leader. This is safe because the Commit index can be - // treated as volatile state, as is supported by raft.MustSync[2]. - // Additionally, the Commit index can never refer to entries from the - // current Ready (due to the MsgAppResp argument above) except in - // single-node groups, in which as a result we have to be careful to not - // persist a Commit index without the entries its commit index might refer - // to (see the HardState update below for details). - // - // [1]: the Raft thesis states that this can be made safe: - // - // > The leader may even commit an entry before it has been written to its - // > own disk, if a majority of followers have written it to their disks; - // > this is still safe. - // - // [2]: Raft thesis section: `3.8 Persisted state and server restarts`: - // - // > Other state variables are safe to lose on a restart, as they can all be - // > recreated. The most interesting example is the commit index, which can - // > safely be reinitialized to zero on a restart. - // - // Note that this will change when joint quorums are implemented, at which - // point we have to introduce coupling between the Commit index and - // persisted config changes, and also require some commit indexes to be - // durably synced. 
- // See: - // https://github.com/etcd-io/etcd/issues/7625#issuecomment-489232411 - - msgApps, otherMsgs := splitMsgApps(rd.Messages) - r.traceMessageSends(msgApps, "sending msgApp") - r.sendRaftMessages(ctx, msgApps, pausedFollowers) - - // TODO(pavelkalinnikov): find a way to move it to storeEntries. - if !raft.IsEmptyHardState(rd.HardState) { - if !r.IsInitialized() && rd.HardState.Commit != 0 { - log.Fatalf(ctx, "setting non-zero HardState.Commit on uninitialized replica %s. HS=%+v", r, rd.HardState) + // Send MsgStorageAppend's responses. + r.sendRaftMessages(ctx, msgStorageAppend.Responses, nil) + } else { + // TODO(pavelkalinnikov): find a way to move it to storeEntries. + if msgStorageAppend.Commit != 0 && !r.IsInitialized() { + log.Fatalf(ctx, "setting non-zero HardState.Commit on uninitialized replica %s", r) + } + // TODO(pavelkalinnikov): construct and store this in Replica. + // TODO(pavelkalinnikov): fields like raftEntryCache are the same across all + // ranges, so can be passed to LogStore methods instead of being stored in it. + s := logstore.LogStore{ + RangeID: r.RangeID, + Engine: r.store.engine, + Sideload: r.raftMu.sideloaded, + StateLoader: r.raftMu.stateLoader.StateLoader, + SyncWaiter: r.store.syncWaiter, + EntryCache: r.store.raftEntryCache, + Settings: r.store.cfg.Settings, + Metrics: logstore.Metrics{ + RaftLogCommitLatency: r.store.metrics.RaftLogCommitLatency, + }, + } + m := logstore.MakeMsgStorageAppend(msgStorageAppend) + cb := (*replicaSyncCallback)(r) + if state, err = s.StoreEntries(ctx, state, m, cb, &stats.append); err != nil { + return stats, errors.Wrap(err, "while storing log entries") + } } } - // TODO(pavelkalinnikov): construct and store this in Replica. - // TODO(pavelkalinnikov): fields like raftEntryCache are the same across all - // ranges, so can be passed to LogStore methods instead of being stored in it. - s := logstore.LogStore{ - RangeID: r.RangeID, - Engine: r.store.engine, - Sideload: r.raftMu.sideloaded, - StateLoader: r.raftMu.stateLoader.StateLoader, - EntryCache: r.store.raftEntryCache, - Settings: r.store.cfg.Settings, - Metrics: logstore.Metrics{ - RaftLogCommitLatency: r.store.metrics.RaftLogCommitLatency, - }, - } - if state, err = s.StoreEntries(ctx, state, logstore.MakeReady(rd), &stats.append); err != nil { - return stats, errors.Wrap(err, "while storing log entries") - } // Update protected state - last index, last term, raft log size, and raft // leader ID. @@ -977,11 +958,10 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.store.replicateQueue.MaybeAddAsync(ctx, r, r.store.Clock().NowAsClockTimestamp()) } - r.sendRaftMessages(ctx, otherMsgs, nil /* blocked */) - r.traceEntries(rd.CommittedEntries, "committed, before applying any entries") - stats.tApplicationBegin = timeutil.Now() - if len(rd.CommittedEntries) > 0 { + if hasMsg(msgStorageApply) { + r.traceEntries(msgStorageApply.Entries, "committed, before applying any entries") + err := appTask.ApplyCommittedEntries(ctx) stats.apply = sm.moveStats() if err != nil { @@ -1016,11 +996,14 @@ func (r *Replica) handleRaftReadyRaftMuLocked( refreshReason = reasonNewLeaderOrConfigChange } } + + // Send MsgStorageApply's responses. 
+ r.sendRaftMessages(ctx, msgStorageApply.Responses, nil) } stats.tApplicationEnd = timeutil.Now() applicationElapsed := stats.tApplicationEnd.Sub(stats.tApplicationBegin).Nanoseconds() r.store.metrics.RaftApplyCommittedLatency.RecordValue(applicationElapsed) - r.store.metrics.RaftCommandsApplied.Inc(int64(len(rd.CommittedEntries))) + r.store.metrics.RaftCommandsApplied.Inc(int64(len(msgStorageApply.Entries))) if r.store.TestingKnobs().EnableUnconditionalRefreshesInRaftReady { refreshReason = reasonNewLeaderOrConfigChange } @@ -1037,7 +1020,8 @@ func (r *Replica) handleRaftReadyRaftMuLocked( r.mu.Lock() err = r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { - raftGroup.Advance(rd) + r.deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked(ctx, raftGroup) + if stats.apply.numConfChangeEntries > 0 { // If the raft leader got removed, campaign the first remaining voter. // @@ -1077,18 +1061,71 @@ func (r *Replica) handleRaftReadyRaftMuLocked( return stats, nil } -// splitMsgApps splits the Raft message slice into two slices, one containing -// MsgApps and one containing all other message types. Each slice retains the -// relative ordering between messages in the original slice. -func splitMsgApps(msgs []raftpb.Message) (msgApps, otherMsgs []raftpb.Message) { - splitIdx := 0 - for i, msg := range msgs { - if msg.Type == raftpb.MsgApp { - msgs[i], msgs[splitIdx] = msgs[splitIdx], msgs[i] - splitIdx++ +// asyncReady encapsulates the messages that are ready to be sent to other peers +// or to be sent to local storage routines when async storage writes are enabled. +// All fields in asyncReady are read-only. +// TODO(nvanbenschoten): move this into go.etcd.io/raft. +type asyncReady struct { + // The current volatile state of a Node. + // SoftState will be nil if there is no update. + // It is not required to consume or store SoftState. + *raft.SoftState + + // ReadStates can be used for node to serve linearizable read requests locally + // when its applied index is greater than the index in ReadState. + // Note that the readState will be returned when raft receives msgReadIndex. + // The returned is only valid for the request that requested to read. + ReadStates []raft.ReadState + + // Messages specifies outbound messages to other peers and to local storage + // threads. These messages can be sent in any order. + // + // If it contains a MsgSnap message, the application MUST report back to raft + // when the snapshot has been received or has failed by calling ReportSnapshot. + Messages []raftpb.Message +} + +// makeAsyncReady constructs an asyncReady from the provided Ready. +func makeAsyncReady(rd raft.Ready) asyncReady { + return asyncReady{ + SoftState: rd.SoftState, + ReadStates: rd.ReadStates, + Messages: rd.Messages, + } +} + +// hasMsg returns whether the provided raftpb.Message is present. +// It serves as a poor man's Optional[raftpb.Message]. +func hasMsg(m raftpb.Message) bool { return m.Type != 0 } + +// splitLocalStorageMsgs filters out local storage messages from the provided +// message slice and returns them separately. 
+func splitLocalStorageMsgs( + msgs []raftpb.Message, +) (otherMsgs []raftpb.Message, msgStorageAppend, msgStorageApply raftpb.Message) { + for i := len(msgs) - 1; i >= 0; i-- { + switch msgs[i].Type { + case raftpb.MsgStorageAppend: + if hasMsg(msgStorageAppend) { + panic("two MsgStorageAppend") + } + msgStorageAppend = msgs[i] + case raftpb.MsgStorageApply: + if hasMsg(msgStorageApply) { + panic("two MsgStorageApply") + } + msgStorageApply = msgs[i] + default: + // Local storage messages will always be at the end of the messages slice, + // so we can terminate iteration as soon as we reach any other message + // type. This is leaking an implementation detail from etcd/raft which may + // not always hold, but while it does, we use it for convenience and + // assert against it changing in sendRaftMessages. + return msgs[:i+1], msgStorageAppend, msgStorageApply } } - return msgs[:splitIdx], msgs[splitIdx:] + // Only local storage messages. + return nil, msgStorageAppend, msgStorageApply } // maybeFatalOnRaftReadyErr will fatal if err is neither nil nor @@ -1414,87 +1451,160 @@ func (r *Replica) maybeCoalesceHeartbeat( return true } +// replicaSyncCallback implements the logstore.SyncCallback interface. +type replicaSyncCallback Replica + +func (r *replicaSyncCallback) OnLogSync(ctx context.Context, msgs []raftpb.Message) { + // Send MsgStorageAppend's responses. + (*Replica)(r).sendRaftMessages(ctx, msgs, nil /* blocked */) +} + func (r *Replica) sendRaftMessages( ctx context.Context, messages []raftpb.Message, blocked map[roachpb.ReplicaID]struct{}, ) { var lastAppResp raftpb.Message for _, message := range messages { - _, drop := blocked[roachpb.ReplicaID(message.To)] - if drop { - r.store.Metrics().RaftPausedFollowerDroppedMsgs.Inc(1) - } - switch message.Type { - case raftpb.MsgApp: - if util.RaceEnabled { - // Iterate over the entries to assert that all sideloaded commands - // are already inlined. replicaRaftStorage.Entries already performs - // the sideload inlining for stable entries and raft.unstable always - // contain fat entries. Since these are the only two sources that - // raft.sendAppend gathers entries from to populate MsgApps, we - // should never see thin entries here. + switch message.To { + case raft.LocalAppendThread: + // To local append thread. + // NOTE: we don't currently split append work off into an async goroutine. + // Instead, we handle messages to LocalAppendThread inline on the raft + // scheduler goroutine, so this code path is unused. + panic("unsupported, currently processed inline on raft scheduler goroutine") + case raft.LocalApplyThread: + // To local apply thread. + // NOTE: we don't currently split apply work off into an async goroutine. + // Instead, we handle messages to LocalApplyThread inline on the raft + // scheduler goroutine, so this code path is unused. + panic("unsupported, currently processed inline on raft scheduler goroutine") + case uint64(r.ReplicaID()): + // To local raft state machine, from local storage append and apply work. + // NOTE: For async Raft log appends, these messages come from calls to + // replicaSyncCallback.OnLogSync. For other local storage work (log + // application and snapshot application), these messages come from + // Replica.handleRaftReadyRaftMuLocked.
+ r.sendLocalRaftMsg(message) + default: + _, drop := blocked[roachpb.ReplicaID(message.To)] + if drop { + r.store.Metrics().RaftPausedFollowerDroppedMsgs.Inc(1) + } + switch message.Type { + case raftpb.MsgApp: + if util.RaceEnabled { + // Iterate over the entries to assert that all sideloaded commands + // are already inlined. replicaRaftStorage.Entries already performs + // the sideload inlining for stable entries and raft.unstable always + // contain fat entries. Since these are the only two sources that + // raft.sendAppend gathers entries from to populate MsgApps, we + // should never see thin entries here. + // + // Also assert that the log term only ever increases (most of the + // time it stays constant, as term changes are rare), and that + // the index increases by exactly one with each entry. + // + // This assertion came out of #61990. + prevTerm := message.LogTerm // term of entry preceding the append + prevIndex := message.Index // index of entry preceding the append + for j := range message.Entries { + ent := &message.Entries[j] + logstore.AssertSideloadedRaftCommandInlined(ctx, ent) + + if prevIndex+1 != ent.Index { + log.Fatalf(ctx, + "index gap in outgoing MsgApp: idx %d followed by %d", + prevIndex, ent.Index, + ) + } + prevIndex = ent.Index + if prevTerm > ent.Term { + log.Fatalf(ctx, + "term regression in outgoing MsgApp: idx %d at term=%d "+ + "appended with logterm=%d", + ent.Index, ent.Term, message.LogTerm, + ) + } + prevTerm = ent.Term + } + } + + case raftpb.MsgAppResp: + // A successful (non-reject) MsgAppResp contains one piece of + // information: the highest log index. Raft currently queues up + // one MsgAppResp per incoming MsgApp, and we may process + // multiple messages in one handleRaftReady call (because + // multiple messages may arrive while we're blocked syncing to + // disk). If we get redundant MsgAppResps, drop all but the + // last (we've seen that too many MsgAppResps can overflow + // message queues on the receiving side). // - // Also assert that the log term only ever increases (most of the - // time it stays constant, as term changes are rare), and that - // the index increases by exactly one with each entry. + // Note that this reorders the chosen MsgAppResp relative to + // other messages (including any MsgAppResps with the Reject flag), + // but raft is fine with this reordering. // - // This assertion came out of #61990. - prevTerm := message.LogTerm // term of entry preceding the append - prevIndex := message.Index // index of entry preceding the append - for j := range message.Entries { - ent := &message.Entries[j] - logstore.AssertSideloadedRaftCommandInlined(ctx, ent) - - if prevIndex+1 != ent.Index { - log.Fatalf(ctx, - "index gap in outgoing MsgApp: idx %d followed by %d", - prevIndex, ent.Index, - ) - } - prevIndex = ent.Index - if prevTerm > ent.Term { - log.Fatalf(ctx, - "term regression in outgoing MsgApp: idx %d at term=%d "+ - "appended with logterm=%d", - ent.Index, ent.Term, message.LogTerm, - ) - } - prevTerm = ent.Term + // TODO(bdarnell): Consider pushing this optimization into etcd/raft. + // Similar optimizations may be possible for other message types, + // although MsgAppResp is the only one that has been seen as a + // problem in practice. + if !message.Reject && message.Index > lastAppResp.Index { + lastAppResp = message + drop = true } } - case raftpb.MsgAppResp: - // A successful (non-reject) MsgAppResp contains one piece of - // information: the highest log index. 
Raft currently queues up - // one MsgAppResp per incoming MsgApp, and we may process - // multiple messages in one handleRaftReady call (because - // multiple messages may arrive while we're blocked syncing to - // disk). If we get redundant MsgAppResps, drop all but the - // last (we've seen that too many MsgAppResps can overflow - // message queues on the receiving side). - // - // Note that this reorders the chosen MsgAppResp relative to - // other messages (including any MsgAppResps with the Reject flag), - // but raft is fine with this reordering. - // - // TODO(bdarnell): Consider pushing this optimization into etcd/raft. - // Similar optimizations may be possible for other message types, - // although MsgAppResp is the only one that has been seen as a - // problem in practice. - if !message.Reject && message.Index > lastAppResp.Index { - lastAppResp = message - drop = true + if !drop { + r.sendRaftMessage(ctx, message) } } - - if !drop { - r.sendRaftMessage(ctx, message) - } } if lastAppResp.Index > 0 { r.sendRaftMessage(ctx, lastAppResp) } } +// sendLocalRaftMsg sends a message to the local raft state machine. +func (r *Replica) sendLocalRaftMsg(msg raftpb.Message) { + if msg.To != uint64(r.ReplicaID()) { + panic("incorrect message target") + } + r.localMsgs.Lock() + wasEmpty := len(r.localMsgs.active) == 0 + r.localMsgs.active = append(r.localMsgs.active, msg) + r.localMsgs.Unlock() + if wasEmpty { + r.store.enqueueRaftUpdateCheck(r.RangeID) + } +} + +// deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked delivers local messages to +// the provided raw node. +func (r *Replica) deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked( + ctx context.Context, raftGroup *raft.RawNode, +) { + r.raftMu.AssertHeld() + r.mu.AssertHeld() + r.localMsgs.Lock() + localMsgs := r.localMsgs.active + r.localMsgs.active, r.localMsgs.recycled = r.localMsgs.recycled, r.localMsgs.active[:0] + // Don't recycle large slices. + if cap(r.localMsgs.recycled) > 16 { + r.localMsgs.recycled = nil + } + r.localMsgs.Unlock() + + for i, m := range localMsgs { + if err := raftGroup.Step(m); err != nil { + log.Fatalf(ctx, "unexpected error stepping local raft message [%s]: %v", + raftDescribeMessage(m, raftEntryFormatter), err) + } + // NB: we can reset messages in the localMsgs.recycled slice without holding + // the localMsgs mutex because no-one ever writes to localMsgs.recycled and + // we are holding raftMu, which must be held to switch localMsgs.active and + // localMsgs.recycled. + localMsgs[i].Reset() // for GC + } +} + // sendRaftMessage sends a Raft message. // // When calling this method, the raftMu may be held, but it does not need to be. diff --git a/pkg/kv/kvserver/replica_raftstorage.go b/pkg/kv/kvserver/replica_raftstorage.go index ae01168ee7a4..22cef1a6f55d 100644 --- a/pkg/kv/kvserver/replica_raftstorage.go +++ b/pkg/kv/kvserver/replica_raftstorage.go @@ -95,6 +95,8 @@ func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState, // maxBytes. Sideloaded proposals count towards maxBytes with their payloads inlined. // Entries requires that r.mu is held for writing because it requires exclusive // access to r.mu.stateLoader. +// +// Entries can return log entries that are not yet stable in durable storage. 
func (r *replicaRaftStorage) Entries(lo, hi, maxBytes uint64) ([]raftpb.Entry, error) { ctx := r.AnnotateCtx(context.TODO()) if r.raftMu.sideloaded == nil { diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go index 5cc5370cc04f..02daab501997 100644 --- a/pkg/kv/kvserver/replica_test.go +++ b/pkg/kv/kvserver/replica_test.go @@ -9881,99 +9881,6 @@ func TestApplyPaginatedCommittedEntries(t *testing.T) { } } -func TestSplitMsgApps(t *testing.T) { - defer leaktest.AfterTest(t)() - defer log.Scope(t).Close(t) - - msgApp := func(idx uint64) raftpb.Message { - return raftpb.Message{Index: idx, Type: raftpb.MsgApp} - } - otherMsg := func(idx uint64) raftpb.Message { - return raftpb.Message{Index: idx, Type: raftpb.MsgVote} - } - formatMsgs := func(msgs []raftpb.Message) string { - strs := make([]string, len(msgs)) - for i, msg := range msgs { - strs[i] = fmt.Sprintf("{%s:%d}", msg.Type, msg.Index) - } - return fmt.Sprint(strs) - } - - testCases := []struct { - msgsIn, msgAppsOut, otherMsgsOut []raftpb.Message - }{ - // No msgs. - { - msgsIn: []raftpb.Message{}, - msgAppsOut: []raftpb.Message{}, - otherMsgsOut: []raftpb.Message{}, - }, - // Only msgApps. - { - msgsIn: []raftpb.Message{msgApp(1)}, - msgAppsOut: []raftpb.Message{msgApp(1)}, - otherMsgsOut: []raftpb.Message{}, - }, - { - msgsIn: []raftpb.Message{msgApp(1), msgApp(2)}, - msgAppsOut: []raftpb.Message{msgApp(1), msgApp(2)}, - otherMsgsOut: []raftpb.Message{}, - }, - { - msgsIn: []raftpb.Message{msgApp(2), msgApp(1)}, - msgAppsOut: []raftpb.Message{msgApp(2), msgApp(1)}, - otherMsgsOut: []raftpb.Message{}, - }, - // Only otherMsgs. - { - msgsIn: []raftpb.Message{otherMsg(1)}, - msgAppsOut: []raftpb.Message{}, - otherMsgsOut: []raftpb.Message{otherMsg(1)}, - }, - { - msgsIn: []raftpb.Message{otherMsg(1), otherMsg(2)}, - msgAppsOut: []raftpb.Message{}, - otherMsgsOut: []raftpb.Message{otherMsg(1), otherMsg(2)}, - }, - { - msgsIn: []raftpb.Message{otherMsg(2), otherMsg(1)}, - msgAppsOut: []raftpb.Message{}, - otherMsgsOut: []raftpb.Message{otherMsg(2), otherMsg(1)}, - }, - // Mixed msgApps and otherMsgs. 
- { - msgsIn: []raftpb.Message{msgApp(1), otherMsg(2)}, - msgAppsOut: []raftpb.Message{msgApp(1)}, - otherMsgsOut: []raftpb.Message{otherMsg(2)}, - }, - { - msgsIn: []raftpb.Message{otherMsg(1), msgApp(2)}, - msgAppsOut: []raftpb.Message{msgApp(2)}, - otherMsgsOut: []raftpb.Message{otherMsg(1)}, - }, - { - msgsIn: []raftpb.Message{msgApp(1), otherMsg(2), msgApp(3)}, - msgAppsOut: []raftpb.Message{msgApp(1), msgApp(3)}, - otherMsgsOut: []raftpb.Message{otherMsg(2)}, - }, - { - msgsIn: []raftpb.Message{otherMsg(1), msgApp(2), otherMsg(3)}, - msgAppsOut: []raftpb.Message{msgApp(2)}, - otherMsgsOut: []raftpb.Message{otherMsg(1), otherMsg(3)}, - }, - } - for _, c := range testCases { - inStr := formatMsgs(c.msgsIn) - t.Run(inStr, func(t *testing.T) { - msgAppsRes, otherMsgsRes := splitMsgApps(c.msgsIn) - if !reflect.DeepEqual(msgAppsRes, c.msgAppsOut) || !reflect.DeepEqual(otherMsgsRes, c.otherMsgsOut) { - t.Errorf("expected splitMsgApps(%s)=%s/%s, found %s/%s", inStr, formatMsgs(c.msgAppsOut), - formatMsgs(c.otherMsgsOut), formatMsgs(msgAppsRes), formatMsgs(otherMsgsRes)) - } - }) - } -} - type testQuiescer struct { desc roachpb.RangeDescriptor numProposals int diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index e9c90106b427..fc2546734a78 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -46,6 +46,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/logstore" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/multiqueue" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftentry" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/rangefeed" @@ -270,6 +271,7 @@ func newRaftConfig( return &raft.Config{ ID: id, Applied: appliedIndex, + AsyncStorageWrites: true, ElectionTick: storeCfg.RaftElectionTimeoutTicks, HeartbeatTick: storeCfg.RaftHeartbeatIntervalTicks, MaxUncommittedEntriesSize: storeCfg.RaftMaxUncommittedEntriesSize, @@ -745,6 +747,7 @@ type Store struct { metrics *StoreMetrics intentResolver *intentresolver.IntentResolver recoveryMgr txnrecovery.Manager + syncWaiter *logstore.SyncWaiterLoop raftEntryCache *raftentry.Cache limiters batcheval.Limiters txnWaitMetrics *txnwait.Metrics @@ -1248,6 +1251,8 @@ func NewStore( s.scheduler = newRaftScheduler(cfg.AmbientCtx, s.metrics, s, storeSchedulerConcurrency, cfg.RaftElectionTimeoutTicks) + s.syncWaiter = logstore.NewSyncWaiterLoop() + s.raftEntryCache = raftentry.NewCache(cfg.RaftEntryCacheSize) s.metrics.registry.AddMetricStruct(s.raftEntryCache.Metrics()) diff --git a/pkg/kv/kvserver/store_raft.go b/pkg/kv/kvserver/store_raft.go index 60d543d71685..90c93262c610 100644 --- a/pkg/kv/kvserver/store_raft.go +++ b/pkg/kv/kvserver/store_raft.go @@ -724,6 +724,8 @@ func (s *Store) processRaft(ctx context.Context) { s.cfg.Transport.Stop(s.StoreID()) })) + s.syncWaiter.Start(ctx, s.stopper) + // We'll want to cancel all in-flight proposals. Proposals embed tracing // spans in them, and we don't want to be leaking any. 
s.stopper.AddCloser(stop.CloserFn(func() { From c18156317ddefe552f88db6f3c9f19e4d6bddcc1 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 30 Jan 2023 22:56:05 -0500 Subject: [PATCH 2/5] kv: stop limiting AckCommittedEntriesBeforeApplication to Raft log's previous LastIndex This commit removes logic in `apply.Task.AckCommittedEntriesBeforeApplication` which was ensuring that we don't acknowledge committed Raft log entries before they were durable in a node's local Raft log. This is now ensured inside of the etcd/raft library when AsyncStorageWrites is enabled, as the comment added here describes. --- pkg/kv/kvserver/apply/doc_test.go | 2 +- pkg/kv/kvserver/apply/task.go | 13 ++--------- pkg/kv/kvserver/apply/task_test.go | 36 +----------------------------- pkg/kv/kvserver/replica_raft.go | 22 +++++++++++------- 4 files changed, 18 insertions(+), 55 deletions(-) diff --git a/pkg/kv/kvserver/apply/doc_test.go b/pkg/kv/kvserver/apply/doc_test.go index 6f34af877499..7258b1fd3a25 100644 --- a/pkg/kv/kvserver/apply/doc_test.go +++ b/pkg/kv/kvserver/apply/doc_test.go @@ -45,7 +45,7 @@ Setting up a batch of seven log entries: } fmt.Println("\nAckCommittedEntriesBeforeApplication:") - if err := t.AckCommittedEntriesBeforeApplication(ctx, 10 /* maxIndex */); err != nil { + if err := t.AckCommittedEntriesBeforeApplication(ctx); err != nil { panic(err) } fmt.Print(` diff --git a/pkg/kv/kvserver/apply/task.go b/pkg/kv/kvserver/apply/task.go index b6063c0de096..989258346681 100644 --- a/pkg/kv/kvserver/apply/task.go +++ b/pkg/kv/kvserver/apply/task.go @@ -197,13 +197,7 @@ func (t *Task) assertDecoded() { // it is applied. Because of this, the client can be informed of the success of // a write at this point, but we cannot release that write's latches until the // write has applied. See ProposalData.signalProposalResult/finishApplication. -// -// 4. Note that when catching up a follower that is behind, the (etcd/raft) -// leader will emit an MsgApp with a commit index that encompasses the entries -// in the MsgApp, and Ready() will expose these as present in both the Entries -// and CommittedEntries slices (i.e. append and apply). We don't ack these -// early - the caller will pass the "old" last index in. -func (t *Task) AckCommittedEntriesBeforeApplication(ctx context.Context, maxIndex uint64) error { +func (t *Task) AckCommittedEntriesBeforeApplication(ctx context.Context) error { t.assertDecoded() if !t.anyLocal { return nil // fast-path @@ -218,11 +212,8 @@ func (t *Task) AckCommittedEntriesBeforeApplication(ctx context.Context, maxInde defer iter.Close() // Collect a batch of trivial commands from the applier. Stop at the first - // non-trivial command or at the first command with an index above maxIndex. + // non-trivial command. batchIter := takeWhileCmdIter(iter, func(cmd Command) bool { - if cmd.Index() > maxIndex { - return false - } return cmd.IsTrivial() }) diff --git a/pkg/kv/kvserver/apply/task_test.go b/pkg/kv/kvserver/apply/task_test.go index a970173e524f..7927bdca5492 100644 --- a/pkg/kv/kvserver/apply/task_test.go +++ b/pkg/kv/kvserver/apply/task_test.go @@ -317,7 +317,7 @@ func TestAckCommittedEntriesBeforeApplication(t *testing.T) { appT := apply.MakeTask(sm, dec) defer appT.Close() require.NoError(t, appT.Decode(ctx, ents)) - require.NoError(t, appT.AckCommittedEntriesBeforeApplication(ctx, 10 /* maxIndex */)) + require.NoError(t, appT.AckCommittedEntriesBeforeApplication(ctx)) // Assert that the state machine was not updated. 
require.Equal(t, testStateMachine{}, *sm) @@ -338,40 +338,6 @@ func TestAckCommittedEntriesBeforeApplication(t *testing.T) { require.Equal(t, exp, cmd.acked) require.False(t, cmd.finished) } - - // Try again with a lower maximum log index. - appT.Close() - ents = makeEntries(5) - - dec = newTestDecoder() - dec.nonLocal[2] = true - dec.shouldReject[3] = true - - appT = apply.MakeTask(sm, dec) - require.NoError(t, appT.Decode(ctx, ents)) - require.NoError(t, appT.AckCommittedEntriesBeforeApplication(ctx, 4 /* maxIndex */)) - - // Assert that the state machine was not updated. - require.Equal(t, testStateMachine{}, *sm) - - // Assert that some commands were acknowledged early and that none were finished. - for _, cmd := range dec.cmds { - var exp bool - switch cmd.index { - case 1, 4: - exp = true // local and successful - case 2: - exp = false // remote - case 3: - exp = false // local and rejected - case 5: - exp = false // index too high - default: - t.Fatalf("unexpected index %d", cmd.index) - } - require.Equal(t, exp, cmd.acked) - require.False(t, cmd.finished) - } } func TestApplyCommittedEntriesWithErr(t *testing.T) { diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 0a0b9b3b248c..6d306b3efb38 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -808,13 +808,19 @@ func (r *Replica) handleRaftReadyRaftMuLocked( // entries and acknowledge as many as we can trivially prove will not be // rejected beneath raft. // - // Note that the CommittedEntries slice may contain entries that are also in - // the Entries slice (to be appended in this ready pass). This can happen when - // a follower is being caught up on committed commands. We could acknowledge - // these commands early even though they aren't durably in the local raft log - // yet (since they're committed via a quorum elsewhere), but we chose to be - // conservative and avoid it by passing the last Ready cycle's `lastIndex` for - // the maxIndex argument to AckCommittedEntriesBeforeApplication. + // Note that the Entries slice in the MsgStorageApply cannot refer to entries + // that are also in the Entries slice in the MsgStorageAppend. Raft will not + // allow unstable entries to be applied when AsyncStorageWrites is enabled. + // + // If we disable AsyncStorageWrites in the future, this property will no + // longer be true, and the two slices could overlap. For example, this can + // happen when a follower is being caught up on committed commands. We could + // acknowledge these commands early even though they aren't durably in the + // local raft log yet (since they're committed via a quorum elsewhere), but + // we'd likely want to revert to an earlier version of this code that chose to + // be conservative and avoid this behavior by passing the last Ready cycle's + // `lastIndex` for a maxIndex argument to + // AckCommittedEntriesBeforeApplication. // // TODO(nvanbenschoten): this is less important with async storage writes. // Consider getting rid of it. 
@@ -829,7 +835,7 @@ func (r *Replica) handleRaftReadyRaftMuLocked( return stats, err } if knobs := r.store.TestingKnobs(); knobs == nil || !knobs.DisableCanAckBeforeApplication { - if err := appTask.AckCommittedEntriesBeforeApplication(ctx, state.LastIndex); err != nil { + if err := appTask.AckCommittedEntriesBeforeApplication(ctx); err != nil { return stats, err } } From cb1efdc48f5f7526792e049db4ee8b4f6638f12f Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 31 Jan 2023 11:30:26 -0500 Subject: [PATCH 3/5] kv: exercise blocking and non-blocking log sync interleaving in tests This commit adds a testing facility to randomly disable non-blocking log syncs in tests. This ensures that we properly handle the case where non-blocking log sync is disabled while some log syncs are still in progress. --- pkg/kv/kvserver/logstore/BUILD.bazel | 1 + pkg/kv/kvserver/logstore/logstore.go | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/pkg/kv/kvserver/logstore/BUILD.bazel b/pkg/kv/kvserver/logstore/BUILD.bazel index 7e29b6803e70..1a7ec0fba2c1 100644 --- a/pkg/kv/kvserver/logstore/BUILD.bazel +++ b/pkg/kv/kvserver/logstore/BUILD.bazel @@ -23,6 +23,7 @@ go_library( "//pkg/storage", "//pkg/storage/enginepb", "//pkg/storage/fs", + "//pkg/util/buildutil", "//pkg/util/envutil", "//pkg/util/hlc", "//pkg/util/iterutil", diff --git a/pkg/kv/kvserver/logstore/logstore.go b/pkg/kv/kvserver/logstore/logstore.go index 490c4bcd6f7f..67d535718fa8 100644 --- a/pkg/kv/kvserver/logstore/logstore.go +++ b/pkg/kv/kvserver/logstore/logstore.go @@ -14,6 +14,7 @@ package logstore import ( "context" "fmt" + "math/rand" "sync" "time" @@ -25,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" + "github.com/cockroachdb/cockroach/pkg/util/buildutil" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/iterutil" @@ -224,14 +226,20 @@ func (s *LogStore) storeEntriesAndCommitBatch( stats.PebbleBytes = int64(batch.Len()) wantsSync := len(m.Responses) > 0 willSync := wantsSync && !disableSyncRaftLog.Get(&s.Settings.SV) - // Use the non-blocking log sync path if we are performing a log sync, the - // cluster setting to do so is enabled, and we are not overwriting any - // previous log entries. If we are overwriting, we may need to purge the - // sideloaded SSTables associated with overwritten entries. This must be - // performed after the corresponding entries are durably replaced and it's - // easier to do ensure proper ordering using a blocking log sync. This is a - // rare case, so it's not worth optimizing for. - nonBlockingSync := willSync && enableNonBlockingRaftLogSync.Get(&s.Settings.SV) && !overwriting + // Use the non-blocking log sync path if we are performing a log sync ... + nonBlockingSync := willSync && + // and the cluster setting is enabled ... + enableNonBlockingRaftLogSync.Get(&s.Settings.SV) && + // and we are not overwriting any previous log entries. If we are + // overwriting, we may need to purge the sideloaded SSTables associated with + // overwritten entries. This must be performed after the corresponding + // entries are durably replaced and it's easier to ensure proper ordering + // using a blocking log sync. This is a rare case, so it's not worth + // optimizing for.
+ !overwriting && + // Also, randomly disable non-blocking sync in test builds to exercise the + // interleaved blocking and non-blocking syncs. + !(buildutil.CrdbTestBuild && rand.Intn(2) == 0) if nonBlockingSync { // If non-blocking synchronization is enabled, apply the batched updates to // the engine and initiate a synchronous disk write, but don't wait for the From 14ebedc2dcde54563e9671553f484b0811fafb3b Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 31 Jan 2023 11:30:44 -0500 Subject: [PATCH 4/5] kv: shuffle local raft messages before delivery in tests This commit adds a testing facility to shuffle local raft messages before delivering them to the raft state machine. These are not required to be in order, so we shuffle them in tests to ensure that re-ordering is handled properly. --- pkg/kv/kvserver/replica_raft.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 6d306b3efb38..994471cb9564 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -32,6 +32,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb", "github.com/cockroachdb/cockroach/pkg/storage", "github.com/cockroachdb/cockroach/pkg/util", + "github.com/cockroachdb/cockroach/pkg/util/buildutil", "github.com/cockroachdb/cockroach/pkg/util/encoding", "github.com/cockroachdb/cockroach/pkg/util/hlc", "github.com/cockroachdb/cockroach/pkg/util/humanizeutil", @@ -1598,6 +1599,15 @@ func (r *Replica) deliverLocalRaftMsgsRaftMuLockedReplicaMuLocked( } r.localMsgs.Unlock() + // If we are in a test build, shuffle the local messages before delivering + // them. These are not required to be in order, so ensure that re-ordering is + // handled properly. + if buildutil.CrdbTestBuild { + rand.Shuffle(len(localMsgs), func(i, j int) { + localMsgs[i], localMsgs[j] = localMsgs[j], localMsgs[i] + }) + } + for i, m := range localMsgs { if err := raftGroup.Step(m); err != nil { log.Fatalf(ctx, "unexpected error stepping local raft message [%s]: %v", From ebc33fa0823ccf9a9c6095e3d8d487261af20ef5 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Thu, 2 Feb 2023 17:44:47 -0500 Subject: [PATCH 5/5] kv: rename r.mu.last{Index,Term} to r.mu.last{Index,Term}NotDurable Simple rename to make the semantics of these fields clearer and harder to misinterpret.
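To illustrate the distinction the new names encode: with non-blocking log syncs, the in-memory last index can run ahead of what is durable on disk, and the gap closes only once the sync-waiter loop signals completion. The sketch below is illustrative only and is not code from this patch; the `replicaLogMark` type, `appendNonBlocking`, `onSyncComplete`, and the `durableIndex` field are hypothetical names (the real code signals durability via `MsgStorageAppend` response messages rather than tracking such a field).

    package main

    import "fmt"

    // replicaLogMark is a simplified, hypothetical stand-in for the fields
    // renamed in this commit. lastIndexNotDurable advances as soon as a log
    // write is handed to the storage engine; durableIndex advances only when
    // the corresponding non-blocking sync completes.
    type replicaLogMark struct {
        lastIndexNotDurable uint64
        durableIndex        uint64
    }

    // appendNonBlocking records a log write that has been submitted but not
    // yet fsynced, so only the in-memory last index moves forward.
    func (m *replicaLogMark) appendNonBlocking(index uint64) {
        m.lastIndexNotDurable = index
    }

    // onSyncComplete is the callback a sync-waiter loop would invoke once the
    // write is durable, at which point the durable index catches up.
    func (m *replicaLogMark) onSyncComplete(index uint64) {
        if index > m.durableIndex {
            m.durableIndex = index
        }
    }

    func main() {
        var m replicaLogMark
        m.appendNonBlocking(7)
        // While the sync is in flight the two indexes diverge, which is
        // exactly what the NotDurable suffix is meant to call out.
        fmt.Println(m.lastIndexNotDurable, m.durableIndex) // 7 0
        m.onSyncComplete(7)
        fmt.Println(m.lastIndexNotDurable, m.durableIndex) // 7 7
    }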
--- pkg/kv/kvserver/helpers_test.go | 2 +- pkg/kv/kvserver/raft_log_queue.go | 2 +- pkg/kv/kvserver/replica.go | 12 ++++++------ pkg/kv/kvserver/replica_init.go | 4 ++-- pkg/kv/kvserver/replica_raft.go | 18 +++++++++--------- pkg/kv/kvserver/replica_raftstorage.go | 20 ++++++++++---------- pkg/kv/kvserver/replica_sideload_test.go | 2 +- 7 files changed, 30 insertions(+), 30 deletions(-) diff --git a/pkg/kv/kvserver/helpers_test.go b/pkg/kv/kvserver/helpers_test.go index 00564d86034a..5b4b112a5a8b 100644 --- a/pkg/kv/kvserver/helpers_test.go +++ b/pkg/kv/kvserver/helpers_test.go @@ -356,7 +356,7 @@ func (r *Replica) GetRaftLogSize() (int64, bool) { func (r *Replica) GetCachedLastTerm() uint64 { r.mu.RLock() defer r.mu.RUnlock() - return r.mu.lastTerm + return r.mu.lastTermNotDurable } func (r *Replica) IsRaftGroupInitialized() bool { diff --git a/pkg/kv/kvserver/raft_log_queue.go b/pkg/kv/kvserver/raft_log_queue.go index eb305fcae327..70ec7779bf8b 100644 --- a/pkg/kv/kvserver/raft_log_queue.go +++ b/pkg/kv/kvserver/raft_log_queue.go @@ -263,7 +263,7 @@ func newTruncateDecision(ctx context.Context, r *Replica) (truncateDecision, err const anyRecipientStore roachpb.StoreID = 0 pendingSnapshotIndex := r.getSnapshotLogTruncationConstraintsRLocked(anyRecipientStore) - lastIndex := r.mu.lastIndex + lastIndex := r.mu.lastIndexNotDurable // NB: raftLogSize above adjusts for pending truncations that have already // been successfully replicated via raft, but logSizeTrusted does not see if // those pending truncations would cause a transition from trusted => diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index eb72b9c650e2..270b137fd799 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -397,12 +397,12 @@ type Replica struct { mergeTxnID uuid.UUID // The state of the Raft state machine. state kvserverpb.ReplicaState - // Last index/term written to the raft log (not necessarily durable - // locally or committed by the group). Note that lastTerm may be 0 (and - // thus invalid) even when lastIndex is known, in which case the term - // will have to be retrieved from the Raft log entry. Use the + // Last index/term written to the raft log (not necessarily durable locally + // or committed by the group). Note that lastTermNotDurable may be 0 (and + // thus invalid) even when lastIndexNotDurable is known, in which case the + // term will have to be retrieved from the Raft log entry. Use the // invalidLastTerm constant for this case. - lastIndex, lastTerm uint64 + lastIndexNotDurable, lastTermNotDurable uint64 // A map of raft log index of pending snapshots to deadlines. // Used to prohibit raft log truncations that would leave a gap between // the snapshot and the new first index. 
The map entry has a zero @@ -1294,7 +1294,7 @@ func (r *Replica) State(ctx context.Context) kvserverpb.RangeInfo { r.mu.RLock() defer r.mu.RUnlock() ri.ReplicaState = *(protoutil.Clone(&r.mu.state)).(*kvserverpb.ReplicaState) - ri.LastIndex = r.mu.lastIndex + ri.LastIndex = r.mu.lastIndexNotDurable ri.NumPending = uint64(r.numPendingProposalsRLocked()) ri.RaftLogSize = r.mu.raftLogSize ri.RaftLogSizeTrusted = r.mu.raftLogSizeTrusted diff --git a/pkg/kv/kvserver/replica_init.go b/pkg/kv/kvserver/replica_init.go index b5964c917b3b..c2494e35bba7 100644 --- a/pkg/kv/kvserver/replica_init.go +++ b/pkg/kv/kvserver/replica_init.go @@ -208,11 +208,11 @@ func (r *Replica) loadRaftMuLockedReplicaMuLocked(desc *roachpb.RangeDescriptor) if r.mu.state, err = r.mu.stateLoader.Load(ctx, r.Engine(), desc); err != nil { return err } - r.mu.lastIndex, err = r.mu.stateLoader.LoadLastIndex(ctx, r.Engine()) + r.mu.lastIndexNotDurable, err = r.mu.stateLoader.LoadLastIndex(ctx, r.Engine()) if err != nil { return err } - r.mu.lastTerm = invalidLastTerm + r.mu.lastTermNotDurable = invalidLastTerm // Ensure that we're not trying to load a replica with a different ID than // was used to construct this Replica. diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 994471cb9564..9db90f1af469 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -720,8 +720,8 @@ func (r *Replica) handleRaftReadyRaftMuLocked( var msgStorageAppend, msgStorageApply raftpb.Message r.mu.Lock() state := logstore.RaftState{ // used for append below - LastIndex: r.mu.lastIndex, - LastTerm: r.mu.lastTerm, + LastIndex: r.mu.lastIndexNotDurable, + LastTerm: r.mu.lastTermNotDurable, ByteSize: r.mu.raftLogSize, } leaderID := r.mu.leaderID @@ -890,13 +890,13 @@ func (r *Replica) handleRaftReadyRaftMuLocked( stats.tSnapEnd = timeutil.Now() stats.snap.applied = true - // r.mu.lastIndex, r.mu.lastTerm and r.mu.raftLogSize were updated in - // applySnapshot, but we also want to make sure we reflect these changes in - // the local variables we're tracking here. + // r.mu.lastIndexNotDurable, r.mu.lastTermNotDurable and r.mu.raftLogSize + // were updated in applySnapshot, but we also want to make sure we reflect + // these changes in the local variables we're tracking here. r.mu.RLock() state = logstore.RaftState{ - LastIndex: r.mu.lastIndex, - LastTerm: r.mu.lastTerm, + LastIndex: r.mu.lastIndexNotDurable, + LastTerm: r.mu.lastTermNotDurable, ByteSize: r.mu.raftLogSize, } r.mu.RUnlock() @@ -946,8 +946,8 @@ func (r *Replica) handleRaftReadyRaftMuLocked( // leader ID. r.mu.Lock() // TODO(pavelkalinnikov): put logstore.RaftState to r.mu directly. - r.mu.lastIndex = state.LastIndex - r.mu.lastTerm = state.LastTerm + r.mu.lastIndexNotDurable = state.LastIndex + r.mu.lastTermNotDurable = state.LastTerm r.mu.raftLogSize = state.ByteSize var becameLeader bool if r.mu.leaderID != leaderID { diff --git a/pkg/kv/kvserver/replica_raftstorage.go b/pkg/kv/kvserver/replica_raftstorage.go index 22cef1a6f55d..40a78cf4bb4a 100644 --- a/pkg/kv/kvserver/replica_raftstorage.go +++ b/pkg/kv/kvserver/replica_raftstorage.go @@ -111,19 +111,19 @@ func (r *Replica) raftEntriesLocked(lo, hi, maxBytes uint64) ([]raftpb.Entry, er return (*replicaRaftStorage)(r).Entries(lo, hi, maxBytes) } -// invalidLastTerm is an out-of-band value for r.mu.lastTerm that -// invalidates lastTerm caching and forces retrieval of Term(lastTerm) -// from the raftEntryCache/RocksDB. 
+// invalidLastTerm is an out-of-band value for r.mu.lastTermNotDurable that +// invalidates lastTermNotDurable caching and forces retrieval of +// Term(lastIndexNotDurable) from the raftEntryCache/Pebble. const invalidLastTerm = 0 // Term implements the raft.Storage interface. // Term requires that r.mu is held for writing because it requires exclusive // access to r.mu.stateLoader. func (r *replicaRaftStorage) Term(i uint64) (uint64, error) { - // TODO(nvanbenschoten): should we set r.mu.lastTerm when - // r.mu.lastIndex == i && r.mu.lastTerm == invalidLastTerm? - if r.mu.lastIndex == i && r.mu.lastTerm != invalidLastTerm { - return r.mu.lastTerm, nil + // TODO(nvanbenschoten): should we set r.mu.lastTermNotDurable when + // r.mu.lastIndexNotDurable == i && r.mu.lastTermNotDurable == invalidLastTerm? + if r.mu.lastIndexNotDurable == i && r.mu.lastTermNotDurable != invalidLastTerm { + return r.mu.lastTermNotDurable, nil } ctx := r.AnnotateCtx(context.TODO()) return logstore.LoadTerm(ctx, r.mu.stateLoader.StateLoader, r.store.Engine(), r.RangeID, @@ -145,7 +145,7 @@ func (r *Replica) GetTerm(i uint64) (uint64, error) { // raftLastIndexRLocked requires that r.mu is held for reading. func (r *Replica) raftLastIndexRLocked() uint64 { - return r.mu.lastIndex + return r.mu.lastIndexNotDurable } // LastIndex implements the raft.Storage interface. @@ -731,13 +731,13 @@ func (r *Replica) applySnapshot( // performance implications are not likely to be drastic. If our // feelings about this ever change, we can add a LastIndex field to // raftpb.SnapshotMetadata. - r.mu.lastIndex = state.RaftAppliedIndex + r.mu.lastIndexNotDurable = state.RaftAppliedIndex // TODO(sumeer): We should be able to set this to // nonemptySnap.Metadata.Term. See // https://github.com/cockroachdb/cockroach/pull/75675#pullrequestreview-867926687 // for a discussion regarding this. - r.mu.lastTerm = invalidLastTerm + r.mu.lastTermNotDurable = invalidLastTerm r.mu.raftLogSize = 0 // Update the store stats for the data in the snapshot. r.store.metrics.subtractMVCCStats(ctx, r.tenantMetricsRef, *r.mu.state.Stats) diff --git a/pkg/kv/kvserver/replica_sideload_test.go b/pkg/kv/kvserver/replica_sideload_test.go index 74501c638a78..19633d18dd8d 100644 --- a/pkg/kv/kvserver/replica_sideload_test.go +++ b/pkg/kv/kvserver/replica_sideload_test.go @@ -179,7 +179,7 @@ func TestRaftSSTableSideloading(t *testing.T) { rsl := logstore.NewStateLoader(tc.repl.RangeID) lo := tc.repl.mu.state.TruncatedState.Index + 1 - hi := tc.repl.mu.lastIndex + 1 + hi := tc.repl.mu.lastIndexNotDurable + 1 tc.store.raftEntryCache.Clear(tc.repl.RangeID, hi) ents, err := logstore.LoadEntries(