-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
replica_raft.go
2461 lines (2297 loc) · 99.2 KB
/
replica_raft.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package kvserver
import (
"context"
"math/rand"
"sort"
"strings"
"time"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/poison"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvadmission"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/uncertainty"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
"go.etcd.io/etcd/raft/v3"
"go.etcd.io/etcd/raft/v3/raftpb"
"go.etcd.io/etcd/raft/v3/tracker"
)
// raftLogTruncationClearRangeThreshold is the number of entries at which Raft
// log truncation switches from Pebble point deletes to a single Pebble range
// tombstone. The value is a trade-off: high enough that we do not litter
// Pebble with range tombstones, low enough that we do not issue an excessive
// number of point deletes (in particular, we do not want to overflow the
// Pebble write batch).
//
// Steady-state truncations are small: they trigger once
// RaftLogQueueStaleSize (64 KB) or RaftLogQueueStaleThreshold (100 entries)
// is exceeded. When followers lag, the log is allowed to grow up to
// RaftLogTruncationThreshold (16 MB) before it is truncated.
//
// The default of 100k is unlikely to be reached in common cases, which keeps
// the number of range tombstones low, yet it does kick in once a Raft log
// has grown abnormally large. RaftLogTruncationThreshold by itself will
// typically not reach it unless the average log entry is <= 160 bytes. With
// keys of ~16 bytes, Pebble point deletion batches are bounded at ~1.6MB.
//
// The value is obtained through util.ConstantWithMetamorphicTestRange with a
// default of 100000 and a range of [1, 1e6].
var raftLogTruncationClearRangeThreshold = uint64(
	util.ConstantWithMetamorphicTestRange(
		"raft-log-truncation-clearrange-threshold",
		100000, /* default */
		1,      /* min */
		1e6,    /* max */
	),
)
// makeIDKey returns a fresh command ID: a random 63-bit integer drawn from
// math/rand and encoded with encoding.EncodeUint64Ascending into a buffer
// whose capacity is kvserverbase.RaftCommandIDLen.
func makeIDKey() kvserverbase.CmdIDKey {
	randomID := uint64(rand.Int63())
	encoded := encoding.EncodeUint64Ascending(
		make([]byte, 0, kvserverbase.RaftCommandIDLen), randomID,
	)
	return kvserverbase.CmdIDKey(encoded)
}
// evalAndPropose prepares the necessary pending command struct and initializes
// a client command ID if one hasn't been. A verified lease is supplied as a
// parameter if the command requires a lease; nil otherwise. It then evaluates
// the command and proposes it to Raft on success.
//
// The method accepts a concurrency guard, which it assumes responsibility for
// if it succeeds in proposing a command into Raft. If the method does not
// return an error, the guard is guaranteed to be eventually freed and the
// caller should relinquish all ownership of it. If it does return an error, the
// caller retains full ownership over the guard.
//
// evalAndPropose takes ownership of the supplied token; the caller should
// tok.Move() it into this method. It will be used to untrack the request once
// it comes out of the proposal buffer.
//
// Nothing here or below can take out a raftMu lock, since executeWriteBatch()
// is already holding readOnlyCmdMu when calling this. Locking raftMu after it
// would violate the locking order specified for Store.mu.
//
// Return values:
// - a channel which receives a response or error upon application
// - a closure used to attempt to abandon the command. When called, it unbinds
// the command's context from its Raft proposal. The client is then free to
// terminate execution, although it is given no guarantee that the proposal
// won't still go on to commit and apply at some later time.
// - the proposal's ID.
// - the write bytes (regular write batch bytes and ingested AddSSTable bytes)
// of the proposed command, for admission control. This is nil when no Raft
// command was proposed.
// - any error obtained during the creation or proposal of the command, in
// which case the channel, closure and ID are zero. NB: the rejection of an
// async-consensus proposal carrying EndTxnIntents still returns the computed
// write bytes alongside the error; all other error paths return nil for it.
func (r *Replica) evalAndPropose(
ctx context.Context,
ba *roachpb.BatchRequest,
g *concurrency.Guard,
st *kvserverpb.LeaseStatus,
ui uncertainty.Interval,
tok TrackedRequestToken,
) (
chan proposalResult,
func(),
kvserverbase.CmdIDKey,
*kvadmission.StoreWriteBytes,
*roachpb.Error,
) {
// Untrack the request on every return path on which the token was not
// handed off to r.propose below.
defer tok.DoneIfNotMoved(ctx)
idKey := makeIDKey()
proposal, pErr := r.requestToProposal(ctx, idKey, ba, g, st, ui)
// NOTE(review): proposal is dereferenced before pErr is inspected, so this
// assumes requestToProposal returns a non-nil proposal even on error — confirm.
log.Event(proposal.ctx, "evaluated request")
// If the request hit a server-side concurrency retry error, immediately
// propagate the error. Don't assume ownership of the concurrency guard.
if isConcurrencyRetryError(pErr) {
pErr = maybeAttachLease(pErr, &st.Lease)
return nil, nil, "", nil, pErr
} else if _, ok := pErr.GetDetail().(*roachpb.ReplicaCorruptionError); ok {
return nil, nil, "", nil, pErr
}
// Attach the endCmds to the proposal and assume responsibility for
// releasing the concurrency guard if the proposal makes it to Raft.
proposal.ec = endCmds{repl: r, g: g, st: *st}
// Pull out proposal channel to return. proposal.doneCh may be set to
// nil if it is signaled in this function.
proposalCh := proposal.doneCh
// There are two cases where request evaluation does not lead to a Raft
// proposal:
// 1. proposal.command == nil indicates that the evaluation was a no-op
// and that no Raft command needs to be proposed.
// 2. pErr != nil corresponds to a failed proposal - the command resulted
// in an error.
if proposal.command == nil {
if proposal.Local.RequiresRaft() {
return nil, nil, "", nil, roachpb.NewError(errors.AssertionFailedf(
"proposal resulting from batch %s erroneously bypassed Raft", ba))
}
intents := proposal.Local.DetachEncounteredIntents()
endTxns := proposal.Local.DetachEndTxns(pErr != nil /* alwaysOnly */)
r.handleReadWriteLocalEvalResult(ctx, *proposal.Local)
pr := proposalResult{
Reply: proposal.Local.Reply,
Err: pErr,
EncounteredIntents: intents,
EndTxns: endTxns,
}
// The result (including any evaluation error in pr.Err) is delivered
// through the proposal itself, which is why the returned error is nil and
// the abandon closure is a no-op.
proposal.finishApplication(ctx, pr)
return proposalCh, func() {}, "", nil, nil
}
log.VEventf(proposal.ctx, 2,
"proposing command to write %d new keys, %d new values, %d new intents, "+
"write batch size=%d bytes",
proposal.command.ReplicatedEvalResult.Delta.KeyCount,
proposal.command.ReplicatedEvalResult.Delta.ValCount,
proposal.command.ReplicatedEvalResult.Delta.IntentCount,
proposal.command.WriteBatch.Size(),
)
// NB: if ba.AsyncConsensus is true, we will tell admission control about
// writes that may not have happened yet. We consider this ok, since (a) the
// typical lag in consensus is expected to be small compared to the time
// granularity of admission control doing token and size estimation (which
// is 15s), and (b) admission control corrects for gaps in reporting.
writeBytes := kvadmission.NewStoreWriteBytes()
if proposal.command.WriteBatch != nil {
writeBytes.WriteBytes = int64(len(proposal.command.WriteBatch.Data))
}
if proposal.command.ReplicatedEvalResult.AddSSTable != nil {
writeBytes.IngestedBytes = int64(len(proposal.command.ReplicatedEvalResult.AddSSTable.Data))
}
// If the request requested that Raft consensus be performed asynchronously,
// return a proposal result immediately on the proposal's done channel.
// The channel's capacity will be large enough to accommodate this.
if ba.AsyncConsensus {
if ets := proposal.Local.DetachEndTxns(false /* alwaysOnly */); len(ets) != 0 {
// Disallow async consensus for commands with EndTxnIntents because
// any !Always EndTxnIntent can't be cleaned up until after the
// command succeeds.
return nil, nil, "", writeBytes, roachpb.NewErrorf("cannot perform consensus asynchronously for "+
"proposal with EndTxnIntents=%v; %v", ets, ba)
}
// Fork the proposal's context span so that the proposal's context
// can outlive the original proposer's context.
proposal.ctx, proposal.sp = tracing.ForkSpan(ctx, "async consensus")
// Signal the proposal's response channel immediately.
// The reply struct is copied by value and its Responses slice is cloned
// into a fresh backing array before being handed to the client.
reply := *proposal.Local.Reply
reply.Responses = append([]roachpb.ResponseUnion(nil), reply.Responses...)
pr := proposalResult{
Reply: &reply,
EncounteredIntents: proposal.Local.DetachEncounteredIntents(),
}
proposal.signalProposalResult(pr)
// Continue with proposal...
}
// Attach information about the proposer's lease to the command, for
// verification below raft. Lease requests are special since they are not
// necessarily proposed under a valid lease (by necessity). Instead, they
// reference the previous lease. Note that TransferLease also skips lease
// checks (for technical reasons, see `TransferLease.flags`) and uses the
// same mechanism.
if ba.IsSingleSkipsLeaseCheckRequest() {
// Lease-related commands have below-raft special casing and will carry the
// lease sequence of the lease they are intending to follow.
// The remaining requests that skip a lease check (at the time of writing
// ProbeRequest) will assign a zero lease sequence and thus won't be able
// to mutate state.
var seq roachpb.LeaseSequence
switch t := ba.Requests[0].GetInner().(type) {
case *roachpb.RequestLeaseRequest:
seq = t.PrevLease.Sequence
case *roachpb.TransferLeaseRequest:
seq = t.PrevLease.Sequence
default:
}
proposal.command.ProposerLeaseSequence = seq
} else if !st.Lease.OwnedBy(r.store.StoreID()) {
// Perform a sanity check that the lease is owned by this replica. This must
// have been ascertained by the callers in
// checkExecutionCanProceedBeforeStorageSnapshot.
log.Fatalf(ctx, "cannot propose %s on follower with remotely owned lease %s", ba, st.Lease)
} else {
proposal.command.ProposerLeaseSequence = st.Lease.Sequence
}
// Once a command is written to the raft log, it must be loaded into memory
// and replayed on all replicas. If a command is too big, stop it here. If
// the command is not too big, acquire an appropriate amount of quota from
// the replica's proposal quota pool.
//
// TODO(tschottdorf): blocking a proposal here will leave it dangling in the
// closed timestamp tracker for an extended period of time, which will in turn
// prevent the node-wide closed timestamp from making progress. This is quite
// unfortunate; we should hoist the quota pool before the reference with the
// closed timestamp tracker is acquired. This is better anyway; right now many
// commands can evaluate but then be blocked on quota, which has worse memory
// behavior.
quotaSize := uint64(proposal.command.Size())
if maxSize := uint64(MaxCommandSize.Get(&r.store.cfg.Settings.SV)); quotaSize > maxSize {
return nil, nil, "", nil, roachpb.NewError(errors.Errorf(
"command is too large: %d bytes (max: %d)", quotaSize, maxSize,
))
}
var err error
proposal.quotaAlloc, err = r.maybeAcquireProposalQuota(ctx, quotaSize)
if err != nil {
return nil, nil, "", nil, roachpb.NewError(err)
}
// Make sure we clean up the proposal if we fail to insert it into the
// proposal buffer successfully. This ensures that we always release any
// quota that we acquire.
//
// NB: the closure reads the local pErr (there are no named results), so
// every error return below must assign to pErr before returning; the
// testing filter and r.propose paths both do.
defer func() {
if pErr != nil {
proposal.releaseQuota()
}
}()
if filter := r.store.TestingKnobs().TestingProposalFilter; filter != nil {
filterArgs := kvserverbase.ProposalFilterArgs{
Ctx: ctx,
Cmd: *proposal.command,
QuotaAlloc: proposal.quotaAlloc,
CmdID: idKey,
Req: *ba,
}
if pErr = filter(filterArgs); pErr != nil {
return nil, nil, "", nil, pErr
}
}
// Hand the token off to propose; from here on the deferred
// tok.DoneIfNotMoved above is a no-op for the moved token.
pErr = r.propose(ctx, proposal, tok.Move(ctx))
if pErr != nil {
return nil, nil, "", nil, pErr
}
// Abandoning a proposal unbinds its context so that the proposal's client
// is free to terminate execution. However, it does nothing to try to
// prevent the command from succeeding. In particular, endCmds will still be
// invoked when the command is applied. There are a handful of cases where
// the command may not be applied (or even processed): the process crashes
// or the local replica is removed from the range.
abandon := func() {
// The proposal may or may not be in the Replica's proposals map.
// Instead of trying to look it up, simply modify the captured object
// directly. The raftMu must be locked to modify the context of a
// proposal because as soon as we propose a command to Raft, ownership
// passes to the "below Raft" machinery.
r.raftMu.Lock()
defer r.raftMu.Unlock()
r.mu.Lock()
defer r.mu.Unlock()
// TODO(radu): Should this context be created via tracer.ForkSpan?
// We'd need to make sure the span is finished eventually.
proposal.ctx = r.AnnotateCtx(context.TODO())
}
return proposalCh, abandon, idKey, writeBytes, nil
}
// propose encodes a command, starts tracking it, and proposes it to Raft.
//
// The method hands ownership of the command over to the Raft machinery. After
// the method returns, all access to the command must be performed while holding
// Replica.mu and Replica.raftMu.
//
// propose takes ownership of the supplied token; the caller should tok.Move()
// it into this method. It will be used to untrack the request once it comes out
// of the proposal buffer.
//
// A non-nil pErr is returned when the replica descriptor cannot be obtained,
// when a ChangeReplicasTrigger fails the leaseholder check, when an AddSSTable
// proposal carries nil data, when marshaling the command fails, or when the
// proposal buffer rejects the insert.
func (r *Replica) propose(
ctx context.Context, p *ProposalData, tok TrackedRequestToken,
) (pErr *roachpb.Error) {
defer tok.DoneIfNotMoved(ctx)
// If an error occurs reset the command's MaxLeaseIndex to its initial value.
// Failure to propose will propagate to the client. An invariant of this
// package is that proposals which are finished carry a raft command with a
// MaxLeaseIndex equal to the proposal command's max lease index.
//
// NB: the previous value is captured as an argument when the defer
// statement executes, i.e. before the field is zeroed below.
defer func(prev uint64) {
if pErr != nil {
p.command.MaxLeaseIndex = prev
}
}(p.command.MaxLeaseIndex)
// Make sure the maximum lease index is unset. This field will be set in
// propBuf.Insert and its encoded bytes will be appended to the encoding
// buffer as a MaxLeaseFooter.
p.command.MaxLeaseIndex = 0
// Determine the encoding style for the Raft command.
prefix := true
version := kvserverbase.RaftVersionStandard
if crt := p.command.ReplicatedEvalResult.ChangeReplicas; crt != nil {
// EndTxnRequest with a ChangeReplicasTrigger is special because Raft
// needs to understand it; it cannot simply be an opaque command. To
// permit this, the command is proposed by the proposal buffer using
// ProposeConfChange. For that reason, we also don't need a Raft command
// prefix because the command ID is stored in a field in
// raft.ConfChange.
log.KvDistribution.Infof(p.ctx, "proposing %s", crt)
prefix = false
// The following deals with removing a leaseholder. A voter can be removed
// in two ways. 1) Simple (old style) where there is a reconfiguration
// turning a voter into a LEARNER / NON-VOTER. 2) Through an intermediate
// joint configuration, where the replica remains in the descriptor, but
// as VOTER_{OUTGOING, DEMOTING}. When leaving the JOINT config (a second
// Raft operation), the removed replica transitions to a LEARNER / NON-VOTER.
//
// In case (1) the lease needs to be transferred out before a removal is
// proposed (cooperative transfer). The code below permits leaseholder
// removal only if entering a joint configuration (option 2 above) in which
// the leaseholder is (any kind of) voter, and in addition, this joint config
// should include a VOTER_INCOMING replica. In this case, the lease is
// transferred to this new replica in maybeLeaveAtomicChangeReplicas right
// before we exit the joint configuration.
//
// When the leaseholder is replaced by a new replica, transferring the
// lease in the joint config allows transferring directly from old to new,
// since both are active in the joint config, without going through a third
// node or adding the new node before transferring, which might reduce
// fault tolerance. For example, consider v1 in region1 (leaseholder), v2
// in region2 and v3 in region3. We want to relocate v1 to a new node v4 in
// region1. We add v4 as LEARNER. At this point we can't transfer the lease
// to v4, so we could transfer it to v2 first, but this is likely to hurt
// application performance. We could instead add v4 as VOTER first, and
// then transfer lease directly to v4, but this would change the number of
// replicas to 4, and if region1 goes down, we lose quorum. Instead,
// we move to a joint config where v1 (VOTER_DEMOTING_LEARNER) transfers the
// lease to v4 (VOTER_INCOMING) directly.
//
// Our implementation assumes that the intention of the caller is for the
// VOTER_INCOMING node to be the replacement replica, and hence get the
// lease. We therefore don't dynamically select a lease target during the
// joint config, and hand it to the VOTER_INCOMING node. This means,
// however, that we only allow a VOTER_DEMOTING to have the lease in a
// joint configuration, when there's also a VOTER_INCOMING node (that
// will be used as a target for the lease transfer). Otherwise, the caller
// is expected to shed the lease before entering a joint configuration.
// See also https://github.com/cockroachdb/cockroach/issues/67740.
lhDesc, err := r.GetReplicaDescriptor()
if err != nil {
return roachpb.NewError(err)
}
proposedDesc := p.command.ReplicatedEvalResult.State.Desc
// This is a reconfiguration command, we make sure the proposed
// config is legal w.r.t. the current leaseholder: we now allow the
// leaseholder to be a VOTER_DEMOTING as long as there is a VOTER_INCOMING.
// Otherwise, the leaseholder must be a full voter in the target config.
// This check won't allow exiting the joint config before the lease is
// transferred away. The previous leaseholder is a LEARNER in the target config,
// and therefore shouldn't continue holding the lease.
if err := roachpb.CheckCanReceiveLease(
lhDesc, proposedDesc.Replicas(), true, /* wasLastLeaseholder */
); err != nil {
e := errors.Mark(errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+
"remove self (leaseholder); lhRemovalAllowed: %v; current desc: %v; proposed desc: %v",
lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc), errMarkInvalidReplicationChange)
log.Errorf(p.ctx, "%v", e)
return roachpb.NewError(e)
}
} else if p.command.ReplicatedEvalResult.AddSSTable != nil {
log.VEvent(p.ctx, 4, "sideloadable proposal detected")
version = kvserverbase.RaftVersionSideloaded
// NB: the metric is incremented before the nil-data check below, so a
// rejected empty SSTable proposal is still counted.
r.store.metrics.AddSSTableProposals.Inc(1)
if p.command.ReplicatedEvalResult.AddSSTable.Data == nil {
return roachpb.NewErrorf("cannot sideload empty SSTable")
}
} else if log.V(4) {
log.Infof(p.ctx, "proposing command %x: %s", p.idKey, p.Request.Summary())
}
// Create encoding buffer.
preLen := 0
if prefix {
preLen = kvserverbase.RaftCommandPrefixLen
}
cmdLen := p.command.Size()
// Allocate the data slice with enough capacity to eventually hold the two
// "footers" that are filled later.
needed := preLen + cmdLen + kvserverpb.MaxRaftCommandFooterSize()
data := make([]byte, preLen, needed)
// Encode prefix with command ID, if necessary.
if prefix {
kvserverbase.EncodeRaftCommandPrefix(data, version, p.idKey)
}
// Encode body of command.
// The slice is first extended (within its capacity) to cover the command
// body, which is then marshaled in place right after the prefix.
data = data[:preLen+cmdLen]
if _, err := protoutil.MarshalTo(p.command, data[preLen:]); err != nil {
return roachpb.NewError(err)
}
p.encodedCommand = data
// Too verbose even for verbose logging, so manually enable if you want to
// debug proposal sizes.
if false {
log.Infof(p.ctx, `%s: proposal: %d
RaftCommand.ReplicatedEvalResult: %d
RaftCommand.ReplicatedEvalResult.Delta: %d
RaftCommand.WriteBatch: %d
`, p.Request.Summary(), cmdLen,
p.command.ReplicatedEvalResult.Size(),
p.command.ReplicatedEvalResult.Delta.Size(),
p.command.WriteBatch.Size(),
)
}
// Log an event if this is a large proposal. These are more likely to cause
// blips or worse, and it's good to be able to pick them from traces.
//
// TODO(tschottdorf): can we mark them so lightstep can group them?
const largeProposalEventThresholdBytes = 2 << 19 // NB: 2 << 19 == 1 << 20, i.e. 1 MiB (an earlier comment said 512kb).
if cmdLen > largeProposalEventThresholdBytes {
log.Eventf(p.ctx, "proposal is large: %s", humanizeutil.IBytes(int64(cmdLen)))
}
// Insert into the proposal buffer, which passes the command to Raft to be
// proposed. The proposal buffer assigns the command a maximum lease index
// when it sequences it.
//
// NB: we must not hold r.mu while using the proposal buffer, see comment
// on the field.
err := r.mu.proposalBuf.Insert(ctx, p, tok.Move(ctx))
if err != nil {
return roachpb.NewError(err)
}
return nil
}
// numPendingProposalsRLocked returns the sum of the number of entries in
// r.mu.proposals and the value reported by the proposal buffer's
// AllocatedIdx. As the RLocked suffix indicates, the caller is expected to
// hold (at least) a read lock on r.mu.
func (r *Replica) numPendingProposalsRLocked() int {
	tracked := len(r.mu.proposals)
	buffered := r.mu.proposalBuf.AllocatedIdx()
	return tracked + buffered
}
// hasPendingProposalsRLocked is part of the quiescer interface. It reports
// whether this node has any outstanding proposals. Since a client may be
// waiting on the outcome of such proposals, the range must not quiesce while
// they are in flight.
//
// The result only covers this node's proposals. If this node is the current
// leaseholder, previous leaseholders may still be waiting on proposals of
// their own; if it is not, the current leaseholder may have pending
// proposals. The method is called in two places: on the current leaseholder
// when it decides whether to attempt to quiesce the range, and on every
// follower to confirm that the range can indeed be quiesced.
func (r *Replica) hasPendingProposalsRLocked() bool {
	if r.numPendingProposalsRLocked() > 0 {
		return true
	}
	// Slow proposals may have just finished without refreshProposalsLocked
	// having run since. Quiescing before that refresh would mean this
	// Replica's contribution to `requests.slow.raft` is never fully reset, so
	// keep reporting pending proposals until one last refresh has reset the
	// counter, i.e. for a few ticks at most.
	return r.mu.slowProposalCount > 0
}
// hasPendingProposalQuotaRLocked is part of the quiescer interface. It
// reports whether this node's quota pool is still tracking commands that have
// not finished replicating (i.e. commands not yet acked by all live
// replicas). A nil quota pool is reported as having pending quota.
//
// Quiescing with outstanding quota is not allowed: the quota would not be
// released while quiesced, which could in turn prevent the range from
// unquiescing (leading to deadlock). See #46699.
func (r *Replica) hasPendingProposalQuotaRLocked() bool {
	pool := r.mu.proposalQuota
	return pool == nil || !pool.Full()
}
// errRemoved is a package-level sentinel error with the message
// "replica removed".
// NOTE(review): its call sites are outside this view; presumably it is
// returned when an operation targets a replica that has been removed — confirm.
var errRemoved = errors.New("replica removed")
// stepRaftGroup feeds the message carried by req into the replica's RawNode
// via Step, first making sure the replica is unquiesced and ready to handle
// it. raft.ErrProposalDropped from Step is swallowed; any other error is
// returned to the caller.
func (r *Replica) stepRaftGroup(req *kvserverpb.RaftMessageRequest) error {
	step := func(rn *raft.RawNode) (bool, error) {
		// The message comes from another replica, which therefore is not
		// quiesced, so there is no need to wake the leader. We also avoid
		// campaigning when receiving raft messages, because we expect the
		// originator to campaign instead.
		r.maybeUnquiesceWithOptionsLocked(false /* campaignOnWake */)
		r.mu.lastUpdateTimes.update(req.FromReplica.ReplicaID, timeutil.Now())

		if req.Message.Type == raftpb.MsgSnap {
			// Occasionally a snapshot message may arrive under an outdated
			// term, which would lead to Raft discarding the snapshot. This
			// should be really rare in practice, but it does happen in tests
			// and in particular can happen to the synchronous snapshots on the
			// learner path, which will then have to wait for the raft snapshot
			// queue to send another snapshot. However, in some tests it is
			// desirable to disable the raft snapshot queue. Bumping the
			// message's term to our current term makes that possible.
			//
			// See TestReportUnreachableRemoveRace for the test that prompted
			// this addition.
			if current := rn.BasicStatus().Term; req.Message.Term < current {
				req.Message.Term = current
			}
		}

		if err := rn.Step(req.Message); err != nil && !errors.Is(err, raft.ErrProposalDropped) {
			return false /* unquiesceAndWakeLeader */, err
		}
		// Either Step succeeded, or a proposal was forwarded to this replica
		// but we couldn't propose it. In the latter case the error is
		// swallowed since we don't have an effective way of signaling it to
		// the sender.
		// TODO(bdarnell): Handle ErrProposalDropped better.
		// https://github.com/cockroachdb/cockroach/issues/21849
		return false /* unquiesceAndWakeLeader */, nil
	}
	// We're processing an incoming raft message (from a batch that may
	// include MsgVotes), so don't campaign if we wake up our raft group.
	return r.withRaftGroup(false, step)
}
// handleSnapshotStats records what happened to an incoming snapshot during a
// single pass of Raft ready handling. It is embedded in handleRaftReadyStats
// and rendered by handleRaftReadyStats.SafeFormat.
type handleSnapshotStats struct {
// offered is set by handleRaftReadyRaftMuLocked when the IncomingSnapshot it
// was given has a non-nil Desc.
offered bool
// applied distinguishes the two outcomes printed for an offered snapshot:
// "snapshot applied" when true, "snapshot ignored" when false.
applied bool
}
// handleRaftReadyStats collects timing and volume information about a single
// invocation of Raft ready handling. The t*Begin/t*End pairs delimit phases
// whose durations are reported by SafeFormat; time not covered by the append,
// apply, commit and snapshot phases is reported there as "other".
type handleRaftReadyStats struct {
// tBegin and tEnd bracket the whole handleRaftReadyRaftMuLocked call.
tBegin, tEnd time.Time
// Phase reported as "apply", together with the apply statistics.
tApplicationBegin, tApplicationEnd time.Time
apply applyCommittedEntriesStats
// Phase reported as "append". The counts and byte sizes are reported
// separately as "append-ent" (regular) and "append-sst" (sideloaded).
tAppendBegin, tAppendEnd time.Time
appendedRegularCount int
appendedSideloadedCount int
appendedSideloadedBytes int64
appendedRegularBytes int64
// Phase reported as "commit-batch"; pebbleBatchBytes is reported as the
// number of bytes written.
tPebbleCommitBegin, tPebbleCommitEnd time.Time
pebbleBatchBytes int64
// Phase reported as "snap" (only printed when its duration is positive),
// together with the snapshot outcome.
tSnapBegin, tSnapEnd time.Time
snap handleSnapshotStats
// sync adds a "-sync" suffix to the commit-batch label and a " sync" marker
// after the written size.
// NOTE(review): presumably true when the batch commit was synced — confirm.
sync bool
}
// SafeFormat implements redact.SafeFormatter. It renders a one-line summary
// of the stats: the total handling duration broken down by phase (append,
// apply, commit-batch, optionally snap, and the unaccounted remainder), the
// number of bytes written, per-category entry counts and sizes, the number of
// state assertions, and the outcome of an offered snapshot.
func (s handleRaftReadyStats) SafeFormat(p redact.SafePrinter, _ rune) {
	var (
		totalDur  = s.tEnd.Sub(s.tBegin)
		appendDur = s.tAppendEnd.Sub(s.tAppendBegin)
		applyDur  = s.tApplicationEnd.Sub(s.tApplicationBegin)
		commitDur = s.tPebbleCommitEnd.Sub(s.tPebbleCommitBegin)
		snapDur   = s.tSnapEnd.Sub(s.tSnapBegin)
		// Whatever is not attributed to one of the phases above.
		otherDur = totalDur - snapDur - appendDur - applyDur - commitDur
	)

	// Timing section.
	var syncSuffix redact.SafeString
	if s.sync {
		syncSuffix = "-sync"
	}
	p.Printf("raft ready handling: %.2fs [append=%.2fs, apply=%.2fs, commit-batch%s=%.2fs",
		totalDur.Seconds(), appendDur.Seconds(), applyDur.Seconds(), syncSuffix, commitDur.Seconds())
	if snapDur > 0 {
		p.Printf(", snap=%.2fs", snapDur.Seconds())
	}
	p.Printf(", other=%.2fs]", otherDur.Seconds())

	// Volume section.
	p.Printf(", wrote %s", humanizeutil.IBytes(s.pebbleBatchBytes))
	if s.sync {
		p.SafeString(" sync")
	}
	p.SafeString(" [")
	if count, size := s.appendedRegularCount, s.appendedRegularBytes; size > 0 || count > 0 {
		p.Printf("append-ent=%s (%d), ", humanizeutil.IBytes(size), count)
	}
	if count, size := s.appendedSideloadedCount, s.appendedSideloadedBytes; size > 0 || count > 0 {
		p.Printf("append-sst=%s (%d), ", humanizeutil.IBytes(size), count)
	}
	if count, size := s.apply.entriesProcessed, s.apply.entriesProcessedBytes; size > 0 || count > 0 {
		p.Printf("apply=%s (%d", humanizeutil.IBytes(size), count)
		if batches := s.apply.batchesProcessed; batches > 1 {
			p.Printf(" in %d batches", batches)
		}
		p.SafeString(")")
	}
	p.SafeString("]")

	if assertions := s.apply.stateAssertions; assertions > 0 {
		p.Printf(", state_assertions=%d", assertions)
	}

	// Snapshot outcome; nothing is printed unless a snapshot was offered.
	switch {
	case !s.snap.offered:
	case s.snap.applied:
		p.Printf(", snapshot applied")
	default:
		p.Printf(", snapshot ignored")
	}
}
// String returns the SafeFormat rendering of the stats with redaction
// markers stripped.
func (s handleRaftReadyStats) String() string {
	var formatter redact.SafeFormatter = s
	return redact.StringWithoutMarkers(formatter)
}
// noSnap can be passed to handleRaftReady when no snapshot should be processed.
// It is the zero IncomingSnapshot; handleRaftReadyRaftMuLocked only marks a
// snapshot as offered when the IncomingSnapshot's Desc is non-nil.
var noSnap IncomingSnapshot
// handleRaftReady processes a raft.Ready containing entries and messages that
// are ready to read, be saved to stable storage, committed, or sent to other
// peers. A non-empty IncomingSnapshot indicates that a snapshot is about to
// be processed; pass noSnap otherwise.
//
// The method acquires the replica's raftMu for the duration of the call and
// delegates to handleRaftReadyRaftMuLocked.
//
// The returned string is nonzero whenever an error is returned to give a
// non-sensitive cue as to what happened.
func (r *Replica) handleRaftReady(
	ctx context.Context, inSnap IncomingSnapshot,
) (handleRaftReadyStats, string, error) {
	r.raftMu.Lock()
	defer r.raftMu.Unlock()

	stats, cue, err := r.handleRaftReadyRaftMuLocked(ctx, inSnap)
	return stats, cue, err
}
// handleRaftReadyRaftMuLocked is the same as handleRaftReady but requires that
// the replica's raftMu be held.
//
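// The returned string is intended to be non-empty whenever an error is
// returned, to give a non-sensitive cue as to what happened. NB: the
// cancellable-context assertion at the top of the function is an exception; it
// returns an empty string alongside its error.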
func (r *Replica) handleRaftReadyRaftMuLocked(
ctx context.Context, inSnap IncomingSnapshot,
) (stats handleRaftReadyStats, _ string, _ error) {
// handleRaftReadyRaftMuLocked is not prepared to handle context cancellation,
// so assert that it's given a non-cancellable context.
if ctx.Done() != nil {
return handleRaftReadyStats{}, "", errors.AssertionFailedf(
"handleRaftReadyRaftMuLocked cannot be called with a cancellable context")
}
stats = handleRaftReadyStats{
tBegin: timeutil.Now(),
}
defer func() {
// NB: we need to reference the named return parameter here. If `stats` were
// just a local, we'd be modifying the local but not the return value.
stats.tEnd = timeutil.Now()
}()
if inSnap.Desc != nil {
stats.snap.offered = true
}
var hasReady bool
var rd raft.Ready
r.mu.Lock()
state := raftLogState{ // used for append below
lastIndex: r.mu.lastIndex,
lastTerm: r.mu.lastTerm,
byteSize: r.mu.raftLogSize,
}
leaderID := r.mu.leaderID
lastLeaderID := leaderID
err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) {
numFlushed, err := r.mu.proposalBuf.FlushLockedWithRaftGroup(ctx, raftGroup)
if err != nil {
return false, err
}
if hasReady = raftGroup.HasReady(); hasReady {
rd = raftGroup.Ready()
}
// We unquiesce if we have a Ready (= there's work to do). We also have
// to unquiesce if we just flushed some proposals but there isn't a
// Ready, which can happen if the proposals got dropped (raft does this
// if it doesn't know who the leader is). And, for extra defense in depth,
// we also unquiesce if there are outstanding proposals.
//
// NB: if we had the invariant that the group can only be in quiesced
// state if it knows the leader (state.Lead) AND we knew that raft would
// never give us an empty ready here (i.e. the only reason to drop a
// proposal is not knowing the leader) then numFlushed would not be
// necessary. The latter is likely true but we don't want to rely on
// it. The former is maybe true, but there's no easy way to enforce it.
unquiesceAndWakeLeader := hasReady || numFlushed > 0 || len(r.mu.proposals) > 0
return unquiesceAndWakeLeader, nil
})
r.mu.applyingEntries = len(rd.CommittedEntries) > 0
pausedFollowers := r.mu.pausedFollowers
r.mu.Unlock()
if errors.Is(err, errRemoved) {
// If we've been removed then just return.
return stats, "", nil
} else if err != nil {
const expl = "while checking raft group for Ready"
return stats, expl, errors.Wrap(err, expl)
}
if !hasReady {
// We must update the proposal quota even if we don't have a ready.
// Consider the case when our quota is of size 1 and two out of three
// replicas have committed one log entry while the third is lagging
// behind. When the third replica finally does catch up and sends
// along a MsgAppResp, since the entry is already committed on the
// leader replica, no Ready is emitted. But given that the third
// replica has caught up, we can release
// some quota back to the pool.
r.updateProposalQuotaRaftMuLocked(ctx, lastLeaderID)
return stats, "", nil
}
logRaftReady(ctx, rd)
refreshReason := noReason
if rd.SoftState != nil && leaderID != roachpb.ReplicaID(rd.SoftState.Lead) {
// Refresh pending commands if the Raft leader has changed. This is usually
// the first indication we have of a new leader on a restarted node.
//
// TODO(peter): Re-proposing commands when SoftState.Lead changes can lead
// to wasteful multiple-reproposals when we later see an empty Raft command
// indicating a newly elected leader or a conf change. Replay protection
// prevents any corruption, so the waste is only a performance issue.
if log.V(3) {
log.Infof(ctx, "raft leader changed: %d -> %d", leaderID, rd.SoftState.Lead)
}
if !r.store.TestingKnobs().DisableRefreshReasonNewLeader {
refreshReason = reasonNewLeader
}
leaderID = roachpb.ReplicaID(rd.SoftState.Lead)
}
if inSnap.Desc != nil {
if !raft.IsEmptySnap(rd.Snapshot) {
snapUUID, err := uuid.FromBytes(rd.Snapshot.Data)
if err != nil {
const expl = "invalid snapshot id"
return stats, expl, errors.Wrap(err, expl)
}
if inSnap.SnapUUID == (uuid.UUID{}) {
log.Fatalf(ctx, "programming error: a snapshot application was attempted outside of the streaming snapshot codepath")
}
if snapUUID != inSnap.SnapUUID {
log.Fatalf(ctx, "incoming snapshot id doesn't match raft snapshot id: %s != %s", snapUUID, inSnap.SnapUUID)
}
// Applying this snapshot may require us to subsume one or more of our right
// neighbors. This occurs if this replica is informed about the merges via a
// Raft snapshot instead of a MsgApp containing the merge commits, e.g.,
// because it went offline before the merge commits applied and did not come
// back online until after the merge commits were truncated away.
subsumedRepls, releaseMergeLock := r.maybeAcquireSnapshotMergeLock(ctx, inSnap)
defer releaseMergeLock()
stats.tSnapBegin = timeutil.Now()
if err := r.applySnapshot(ctx, inSnap, rd.Snapshot, rd.HardState, subsumedRepls); err != nil {
const expl = "while applying snapshot"
return stats, expl, errors.Wrap(err, expl)
}
stats.tSnapEnd = timeutil.Now()
stats.snap.applied = true
// r.mu.lastIndex, r.mu.lastTerm and r.mu.raftLogSize were updated in
// applySnapshot, but we also want to make sure we reflect these changes in
// the local variables we're tracking here.
r.mu.RLock()
state = raftLogState{
lastIndex: r.mu.lastIndex,
lastTerm: r.mu.lastTerm,
byteSize: r.mu.raftLogSize,
}
r.mu.RUnlock()
// We refresh pending commands after applying a snapshot because this
// replica may have been temporarily partitioned from the Raft group and
// missed leadership changes that occurred. Suppose node A is the leader,
// and then node C gets partitioned away from the others. Leadership passes
// back and forth between A and B during the partition, but when the
// partition is healed node A is leader again.
if !r.store.TestingKnobs().DisableRefreshReasonSnapshotApplied &&
refreshReason == noReason {
refreshReason = reasonSnapshotApplied
}
}
} else if !raft.IsEmptySnap(rd.Snapshot) {
// If we didn't expect Raft to have a snapshot but it has one
// regardless, that is unexpected and indicates a programming
// error.
err := makeNonDeterministicFailure(
"have inSnap=nil, but raft has a snapshot %s",
raft.DescribeSnapshot(rd.Snapshot),
)
return stats, getNonDeterministicFailureExplanation(err), err
}
// If the ready struct includes entries that have been committed, these
// entries will be applied to the Replica's replicated state machine down
// below, after appending new entries to the raft log and sending messages
// to peers. However, the process of appending new entries to the raft log
// and then applying committed entries to the state machine can take some
// time - and these entries are already durably committed. If they have
// clients waiting on them, we'd like to acknowledge their success as soon
// as possible. To facilitate this, we take a quick pass over the committed
// entries and acknowledge as many as we can trivially prove will not be
// rejected beneath raft.
//
// Note that the CommittedEntries slice may contain entries that are also in
// the Entries slice (to be appended in this ready pass). This can happen when
// a follower is being caught up on committed commands. We could acknowledge
// these commands early even though they aren't durably in the local raft log
// yet (since they're committed via a quorum elsewhere), but we chose to be
// conservative and avoid it by passing the last Ready cycle's `lastIndex` for
// the maxIndex argument to AckCommittedEntriesBeforeApplication.
sm := r.getStateMachine()
dec := r.getDecoder()
appTask := apply.MakeTask(sm, dec)
appTask.SetMaxBatchSize(r.store.TestingKnobs().MaxApplicationBatchSize)
defer appTask.Close()
if err := appTask.Decode(ctx, rd.CommittedEntries); err != nil {
return stats, getNonDeterministicFailureExplanation(err), err
}
if knobs := r.store.TestingKnobs(); knobs == nil || !knobs.DisableCanAckBeforeApplication {
if err := appTask.AckCommittedEntriesBeforeApplication(ctx, state.lastIndex); err != nil {
return stats, getNonDeterministicFailureExplanation(err), err
}
}
// Separate the MsgApp messages from all other Raft message types so that we
// can take advantage of the optimization discussed in the Raft thesis under
// the section: `10.2.1 Writing to the leader’s disk in parallel`. The
// optimization suggests that instead of a leader writing new log entries to
// disk before replicating them to its followers, the leader can instead
// write the entries to disk in parallel with replicating to its followers
// and them writing to their disks.
//
// Here, we invoke this optimization by:
// 1. sending all MsgApps.
// 2. syncing all entries and Raft state to disk.
// 3. sending all other messages.
//
// Since this is all handled in handleRaftReadyRaftMuLocked, we're assured
// that even though we may sync new entries to disk after sending them in
// MsgApps to followers, we'll always have them synced to disk before we
// process followers' MsgAppResps for the corresponding entries because
// Ready processing is sequential (and because a restart of the leader would
// prevent the MsgAppResp from being handled by it). This is important
// because it makes sure that the leader always has all of the entries in
// the log for its term, which is required in etcd/raft for technical
// reasons[1].
//
// MsgApps are also used to inform followers of committed entries through
// the Commit index that they contain. Due to the optimization described
// above, a Commit index may be sent out to a follower before it is
// persisted on the leader. This is safe because the Commit index can be
// treated as volatile state, as is supported by raft.MustSync[2].
// Additionally, the Commit index can never refer to entries from the
// current Ready (due to the MsgAppResp argument above) except in
// single-node groups, in which as a result we have to be careful to not
// persist a Commit index without the entries its commit index might refer
// to (see the HardState update below for details).
//
// [1]: the Raft thesis states that this can be made safe:
//
// > The leader may even commit an entry before it has been written to its
// > own disk, if a majority of followers have written it to their disks;
// > this is still safe.
//
// [2]: Raft thesis section: `3.8 Persisted state and server restarts`:
//
// > Other state variables are safe to lose on a restart, as they can all be
// > recreated. The most interesting example is the commit index, which can
// > safely be reinitialized to zero on a restart.
//
// Note that this will change when joint quorums are implemented, at which
// point we have to introduce coupling between the Commit index and
// persisted config changes, and also require some commit indexes to be
// durably synced.
// See:
// https://github.com/etcd-io/etcd/issues/7625#issuecomment-489232411
msgApps, otherMsgs := splitMsgApps(rd.Messages)
r.traceMessageSends(msgApps, "sending msgApp")
r.sendRaftMessagesRaftMuLocked(ctx, msgApps, pausedFollowers)
prevLastIndex := state.lastIndex
// TODO(pavelkalinnikov): find a way to move it to storeEntries.
if !raft.IsEmptyHardState(rd.HardState) {
if !r.IsInitialized() && rd.HardState.Commit != 0 {
log.Fatalf(ctx, "setting non-zero HardState.Commit on uninitialized replica %s. HS=%+v", r, rd.HardState)
}
}
// TODO(pavelkalinnikov): construct and store this in Replica.
s := logStore{
engine: r.store.engine,
sideload: r.raftMu.sideloaded,
stateLoader: r.raftMu.stateLoader,
settings: r.store.cfg.Settings,
metrics: r.store.metrics,
}
if state, err = s.storeEntries(ctx, state, rd, &stats); err != nil {
const expl = "while storing log entries"
return stats, expl, err
}
if len(rd.Entries) > 0 {
// We may have just overwritten parts of the log which contain
// sideloaded SSTables from a previous term (and perhaps discarded some
// entries that we didn't overwrite). Remove any such leftover on-disk
// payloads (we can do that now because we've committed the deletion
// just above).
firstPurge := rd.Entries[0].Index // first new entry written
purgeTerm := rd.Entries[0].Term - 1
lastPurge := prevLastIndex // old end of the log, include in deletion
purgedSize, err := maybePurgeSideloaded(ctx, r.raftMu.sideloaded, firstPurge, lastPurge, purgeTerm)
if err != nil {
const expl = "while purging sideloaded storage"
return stats, expl, err
}
state.byteSize -= purgedSize
if state.byteSize < 0 {
// Might have gone negative if node was recently restarted.
state.byteSize = 0
}
}
// Update protected state - last index, last term, raft log size, and raft
// leader ID.
r.mu.Lock()
// TODO(pavelkalinnikov): put raftLogState to r.mu directly instead of fields.
r.mu.lastIndex = state.lastIndex
r.mu.lastTerm = state.lastTerm
r.mu.raftLogSize = state.byteSize
var becameLeader bool
if r.mu.leaderID != leaderID {
r.mu.leaderID = leaderID
// Clear the remote proposal set. Would have been nil already if not
// previously the leader.
becameLeader = r.mu.leaderID == r.replicaID
}
r.mu.Unlock()
// When becoming the leader, proactively add the replica to the replicate
// queue. We might have been handed leadership by a remote node which wanted
// to remove itself from the range.
if becameLeader && r.store.replicateQueue != nil {
r.store.replicateQueue.MaybeAddAsync(ctx, r, r.store.Clock().NowAsClockTimestamp())
}
// Update raft log entry cache. We clear any older, uncommitted log entries
// and cache the latest ones.
r.store.raftEntryCache.Add(r.RangeID, rd.Entries, true /* truncate */)
r.sendRaftMessagesRaftMuLocked(ctx, otherMsgs, nil /* blocked */)
r.traceEntries(rd.CommittedEntries, "committed, before applying any entries")