-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
failover.go
1797 lines (1558 loc) · 66.8 KB
/
failover.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package tests
import (
"context"
gosql "database/sql"
"fmt"
"math/rand"
"sync"
"time"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
"github.com/cockroachdb/errors"
"github.com/lib/pq"
"github.com/stretchr/testify/require"
)
var rangeLeaseRenewalDuration = func() time.Duration {
var raftCfg base.RaftConfig
raftCfg.SetDefaults()
return raftCfg.RangeLeaseRenewalDuration()
}()
// registerFailover registers a set of failover benchmarks. These tests
// benchmark the maximum unavailability experienced by clients during various
// node failures, and exports them for roachperf graphing. They do not make any
// assertions on recovery time, similarly to other performance benchmarks.
//
// The tests run a kv workload against a cluster while subjecting individual
// nodes to various failures. The workload uses dedicated SQL gateway nodes that
// don't fail, relying on these for error handling and retries. The pMax latency
// seen by any client is exported and graphed. Since recovery times are
// probabilistic, each test performs multiple failures (typically 9) in order to
// find the worst-case recovery time following a failure.
//
// Failures last for 60 seconds before the node is recovered. Thus, the maximum
// measured unavailability is 60 seconds, which in practice means permanent
// unavailability (for some or all clients).
//
// No attempt is made to find the distribution of recovery times (e.g. the
// minimum and median), since this requires more sophisticated data recording
// and analysis. Simply taking the median across all requests is not sufficient,
// since requests are also executed against a healthy cluster between failures,
// and against healthy replicas during failures, thus the vast majority of
// requests are successful with nominal latencies. See also:
// https://github.com/cockroachdb/cockroach/issues/103654
func registerFailover(r registry.Registry) {
for _, leases := range []registry.LeaseType{registry.EpochLeases, registry.ExpirationLeases} {
var leasesStr string
if leases == registry.ExpirationLeases {
leasesStr = "/lease=expiration"
}
for _, readOnly := range []bool{false, true} {
var readOnlyStr string
if readOnly {
readOnlyStr = "/read-only"
} else {
readOnlyStr = "/read-write"
}
r.Add(registry.TestSpec{
Name: "failover/chaos" + readOnlyStr + leasesStr,
Owner: registry.OwnerKV,
Benchmark: true,
Timeout: 90 * time.Minute,
Cluster: r.MakeClusterSpec(10, spec.CPU(2), spec.WorkloadNode(), spec.WorkloadNodeCPU(2), spec.DisableLocalSSD(), spec.ReuseNone()), // uses disk stalls
CompatibleClouds: registry.OnlyGCE, // dmsetup only configured for gce
Suites: registry.Suites(registry.Nightly),
Leases: leases,
SkipPostValidations: registry.PostValidationNoDeadNodes, // cleanup kills nodes
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailoverChaos(ctx, t, c, readOnly)
},
})
}
r.Add(registry.TestSpec{
Name: "failover/partial/lease-gateway" + leasesStr,
Owner: registry.OwnerKV,
Benchmark: true,
Timeout: 30 * time.Minute,
Cluster: r.MakeClusterSpec(8, spec.CPU(2), spec.WorkloadNode(), spec.WorkloadNodeCPU(2)),
CompatibleClouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
Leases: leases,
Run: runFailoverPartialLeaseGateway,
})
r.Add(registry.TestSpec{
Name: "failover/partial/lease-leader" + leasesStr,
Owner: registry.OwnerKV,
Benchmark: true,
Timeout: 30 * time.Minute,
Cluster: r.MakeClusterSpec(7, spec.CPU(2), spec.WorkloadNode(), spec.WorkloadNodeCPU(2)),
CompatibleClouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
Leases: leases,
Run: runFailoverPartialLeaseLeader,
})
r.Add(registry.TestSpec{
Name: "failover/partial/lease-liveness" + leasesStr,
Owner: registry.OwnerKV,
Benchmark: true,
Timeout: 30 * time.Minute,
Cluster: r.MakeClusterSpec(8, spec.CPU(2), spec.WorkloadNode(), spec.WorkloadNodeCPU(2)),
CompatibleClouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly),
Leases: leases,
Run: runFailoverPartialLeaseLiveness,
})
for _, failureMode := range allFailureModes {
clusterOpts := make([]spec.Option, 0)
clusterOpts = append(clusterOpts, spec.CPU(2), spec.WorkloadNode(), spec.WorkloadNodeCPU(2))
clouds := registry.AllExceptAWS
var postValidation registry.PostValidation
if failureMode == failureModeDiskStall {
// Use PDs in an attempt to work around flakes encountered when using
// SSDs. See #97968.
clusterOpts = append(clusterOpts, spec.DisableLocalSSD())
// Don't reuse the cluster for tests that call dmsetup to avoid
// spurious flakes from previous runs. See #107865
clusterOpts = append(clusterOpts, spec.ReuseNone())
postValidation = registry.PostValidationNoDeadNodes
// dmsetup is currently only configured for gce.
clouds = registry.OnlyGCE
}
r.Add(registry.TestSpec{
Name: fmt.Sprintf("failover/non-system/%s%s", failureMode, leasesStr),
Owner: registry.OwnerKV,
Benchmark: true,
Timeout: 30 * time.Minute,
SkipPostValidations: postValidation,
Cluster: r.MakeClusterSpec(7, clusterOpts...),
CompatibleClouds: clouds,
Suites: registry.Suites(registry.Nightly),
Leases: leases,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailoverNonSystem(ctx, t, c, failureMode)
},
})
r.Add(registry.TestSpec{
Name: fmt.Sprintf("failover/liveness/%s%s", failureMode, leasesStr),
Owner: registry.OwnerKV,
CompatibleClouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Weekly),
Benchmark: true,
Timeout: 30 * time.Minute,
SkipPostValidations: postValidation,
Cluster: r.MakeClusterSpec(5, clusterOpts...),
Leases: leases,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailoverLiveness(ctx, t, c, failureMode)
},
})
r.Add(registry.TestSpec{
Name: fmt.Sprintf("failover/system-non-liveness/%s%s", failureMode, leasesStr),
Owner: registry.OwnerKV,
CompatibleClouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Weekly),
Benchmark: true,
Timeout: 30 * time.Minute,
SkipPostValidations: postValidation,
Cluster: r.MakeClusterSpec(7, clusterOpts...),
Leases: leases,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailoverSystemNonLiveness(ctx, t, c, failureMode)
},
})
}
}
}
// runFailoverChaos sets up a 9-node cluster with RF=5 and randomly scattered
// ranges and replicas, and then runs a random failure on one or two random
// nodes for 1 minute with 1 minute recovery, for 20 cycles total. Nodes n1-n2
// are used as SQL gateways, and are not failed to avoid disconnecting the
// client workload.
//
// It runs with either a read-write or read-only KV workload, measuring the pMax
// unavailability for graphing. The read-only workload is useful to test e.g.
// recovering nodes stealing Raft leadership away, since this requires the
// replica to still be up-to-date on the log.
func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readOnly bool) {
require.Equal(t, 10, c.Spec().NodeCount)
rng, _ := randutil.NewTestRand()
// Create cluster, and set up failers for all failure modes.
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
// DistSender circuit breakers are useful for these chaos tests. Turn them on.
// TODO(arul): this can be removed if/when we turn on DistSender circuit
// breakers for all ranges by default.
settings.ClusterSettings["kv.dist_sender.circuit_breakers.mode"] = "all ranges"
m := c.NewMonitor(ctx, c.CRDBNodes())
failers := []Failer{}
for _, failureMode := range allFailureModes {
failer := makeFailerWithoutLocalNoop(t, c, m, failureMode, settings, rng)
if c.IsLocal() && !failer.CanUseLocal() {
t.L().Printf("skipping failure mode %q on local cluster", failureMode)
continue
}
failer.Setup(ctx)
defer failer.Cleanup(ctx)
failers = append(failers, failer)
}
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.CRDBNodes())
conn := c.Conn(ctx, t.L(), 1)
// Place 5 replicas of all ranges on n3-n9, keeping n1-n2 as SQL gateways.
configureAllZones(t, ctx, conn, zoneConfig{replicas: 5, onlyNodes: []int{3, 4, 5, 6, 7, 8, 9}})
// Wait for upreplication.
require.NoError(
t, WaitForReplication(ctx, t, t.L(), conn, 5 /* replicationFactor */, atLeastReplicationFactor),
)
// Create the kv database. If this is a read-only workload, populate it with
// 100.000 keys.
var insertCount int
if readOnly {
insertCount = 100000
}
t.L().Printf("creating workload database")
_, err := conn.ExecContext(ctx, `CREATE DATABASE kv`)
require.NoError(t, err)
c.Run(ctx, option.WithNodes(c.WorkloadNode()), fmt.Sprintf(
`./cockroach workload init kv --splits 1000 --insert-count %d {pgurl:1}`, insertCount))
// Scatter the ranges, then relocate them off of the SQL gateways n1-n2.
t.L().Printf("scattering table")
_, err = conn.ExecContext(ctx, `ALTER TABLE kv.kv SCATTER`)
require.NoError(t, err)
relocateRanges(t, ctx, conn, `true`, []int{1, 2}, []int{3, 4, 5, 6, 7, 8, 9})
// Wait for upreplication of the new ranges.
require.NoError(
t, WaitForReplication(ctx, t, t.L(), conn, 5 /* replicationFactor */, atLeastReplicationFactor),
)
// Run workload on n10 via n1-n2 gateways until test ends (context cancels).
t.L().Printf("running workload")
cancelWorkload := m.GoWithCancel(func(ctx context.Context) error {
readPercent := 50
if readOnly {
readPercent = 100
}
err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), fmt.Sprintf(
`./cockroach workload run kv --read-percent %d --write-seq R%d `+
`--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+
`--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-2}`,
readPercent, insertCount))
if ctx.Err() != nil {
return nil // test requested workload shutdown
}
return err
})
// Start a worker to randomly fail random nodes for 1 minute, with 20 cycles.
m.Go(func(ctx context.Context) error {
defer cancelWorkload()
for i := 0; i < 20; i++ {
t.L().Printf("chaos iteration %d", i)
sleepFor(ctx, t, time.Minute)
// Ranges may occasionally escape their constraints. Move them to where
// they should be.
relocateRanges(t, ctx, conn, `true`, []int{1, 2}, []int{3, 4, 5, 6, 7, 8, 9})
// Randomly sleep up to the lease renewal interval, to vary the time
// between the last lease renewal and the failure.
sleepFor(ctx, t, randutil.RandDuration(rng, rangeLeaseRenewalDuration))
// Pick 1 or 2 random nodes and failure modes. Make sure we call Ready()
// on both before failing, since we may need to fetch information from the
// cluster which won't work if there's an active failure.
nodeFailers := map[int]Failer{}
for numNodes := 1 + rng.Intn(2); len(nodeFailers) < numNodes; {
var node int
for node == 0 || nodeFailers[node] != nil {
node = 3 + rng.Intn(7) // n1-n2 are SQL gateways, n10 is workload runner
}
var failer Failer
for failer == nil {
failer = failers[rng.Intn(len(failers))]
for _, other := range nodeFailers {
if !other.CanRunWith(failer.Mode()) || !failer.CanRunWith(other.Mode()) {
failer = nil // failers aren't compatible, pick a different one
break
}
}
}
if d, ok := failer.(*deadlockFailer); ok { // randomize deadlockFailer
d.numReplicas = 1 + rng.Intn(5)
d.onlyLeaseholders = rng.Float64() < 0.5
}
failer.Ready(ctx, node)
nodeFailers[node] = failer
}
for node, failer := range nodeFailers {
// If the failer supports partial failures (e.g. partial partitions), do
// one with 50% probability against a random node (including SQL
// gateways).
if partialFailer, ok := failer.(PartialFailer); ok && rng.Float64() < 0.5 {
var partialPeer int
for partialPeer == 0 || partialPeer == node {
partialPeer = 1 + rng.Intn(9)
}
t.L().Printf("failing n%d to n%d (%s)", node, partialPeer, failer)
partialFailer.FailPartial(ctx, node, []int{partialPeer})
} else {
t.L().Printf("failing n%d (%s)", node, failer)
failer.Fail(ctx, node)
}
}
sleepFor(ctx, t, time.Minute)
// Recover the failers on different goroutines. Otherwise, they
// might interact as certain failures can prevent other failures
// from recovering.
var wg sync.WaitGroup
for node, failer := range nodeFailers {
wg.Add(1)
node := node
failer := failer
m.Go(func(ctx context.Context) error {
defer wg.Done()
t.L().Printf("recovering n%d (%s)", node, failer)
failer.Recover(ctx, node)
return nil
})
}
wg.Wait()
}
sleepFor(ctx, t, time.Minute) // let cluster recover
return nil
})
m.Wait()
}
// runFailoverPartialLeaseGateway tests a partial network partition between a
// SQL gateway and a user range leaseholder. These must be routed via other
// nodes to be able to serve the request.
//
// Cluster topology:
//
// n1-n3: system ranges and user ranges (2/5 replicas)
// n4-n5: user range leaseholders (2/5 replicas)
// n6-n7: SQL gateways and 1 user replica (1/5 replicas)
//
// 5 user range replicas will be placed on n2-n6, with leases on n4. A partial
// partition will be introduced between n4,n5 and n6,n7, both fully and
// individually. This corresponds to the case where we have three data centers
// with a broken network link between one pair. For example:
//
// n1-n3 (2 replicas, liveness)
// A
// / \
// / \
// n4-n5 B --x-- C n6-n7 <--- n8 (workload)
// (2 replicas, leases) (1 replica, SQL gateways)
//
// Once follower routing is implemented, this tests the following scenarios:
//
// - Routes via followers in both A, B, and C when possible.
// - Skips follower replica on local node that can't reach leaseholder (n6).
// - Skips follower replica in C that can't reach leaseholder (n7 via n6).
// - Skips follower replica in B that's unreachable (n5).
//
// We run a kv50 workload on SQL gateways and collect pMax latency for graphing.
func runFailoverPartialLeaseGateway(ctx context.Context, t test.Test, c cluster.Cluster) {
require.Equal(t, 8, c.Spec().NodeCount)
rng, _ := randutil.NewTestRand()
// Create cluster.
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
m := c.NewMonitor(ctx, c.CRDBNodes())
failer := makeFailer(t, c, m, failureModeBlackhole, settings, rng).(PartialFailer)
failer.Setup(ctx)
defer failer.Cleanup(ctx)
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.CRDBNodes())
conn := c.Conn(ctx, t.L(), 1)
// Place all ranges on n1-n3 to start with.
configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}})
// Wait for upreplication.
require.NoError(t, WaitFor3XReplication(ctx, t, t.L(), conn))
// Create the kv database with 5 replicas on n2-n6, and leases on n4.
t.L().Printf("creating workload database")
_, err := conn.ExecContext(ctx, `CREATE DATABASE kv`)
require.NoError(t, err)
configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{
replicas: 5, onlyNodes: []int{2, 3, 4, 5, 6}, leasePreference: "[+node4]"})
c.Run(ctx, option.WithNodes(c.Node(6)), `./cockroach workload init kv --splits 1000 {pgurl:1}`)
// Wait for the KV table to upreplicate.
waitForUpreplication(t, ctx, conn, `database_name = 'kv'`, 5)
// The replicate queue takes forever to move the ranges, so we do it
// ourselves. Precreating the database/range and moving it to the correct
// nodes first is not sufficient, since workload will spread the ranges across
// all nodes regardless.
relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 7}, []int{2, 3, 4, 5, 6})
relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{4, 5, 6, 7}, []int{1, 2, 3})
relocateLeases(t, ctx, conn, `database_name = 'kv'`, 4)
// Run workload on n8 via n6-n7 gateways until test ends (context cancels).
t.L().Printf("running workload")
cancelWorkload := m.GoWithCancel(func(ctx context.Context) error {
err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+
`--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+
`--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:6-7}`)
if ctx.Err() != nil {
return nil // test requested workload shutdown
}
return err
})
// Start a worker to fail and recover partial partitions between n4,n5
// (leases) and n6,n7 (gateways), both fully and individually, for 3 cycles.
// Leases are only placed on n4.
m.Go(func(ctx context.Context) error {
defer cancelWorkload()
for i := 0; i < 3; i++ {
testcases := []struct {
nodes []int
peers []int
}{
// Fully partition leases from gateways, must route via n1-n3. In
// addition to n4 leaseholder being unreachable, follower on n5 is
// unreachable, and follower replica on n6 can't reach leaseholder.
{[]int{6, 7}, []int{4, 5}},
// Partition n6 (gateway with local follower) from n4 (leaseholder).
// Local follower replica can't reach leaseholder.
{[]int{6}, []int{4}},
// Partition n7 (gateway) from n4 (leaseholder).
{[]int{7}, []int{4}},
}
for _, tc := range testcases {
sleepFor(ctx, t, time.Minute)
// Ranges and leases may occasionally escape their constraints. Move
// them to where they should be.
relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 7}, []int{2, 3, 4, 5, 6})
relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{4, 5, 6, 7}, []int{1, 2, 3})
relocateLeases(t, ctx, conn, `database_name = 'kv'`, 4)
// Randomly sleep up to the lease renewal interval, to vary the time
// between the last lease renewal and the failure.
sleepFor(ctx, t, randutil.RandDuration(rng, rangeLeaseRenewalDuration))
for _, node := range tc.nodes {
failer.Ready(ctx, node)
}
for _, node := range tc.nodes {
t.L().Printf("failing n%d to n%v (%s lease/gateway)", node, tc.peers, failer)
failer.FailPartial(ctx, node, tc.peers)
}
sleepFor(ctx, t, time.Minute)
for _, node := range tc.nodes {
t.L().Printf("recovering n%d to n%v (%s lease/gateway)", node, tc.peers, failer)
failer.Recover(ctx, node)
}
}
}
sleepFor(ctx, t, time.Minute) // let cluster recover
return nil
})
m.Wait()
}
// runFailoverPartialLeaseLeader tests a partial network partition between
// leaseholders and Raft leaders. This will prevent the leaseholder from making
// Raft proposals, but it can still hold onto leases as long as it can heartbeat
// liveness.
//
// Cluster topology:
//
// n1-n3: system and liveness ranges, SQL gateway
// n4-n6: user ranges
//
// The cluster runs with COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER, which
// will place Raft leaders and leases independently of each other. We can then
// assume that some number of user ranges will randomly have split leader/lease,
// and simply create partial partitions between each of n4-n6 in sequence.
//
// We run a kv50 workload on SQL gateways and collect pMax latency for graphing.
func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.Cluster) {
require.Equal(t, 7, c.Spec().NodeCount)
rng, _ := randutil.NewTestRand()
// Create cluster, disabling leader/leaseholder colocation. We only start
// n1-n3, to precisely place system ranges, since we'll have to disable the
// replicate queue shortly.
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
m := c.NewMonitor(ctx, c.CRDBNodes())
failer := makeFailer(t, c, m, failureModeBlackhole, settings, rng).(PartialFailer)
failer.Setup(ctx)
defer failer.Cleanup(ctx)
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(1, 3))
conn := c.Conn(ctx, t.L(), 1)
// Place all ranges on n1-n3 to start with, and wait for upreplication.
configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}})
// NB: We want to ensure the system ranges are all down-replicated from their
// initial RF of 5, so pass in exactlyReplicationFactor below.
require.NoError(t, WaitForReplication(ctx, t, t.L(), conn, 3, exactlyReplicationFactor))
// Now that system ranges are properly placed on n1-n3, start n4-n6.
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.Range(4, 6))
// Create the kv database on n4-n6.
t.L().Printf("creating workload database")
_, err := conn.ExecContext(ctx, `CREATE DATABASE kv`)
require.NoError(t, err)
configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{4, 5, 6}})
c.Run(ctx, option.WithNodes(c.Node(6)), `./cockroach workload init kv --splits 1000 {pgurl:1}`)
// Move ranges to the appropriate nodes. Precreating the database/range and
// moving it to the correct nodes first is not sufficient, since workload will
// spread the ranges across all nodes regardless.
relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3}, []int{4, 5, 6})
relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{4, 5, 6}, []int{1, 2, 3})
// Check that we have a few split leaders/leaseholders on n4-n6. We give
// it a few seconds, since metrics are updated every 10 seconds.
for i := 0; ; i++ {
var count float64
for _, node := range []int{4, 5, 6} {
count += nodeMetric(ctx, t, c, node, "replicas.leaders_not_leaseholders")
}
t.L().Printf("%.0f split leaders/leaseholders", count)
if count >= 3 {
break
} else if i >= 10 {
t.Fatalf("timed out waiting for 3 split leaders/leaseholders")
}
time.Sleep(time.Second)
}
// Run workload on n7 via n1-n3 gateways until test ends (context cancels).
t.L().Printf("running workload")
cancelWorkload := m.GoWithCancel(func(ctx context.Context) error {
err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+
`--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+
`--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`)
if ctx.Err() != nil {
return nil // test requested workload shutdown
}
return err
})
// Start a worker to fail and recover partial partitions between each pair of
// n4-n6 for 3 cycles (9 failures total).
m.Go(func(ctx context.Context) error {
defer cancelWorkload()
for i := 0; i < 3; i++ {
for _, node := range []int{4, 5, 6} {
sleepFor(ctx, t, time.Minute)
// Ranges may occasionally escape their constraints. Move them to where
// they should be.
relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3}, []int{4, 5, 6})
relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{4, 5, 6}, []int{1, 2, 3})
// Randomly sleep up to the lease renewal interval, to vary the time
// between the last lease renewal and the failure.
sleepFor(ctx, t, randutil.RandDuration(rng, rangeLeaseRenewalDuration))
failer.Ready(ctx, node)
peer := node + 1
if peer > 6 {
peer = 4
}
t.L().Printf("failing n%d to n%d (%s lease/leader)", node, peer, failer)
failer.FailPartial(ctx, node, []int{peer})
sleepFor(ctx, t, time.Minute)
t.L().Printf("recovering n%d to n%d (%s lease/leader)", node, peer, failer)
failer.Recover(ctx, node)
}
}
sleepFor(ctx, t, time.Minute) // let cluster recover
return nil
})
m.Wait()
}
// runFailoverPartialLeaseLiveness tests a partial network partition between a
// leaseholder and node liveness. With epoch leases we would normally expect
// this to recover shortly, since the node can't heartbeat its liveness record
// and thus its leases will expire. However, it will maintain Raft leadership,
// and we prevent non-leaders from acquiring leases, which can prevent the lease
// from moving unless we explicitly handle this. See also:
// https://github.com/cockroachdb/cockroach/pull/87244.
//
// Cluster topology:
//
// n1-n3: system ranges and SQL gateways
// n4: liveness leaseholder
// n5-7: user ranges
//
// A partial blackhole network partition is triggered between n4 and each of
// n5-n7 sequentially, 3 times per node for a total of 9 times. A kv50 workload
// is running against SQL gateways on n1-n3, and we collect the pMax latency for
// graphing.
func runFailoverPartialLeaseLiveness(ctx context.Context, t test.Test, c cluster.Cluster) {
require.Equal(t, 8, c.Spec().NodeCount)
rng, _ := randutil.NewTestRand()
// Create cluster.
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
m := c.NewMonitor(ctx, c.CRDBNodes())
failer := makeFailer(t, c, m, failureModeBlackhole, settings, rng).(PartialFailer)
failer.Setup(ctx)
defer failer.Cleanup(ctx)
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.CRDBNodes())
conn := c.Conn(ctx, t.L(), 1)
// Place all ranges on n1-n3, and an extra liveness leaseholder replica on n4.
configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}})
configureZone(t, ctx, conn, `RANGE liveness`, zoneConfig{
replicas: 4, onlyNodes: []int{1, 2, 3, 4}, leasePreference: "[+node4]"})
// Wait for upreplication.
require.NoError(t, WaitFor3XReplication(ctx, t, t.L(), conn))
// Create the kv database on n5-n7.
t.L().Printf("creating workload database")
_, err := conn.ExecContext(ctx, `CREATE DATABASE kv`)
require.NoError(t, err)
configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{5, 6, 7}})
c.Run(ctx, option.WithNodes(c.Node(6)), `./cockroach workload init kv --splits 1000 {pgurl:1}`)
// The replicate queue takes forever to move the ranges, so we do it
// ourselves. Precreating the database/range and moving it to the correct
// nodes first is not sufficient, since workload will spread the ranges across
// all nodes regardless.
relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3, 4}, []int{5, 6, 7})
relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{5, 6, 7}, []int{1, 2, 3, 4})
relocateRanges(t, ctx, conn, `range_id != 2`, []int{4}, []int{1, 2, 3})
// Run workload on n8 using n1-n3 as gateways (not partitioned) until test
// ends (context cancels).
t.L().Printf("running workload")
cancelWorkload := m.GoWithCancel(func(ctx context.Context) error {
err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+
`--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+
`--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`)
if ctx.Err() != nil {
return nil // test requested workload shutdown
}
return err
})
// Start a worker to fail and recover partial partitions between n4 (liveness)
// and workload leaseholders n5-n7 for 1 minute each, 3 times per node for 9
// times total.
m.Go(func(ctx context.Context) error {
defer cancelWorkload()
for i := 0; i < 3; i++ {
for _, node := range []int{5, 6, 7} {
sleepFor(ctx, t, time.Minute)
// Ranges and leases may occasionally escape their constraints. Move
// them to where they should be.
relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3, 4}, []int{5, 6, 7})
relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{node}, []int{1, 2, 3})
relocateRanges(t, ctx, conn, `range_id = 2`, []int{5, 6, 7}, []int{1, 2, 3, 4})
relocateLeases(t, ctx, conn, `range_id = 2`, 4)
// Randomly sleep up to the lease renewal interval, to vary the time
// between the last lease renewal and the failure.
sleepFor(ctx, t, randutil.RandDuration(rng, rangeLeaseRenewalDuration))
failer.Ready(ctx, node)
peer := 4
t.L().Printf("failing n%d to n%d (%s lease/liveness)", node, peer, failer)
failer.FailPartial(ctx, node, []int{peer})
sleepFor(ctx, t, time.Minute)
t.L().Printf("recovering n%d to n%d (%s lease/liveness)", node, peer, failer)
failer.Recover(ctx, node)
}
}
sleepFor(ctx, t, time.Minute) // let cluster recover
return nil
})
m.Wait()
}
// runFailoverNonSystem benchmarks the maximum duration of range unavailability
// following a leaseholder failure with only non-system ranges.
//
// - No system ranges located on the failed node.
//
// - SQL clients do not connect to the failed node.
//
// - The workload consists of individual point reads and writes.
//
// The cluster layout is as follows:
//
// n1-n3: System ranges and SQL gateways.
// n4-n6: Workload ranges.
// n7: Workload runner.
//
// The test runs a kv50 workload via gateways on n1-n3, measuring the pMax
// latency for graphing.
func runFailoverNonSystem(
ctx context.Context, t test.Test, c cluster.Cluster, failureMode failureMode,
) {
require.Equal(t, 7, c.Spec().NodeCount)
rng, _ := randutil.NewTestRand()
// Create cluster.
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
m := c.NewMonitor(ctx, c.CRDBNodes())
failer := makeFailer(t, c, m, failureMode, settings, rng)
failer.Setup(ctx)
defer failer.Cleanup(ctx)
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.CRDBNodes())
conn := c.Conn(ctx, t.L(), 1)
// Constrain all existing zone configs to n1-n3.
configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}})
// Wait for upreplication.
require.NoError(t, WaitFor3XReplication(ctx, t, t.L(), conn))
// Create the kv database, constrained to n4-n6. Despite the zone config, the
// ranges will initially be distributed across all cluster nodes.
t.L().Printf("creating workload database")
_, err := conn.ExecContext(ctx, `CREATE DATABASE kv`)
require.NoError(t, err)
configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{4, 5, 6}})
c.Run(ctx, option.WithNodes(c.Node(7)), `./cockroach workload init kv --splits 1000 {pgurl:1}`)
// The replicate queue takes forever to move the kv ranges from n1-n3 to
// n4-n6, so we do it ourselves. Precreating the database/range and moving it
// to the correct nodes first is not sufficient, since workload will spread
// the ranges across all nodes regardless.
relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3}, []int{4, 5, 6})
// Run workload on n7 via n1-n3 gateways until test ends (context cancels).
t.L().Printf("running workload")
cancelWorkload := m.GoWithCancel(func(ctx context.Context) error {
err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+
`--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+
`--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`)
if ctx.Err() != nil {
return nil // test requested workload shutdown
}
return err
})
// Start a worker to fail and recover n4-n6 in order.
m.Go(func(ctx context.Context) error {
defer cancelWorkload()
for i := 0; i < 3; i++ {
for _, node := range []int{4, 5, 6} {
sleepFor(ctx, t, time.Minute)
// Ranges may occasionally escape their constraints. Move them
// to where they should be.
relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3}, []int{4, 5, 6})
relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{node}, []int{1, 2, 3})
// Randomly sleep up to the lease renewal interval, to vary the time
// between the last lease renewal and the failure.
sleepFor(ctx, t, randutil.RandDuration(rng, rangeLeaseRenewalDuration))
failer.Ready(ctx, node)
t.L().Printf("failing n%d (%s)", node, failer)
failer.Fail(ctx, node)
sleepFor(ctx, t, time.Minute)
t.L().Printf("recovering n%d (%s)", node, failer)
failer.Recover(ctx, node)
}
}
return nil
})
m.Wait()
}
// runFailoverLiveness benchmarks the maximum duration of *user* range
// unavailability following a liveness-only leaseholder failure. When the
// liveness range becomes unavailable, other nodes are unable to heartbeat and
// extend their leases, and their leases may thus expire as well making them
// unavailable.
//
// - Only liveness range located on the failed node, as leaseholder.
//
// - SQL clients do not connect to the failed node.
//
// - The workload consists of individual point reads and writes.
//
// The cluster layout is as follows:
//
// n1-n3: All ranges, including liveness.
// n4: Liveness range leaseholder.
// n5: Workload runner.
//
// The test runs a kv50 workload via gateways on n1-n3, measuring the pMax
// latency for graphing.
func runFailoverLiveness(
ctx context.Context, t test.Test, c cluster.Cluster, failureMode failureMode,
) {
require.Equal(t, 5, c.Spec().NodeCount)
rng, _ := randutil.NewTestRand()
// Create cluster.
settings := install.MakeClusterSettings()
settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
m := c.NewMonitor(ctx, c.CRDBNodes())
failer := makeFailer(t, c, m, failureMode, settings, rng)
failer.Setup(ctx)
defer failer.Cleanup(ctx)
c.Start(ctx, t.L(), failoverStartOpts(), settings, c.CRDBNodes())
conn := c.Conn(ctx, t.L(), 1)
// Constrain all existing zone configs to n1-n3.
configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}})
// Constrain the liveness range to n1-n4, with leaseholder preference on n4.
configureZone(t, ctx, conn, `RANGE liveness`, zoneConfig{replicas: 4, leasePreference: "[+node4]"})
// Wait for upreplication.
require.NoError(t, WaitFor3XReplication(ctx, t, t.L(), conn))
// Create the kv database, constrained to n1-n3. Despite the zone config, the
// ranges will initially be distributed across all cluster nodes.
t.L().Printf("creating workload database")
_, err := conn.ExecContext(ctx, `CREATE DATABASE kv`)
require.NoError(t, err)
configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}})
c.Run(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload init kv --splits 1000 {pgurl:1}`)
// The replicate queue takes forever to move the other ranges off of n4 so we
// do it ourselves. Precreating the database/range and moving it to the
// correct nodes first is not sufficient, since workload will spread the
// ranges across all nodes regardless.
relocateRanges(t, ctx, conn, `range_id != 2`, []int{4}, []int{1, 2, 3})
// We also make sure the lease is located on n4.
relocateLeases(t, ctx, conn, `range_id = 2`, 4)
// Run workload on n5 via n1-n3 gateways until test ends (context cancels).
t.L().Printf("running workload")
cancelWorkload := m.GoWithCancel(func(ctx context.Context) error {
err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+
`--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+
`--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`)
if ctx.Err() != nil {
return nil // test requested workload shutdown
}
return err
})
// Start a worker to fail and recover n4.
m.Go(func(ctx context.Context) error {
defer cancelWorkload()
for i := 0; i < 9; i++ {
sleepFor(ctx, t, time.Minute)
// Ranges and leases may occasionally escape their constraints. Move them
// to where they should be.
relocateRanges(t, ctx, conn, `range_id != 2`, []int{4}, []int{1, 2, 3})
relocateLeases(t, ctx, conn, `range_id = 2`, 4)
// Randomly sleep up to the lease renewal interval, to vary the time
// between the last lease renewal and the failure.
sleepFor(ctx, t, randutil.RandDuration(rng, rangeLeaseRenewalDuration))
failer.Ready(ctx, 4)
t.L().Printf("failing n%d (%s)", 4, failer)
failer.Fail(ctx, 4)
sleepFor(ctx, t, time.Minute)
t.L().Printf("recovering n%d (%s)", 4, failer)
failer.Recover(ctx, 4)
relocateLeases(t, ctx, conn, `range_id = 2`, 4)
}
sleepFor(ctx, t, time.Minute) // let cluster recover
return nil
})
m.Wait()
}
// runFailoverSystemNonLiveness benchmarks the maximum duration of range
// unavailability following a leaseholder failure with only system ranges,
// excluding the liveness range which is tested separately in
// runFailoverLiveness.
//
// - No user or liveness ranges located on the failed node.
//
// - SQL clients do not connect to the failed node.
//
// - The workload consists of individual point reads and writes.
//
// The cluster layout is as follows:
//
// n1-n3: Workload ranges, liveness range, and SQL gateways.
// n4-n6: System ranges excluding liveness.
// n7: Workload runner.
//
// The test runs a kv50 workload via gateways on n1-n3, measuring the pMax
// latency for graphing.