Skip to content

Commit

Permalink
vtctld/vtorc: improve reparenting stats (vitessio#13723)
Browse files Browse the repository at this point in the history
Signed-off-by: Tim Vaillancourt <tim@timvaillancourt.com>
Signed-off-by: Manan Gupta <manan@planetscale.com>
Co-authored-by: Manan Gupta <manan@planetscale.com>
  • Loading branch information
timvaillancourt and GuptaManan100 committed Oct 13, 2023
1 parent a6c860c commit d47c899
Show file tree
Hide file tree
Showing 9 changed files with 670 additions and 19 deletions.
4 changes: 3 additions & 1 deletion go/stats/timings.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings {
return t
}

// Reset will clear histograms: used during testing
// Reset will clear histograms and counters: used during testing
func (t *Timings) Reset() {
t.mu.RLock()
t.histograms = make(map[string]*Histogram)
t.totalCount.Set(0)
t.totalTime.Set(0)
t.mu.RUnlock()
}

Expand Down
6 changes: 6 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func TestSingleKeyspace(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
Expand All @@ -94,6 +95,7 @@ func TestKeyspaceShard(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
Expand All @@ -116,6 +118,7 @@ func TestVTOrcRepairs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

var replica, otherReplica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
Expand Down Expand Up @@ -335,6 +338,7 @@ func TestVTOrcWithPrs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find any replica tablet other than the current primary
var replica *cluster.Vttablet
Expand Down Expand Up @@ -362,7 +366,9 @@ func TestVTOrcWithPrs(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
// Verify that VTOrc didn't run any other recovery
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0)
Expand Down
209 changes: 209 additions & 0 deletions go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package primaryfailure

import (
"fmt"
"path"
"testing"
"time"

Expand Down Expand Up @@ -47,6 +49,7 @@ func TestDownPrimary(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
Expand Down Expand Up @@ -85,6 +88,212 @@ func TestDownPrimary(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// bring down primary before VTOrc has started, let vtorc repair.
func TestDownPrimaryBeforeVTOrc(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
curPrimary := shard0.Vttablets[0]

// Promote the first tablet as the primary
err := clusterInfo.ClusterInstance.VtctlclientProcess.InitializeShard(keyspace.Name, shard0.Name, clusterInfo.ClusterInstance.Cell, curPrimary.TabletUID)
require.NoError(t, err)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
// we know we have only two replcia tablets, so the one not the primary must be the other replica
if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
replica = tablet
}
if tablet.Type == "rdonly" {
rdonly = tablet
}
}
assert.NotNil(t, replica, "could not find replica tablet")
assert.NotNil(t, rdonly, "could not find rdonly tablet")

// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second)

// Make the current primary vttablet unavailable.
_ = curPrimary.VttabletProcess.TearDown()
err = curPrimary.MysqlctlProcess.Stop()
require.NoError(t, err)

// Start a VTOrc instance
utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1)

vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]

defer func() {
// we remove the tablet from our global list
utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
}()

// check that the replica gets promoted
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)

// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// delete the primary record and let vtorc repair.
func TestDeletedPrimaryTablet(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{}, 1, "none")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
// find primary from topo
curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
// we know we have only two replcia tablets, so the one not the primary must be the other replica
if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
replica = tablet
}
if tablet.Type == "rdonly" {
rdonly = tablet
}
}
assert.NotNil(t, replica, "could not find replica tablet")
assert.NotNil(t, rdonly, "could not find rdonly tablet")

// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second)

// Disable VTOrc recoveries
vtOrcProcess.DisableGlobalRecoveries(t)

Check failure on line 182 in go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

vtOrcProcess.DisableGlobalRecoveries undefined (type *cluster.VTOrcProcess has no field or method DisableGlobalRecoveries) (typecheck)
// use vtctlclient to stop replication on the replica
_, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias)
require.NoError(t, err)
// insert a write that is not available on the replica.
utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly}, 10*time.Second)

// Make the current primary vttablet unavailable and delete its tablet record.
_ = curPrimary.VttabletProcess.TearDown()
err = curPrimary.MysqlctlProcess.Stop()
require.NoError(t, err)
// use vtctlclient to start replication on the replica back
_, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StartReplication", replica.Alias)
require.NoError(t, err)
err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", curPrimary.Alias)
require.NoError(t, err)
// Enable VTOrc recoveries now
vtOrcProcess.EnableGlobalRecoveries(t)

Check failure on line 199 in go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

vtOrcProcess.EnableGlobalRecoveries undefined (type *cluster.VTOrcProcess has no field or method EnableGlobalRecoveries) (typecheck)

defer func() {
// we remove the tablet from our global list
utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
}()

// check that the replica gets promoted. Also verify that it has all the writes.
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
utils.CheckTabletUptoDate(t, clusterInfo, replica)

Check failure on line 208 in go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

CheckTabletUptoDate not declared by package utils (typecheck)

// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1)

Check failure on line 212 in go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

RecoverPrimaryTabletDeletedRecoveryName not declared by package logic (typecheck)
}

// TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead.
// Reason is, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know
// that primary is unreachable. This help us save few seconds depending on value of `RemoteOperationTimeout` flag.
func TestDeadPrimaryRecoversImmediately(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
// If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source.
// If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test.
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1, "semi_sync")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
// find primary from topo
curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
// we know we have only two replcia tablets, so the one not the primary must be the other replica
if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
replica = tablet
}
if tablet.Type == "rdonly" {
rdonly = tablet
}
}
assert.NotNil(t, replica, "could not find replica tablet")
assert.NotNil(t, rdonly, "could not find rdonly tablet")

// Start a cross-cell replica
crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)

// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second)

// Make the current primary vttablet unavailable.
curPrimary.VttabletProcess.Kill()

Check failure on line 257 in go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

curPrimary.VttabletProcess.Kill undefined (type *cluster.VttabletProcess has no field or method Kill) (typecheck)
err := curPrimary.MysqlctlProcess.Stop()
require.NoError(t, err)
defer func() {
// we remove the tablet from our global list
utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
}()

// check that the replica gets promoted
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 2, false)
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// Parse log file and find out how much time it took for DeadPrimary to recover.
logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName)
// log prefix printed at the end of analysis where we conclude we have DeadPrimary
t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery")

Check failure on line 276 in go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

undeclared name: `extractTimeFromLog` (typecheck)
// log prefix printed at the end of recovery
t2 := extractTimeFromLog(t, logFile, "auditType:RecoverDeadPrimary")

Check failure on line 278 in go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

undeclared name: `extractTimeFromLog` (typecheck)
curr := time.Now().Format("2006-01-02")
timeLayout := "2006-01-02 15:04:05.000000"
timeStr1 := fmt.Sprintf("%s %s", curr, t1)
timeStr2 := fmt.Sprintf("%s %s", curr, t2)
time1, err := time.Parse(timeLayout, timeStr1)
if err != nil {
t.Errorf("unable to parse time %s", err.Error())
}
time2, err := time.Parse(timeLayout, timeStr2)
if err != nil {
t.Errorf("unable to parse time %s", err.Error())
}
diff := time2.Sub(time1)
fmt.Printf("The difference between %s and %s is %v seconds.\n", t1, t2, diff.Seconds())
// assert that it takes less than `remote_operation_timeout` to recover from `DeadPrimary`
// use the value provided in `remote_operation_timeout` flag to compare with.
// We are testing against 9.5 seconds to be safe and prevent flakiness.
assert.Less(t, diff.Seconds(), 9.5)
}

// Failover should not be cross data centers, according to the configuration file
Expand Down
Loading

0 comments on commit d47c899

Please sign in to comment.