From 64a8f46b471619661e695998503efabb2979cc84 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 17:29:13 +0530 Subject: [PATCH 1/3] Cherry-pick 888df9228e94b2fad4e57af0f55ce68f8236bf25 with conflicts --- go/test/endtoend/vtorc/general/vtorc_test.go | 9 + .../primaryfailure/primary_failure_test.go | 154 +++++- .../vtorc/readtopologyinstance/main_test.go | 3 +- go/test/endtoend/vtorc/utils/utils.go | 57 ++ go/vt/vtorc/inst/instance_dao.go | 7 + go/vt/vtorc/inst/instance_dao_test.go | 497 ++++++++++++++++++ 6 files changed, 725 insertions(+), 2 deletions(-) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 4254606dd94..c0a845a5699 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -37,6 +37,7 @@ import ( // verify replication is setup // verify that with multiple vtorc instances, we still only have 1 PlannedReparentShard call func TestPrimaryElection(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -64,6 +65,7 @@ func TestPrimaryElection(t *testing.T) { // verify rdonly is not elected, only replica // verify replication is setup func TestSingleKeyspace(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -81,6 +83,7 @@ func TestSingleKeyspace(t *testing.T) { // verify rdonly is not elected, only replica // verify replication is setup func TestKeyspaceShard(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -100,6 +103,7 @@ func TestKeyspaceShard(t *testing.T) { // 4. setup replication from non-primary, let vtorc repair // 5. make instance A replicates from B and B from A, wait for repair func TestVTOrcRepairs(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -216,6 +220,7 @@ func TestVTOrcRepairs(t *testing.T) { func TestRepairAfterTER(t *testing.T) { // test fails intermittently on CI, skip until it can be fixed. t.SkipNow() + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -252,6 +257,7 @@ func TestSemiSync(t *testing.T) { // stop any vtorc instance running due to a previous test. 
utils.StopVTOrcs(t, clusterInfo) newCluster := utils.SetupNewClusterSemiSync(t) + defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance) utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, }, 1) @@ -316,6 +322,7 @@ func TestSemiSync(t *testing.T) { // TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld func TestVTOrcWithPrs(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -364,6 +371,7 @@ func TestVTOrcWithPrs(t *testing.T) { // TestMultipleDurabilities tests that VTOrc works with 2 keyspaces having 2 different durability policies func TestMultipleDurabilities(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) // Setup a normal cluster and start vtorc utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{}, 1, "") @@ -388,6 +396,7 @@ func TestDurabilityPolicySetLater(t *testing.T) { // stop any vtorc instance running due to a previous test. utils.StopVTOrcs(t, clusterInfo) newCluster := utils.SetupNewClusterSemiSync(t) + defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance) keyspace := &newCluster.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] // Before starting VTOrc we explicity want to set the durability policy of the keyspace to an empty string diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index 8e91028926c..804da2beb1e 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -32,8 +32,12 @@ import ( // covers the test case master-failover from orchestrator // Also tests that VTOrc can handle multiple failures, if the durability policies allow it func TestDownPrimary(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) - utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ + // We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test. + // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. + // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, }, 1, "semi_sync") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] @@ -87,9 +91,150 @@ func TestDownPrimary(t *testing.T) { utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) } +<<<<<<< HEAD +======= +// bring down primary before VTOrc has started, let vtorc repair. 
+func TestDownPrimaryBeforeVTOrc(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + curPrimary := shard0.Vttablets[0] + + // Promote the first tablet as the primary + err := clusterInfo.ClusterInstance.VtctlclientProcess.InitializeShard(keyspace.Name, shard0.Name, clusterInfo.ClusterInstance.Cell, curPrimary.TabletUID) + require.NoError(t, err) + + // find the replica and rdonly tablets + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replica tablets, so the one not the primary must be the other replica + if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second) + + // Make the current primary vttablet unavailable. + _ = curPrimary.VttabletProcess.TearDown() + err = curPrimary.MysqlctlProcess.Stop() + require.NoError(t, err) + + // Start a VTOrc instance + utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1) + + vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] + + defer func() { + // we remove the tablet from our global list + utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) + }() + + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + + // also check that the replication is working correctly after failover + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) +} + +// TestDeadPrimaryRecoversImmediately tests VTOrc's ability to recover immediately if the primary is dead. +// The reason is that, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance, since we know +// that the primary is unreachable. This helps us save a few seconds, depending on the value of the `RemoteOperationTimeout` flag. +func TestDeadPrimaryRecoversImmediately(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) + defer cluster.PanicHandler(t) + // We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test. + // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. + // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test.
+ utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ + PreventCrossDataCenterPrimaryFailover: true, + }, 1, "semi_sync") + keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] + shard0 := &keyspace.Shards[0] + // find primary from topo + curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0) + assert.NotNil(t, curPrimary, "should have elected a primary") + vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1) + + // find the replica and rdonly tablets + var replica, rdonly *cluster.Vttablet + for _, tablet := range shard0.Vttablets { + // we know we have only two replica tablets, so the one not the primary must be the other replica + if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" { + replica = tablet + } + if tablet.Type == "rdonly" { + rdonly = tablet + } + } + assert.NotNil(t, replica, "could not find replica tablet") + assert.NotNil(t, rdonly, "could not find rdonly tablet") + + // Start a cross-cell replica + crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false) + + // check that the replication is setup correctly before we failover + utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second) + + // Make the current primary vttablet unavailable. + curPrimary.VttabletProcess.Kill() + err := curPrimary.MysqlctlProcess.Stop() + require.NoError(t, err) + defer func() { + // we remove the tablet from our global list + utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary) + }() + + // check that the replica gets promoted + utils.CheckPrimaryTablet(t, clusterInfo, replica, true) + utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 2, false) + // also check that the replication is working correctly after failover + utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second) + utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1) + + // Parse log file and find out how much time it took for DeadPrimary to recover. + logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName) + // log prefix printed at the end of analysis where we conclude we have DeadPrimary + t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery") + // log prefix printed at the end of recovery + t2 := extractTimeFromLog(t, logFile, "auditType:recover-dead-primary") + curr := time.Now().Format("2006-01-02") + timeLayout := "2006-01-02 15:04:05.000000" + timeStr1 := fmt.Sprintf("%s %s", curr, t1) + timeStr2 := fmt.Sprintf("%s %s", curr, t2) + time1, err := time.Parse(timeLayout, timeStr1) + if err != nil { + t.Errorf("unable to parse time %s", err.Error()) + } + time2, err := time.Parse(timeLayout, timeStr2) + if err != nil { + t.Errorf("unable to parse time %s", err.Error()) + } + diff := time2.Sub(time1) + fmt.Printf("The difference between %s and %s is %v seconds.\n", t1, t2, diff.Seconds()) + // assert that it takes less than `remote_operation_timeout` to recover from `DeadPrimary` + // use the value provided in the `remote_operation_timeout` flag to compare with. + // We are testing against 9.5 seconds to be safe and prevent flakiness.
+ assert.Less(t, diff.Seconds(), 9.5) +} + +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) // Failover should not be cross data centers, according to the configuration file // covers part of the test case master-failover-lost-replicas from orchestrator func TestCrossDataCenterFailure(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -135,6 +280,7 @@ func TestCrossDataCenterFailure(t *testing.T) { // Failover should not be cross data centers, according to the configuration file // In case of no viable candidates, we should error out func TestCrossDataCenterFailureError(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -181,6 +327,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) { // Earlier any replicas that were not able to replicate from the previous primary // were detected by vtorc and could be configured to have their sources detached t.Skip() + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, @@ -262,6 +409,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) { // This test checks that the promotion of a tablet succeeds if it passes the promotion lag test // covers the test case master-failover-fail-promotion-lag-minutes-success from orchestrator func TestPromotionLagSuccess(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ ReplicationLagQuery: "select 59", @@ -311,6 +459,7 @@ func TestPromotionLagFailure(t *testing.T) { // Earlier vtorc used to check that the promotion lag between the new primary and the old one // was smaller than the configured value, otherwise it would fail the promotion t.Skip() + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 1, nil, cluster.VTOrcConfiguration{ ReplicationLagQuery: "select 61", @@ -363,6 +512,7 @@ func TestPromotionLagFailure(t *testing.T) { // We explicitly set one of the replicas to Prefer promotion rule. // That is the replica which should be promoted in case of primary failure func TestDownPrimaryPromotionRule(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ LockShardTimeoutSeconds: 5, @@ -410,6 +560,7 @@ func TestDownPrimaryPromotionRule(t *testing.T) { // That is the replica which should be promoted in case of primary failure // It should also be caught up when it is promoted func TestDownPrimaryPromotionRuleWithLag(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ LockShardTimeoutSeconds: 5, @@ -489,6 +640,7 @@ func TestDownPrimaryPromotionRuleWithLag(t *testing.T) { // We let a replica in our own cell lag. 
That is the replica which should be promoted in case of primary failure // It should also be caught up when it is promoted func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) { + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ LockShardTimeoutSeconds: 5, diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go index af05dbadc54..871f77bad8b 100644 --- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go +++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go @@ -40,6 +40,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { defer func() { clusterInfo.ClusterInstance.Teardown() }() + defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] oldArgs := os.Args @@ -146,7 +147,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { assert.Equal(t, replicaInstance.ReadBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile) assert.Greater(t, replicaInstance.ReadBinlogCoordinates.LogPos, int64(0)) assert.Equal(t, replicaInstance.ExecBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile) - assert.LessOrEqual(t, replicaInstance.ExecBinlogCoordinates.LogPos, replicaInstance.ReadBinlogCoordinates.LogPos) + assert.Greater(t, replicaInstance.ExecBinlogCoordinates.LogPos, uint32(0)) assert.Contains(t, replicaInstance.RelaylogCoordinates.LogFile, fmt.Sprintf("vt-0000000%d-relay", replica.TabletUID)) assert.Greater(t, replicaInstance.RelaylogCoordinates.LogPos, int64(0)) assert.Empty(t, replicaInstance.LastIOError) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index d4f23c0de70..efdb7b7b47f 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -946,3 +946,60 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr successCount := successfulRecoveriesMap[recoveryName] assert.EqualValues(t, countExpected, successCount) } +<<<<<<< HEAD +======= + +// WaitForInstancePollSecondsExceededCount waits for 30 seconds and then queries api/aggregated-discovery-metrics. +// It expects to find minimum occurrence or exact count of `keyName` provided. 
+func WaitForInstancePollSecondsExceededCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyName string, minCountExpected float64, enforceEquality bool) { + t.Helper() + var sinceInSeconds = 30 + duration := time.Duration(sinceInSeconds) + time.Sleep(duration * time.Second) + + statusCode, res, err := vtorcInstance.MakeAPICall("api/aggregated-discovery-metrics?seconds=" + strconv.Itoa(sinceInSeconds)) + if err != nil { + assert.Fail(t, "Not able to call api/aggregated-discovery-metrics") + } + if statusCode == 200 { + resultMap := make(map[string]any) + err := json.Unmarshal([]byte(res), &resultMap) + if err != nil { + assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") + } + successCount := resultMap[keyName] + if iSuccessCount, ok := successCount.(float64); ok { + if enforceEquality { + assert.Equal(t, iSuccessCount, minCountExpected) + } else { + assert.GreaterOrEqual(t, iSuccessCount, minCountExpected) + } + return + } + } + assert.Fail(t, "invalid response from api/aggregated-discovery-metrics") +} + +// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test. +// This function is supposed to be called as the first defer command from the vtorc tests. +func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) { + // If the test has not failed, then we don't need to print anything. + if !t.Failed() { + return + } + + log.Errorf("Printing VTOrc logs") + for _, vtorc := range clusterInstance.VTOrcProcesses { + if vtorc == nil || vtorc.LogFileName == "" { + continue + } + filePath := path.Join(vtorc.LogDir, vtorc.LogFileName) + log.Errorf("Printing file - %s", filePath) + content, err := os.ReadFile(filePath) + if err != nil { + log.Errorf("Error while reading the file - %v", err) + } + log.Errorf("%s", string(content)) + } +} +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index a799e4e3cb4..75a47b161cb 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -25,6 +25,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/patrickmn/go-cache" @@ -80,6 +81,7 @@ var backendWrites = collection.CreateOrReturnCollection("BACKEND_WRITES") var writeBufferLatency = stopwatch.NewNamedStopwatch() var emptyQuotesRegexp = regexp.MustCompile(`^""$`) +var cacheInitializationCompleted atomic.Bool func init() { _ = metrics.Register("instance.access_denied", accessDeniedCounter) @@ -94,7 +96,12 @@ func init() { func initializeInstanceDao() { config.WaitForConfigurationToBeLoaded() +<<<<<<< HEAD forgetInstanceKeys = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) +======= + forgetAliases = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) + cacheInitializationCompleted.Store(true) +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) } // ExecDBWriteFunc chooses how to execute a write onto the database: whether synchronuously or not diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index 71d0ed94ff9..f7891418133 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -155,3 +155,500 @@ func TestGetKeyspaceShardName(t *testing.T) { require.Equal(t, ks, keyspaceRead) require.Equal(t, shard, shardRead) } +<<<<<<< HEAD +======= + +// TestReadInstance is used to test the functionality of ReadInstance and verify its failure modes and 
successes. +func TestReadInstance(t *testing.T) { + tests := []struct { + name string + tabletAliasToRead string + instanceFound bool + }{ + { + name: "Read success", + tabletAliasToRead: "zone1-0000000100", + instanceFound: true, + }, { + name: "Unknown tablet", + tabletAliasToRead: "unknown-tablet", + instanceFound: false, + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + got, found, err := ReadInstance(tt.tabletAliasToRead) + require.NoError(t, err) + require.Equal(t, tt.instanceFound, found) + if tt.instanceFound { + require.EqualValues(t, tt.tabletAliasToRead, got.InstanceAlias) + } + }) + } +} + +// TestReadReplicaInstances is used to test the functionality of ReadReplicaInstances and verify its failure modes and successes. +func TestReadReplicaInstances(t *testing.T) { + tests := []struct { + name string + tabletPort int + replicasLen int + }{ + { + name: "Read success - Multiple replicas", + // This tabletPort corresponds to zone1-0000000101. That is the primary for the data inserted. + // Check initialSQL for more details. + tabletPort: 6714, + replicasLen: 3, + }, { + name: "Unknown tablet", + // This tabletPort corresponds to none of the tablets. + // Check initialSQL for more details. + tabletPort: 343, + replicasLen: 0, + }, { + name: "Read success - No replicas", + // This tabletPort corresponds to zone1-0000000100. That is a replica tablet, with no replicas of its own. + // Check initialSQL for more details. + tabletPort: 6711, + replicasLen: 0, + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + instances, err := ReadReplicaInstances("localhost", tt.tabletPort) + require.NoError(t, err) + require.EqualValues(t, tt.replicasLen, len(instances)) + }) + } +} + +// TestReadProblemInstances is used to test the functionality of ReadProblemInstances and verify its failure modes and successes. +func TestReadProblemInstances(t *testing.T) { + // The test is intended to be used as follows. The initial data is stored into the database. Following this, some specific queries are run that each individual test specifies to get the desired state. 
+ tests := []struct { + name string + sql []string + instancesRequired []string + }{ + { + name: "No problems", + sql: nil, + instancesRequired: nil, + }, { + name: "Replication stopped on a replica", + sql: []string{ + "update database_instance set replication_sql_thread_state = 0 where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "IO thread stopped on a replica", + sql: []string{ + "update database_instance set replication_io_thread_state = 0 where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "High replication lag", + sql: []string{ + "update database_instance set replication_lag_seconds = 1000 where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "High replication lag - replica_lag", + sql: []string{ + "update database_instance set replica_lag_seconds = 1000 where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "errant GTID", + sql: []string{ + "update database_instance set gtid_errant = '729a4cc4-8680-11ed-a104-47706090afbd:1' where alias = 'zone1-0000000112'", + }, + instancesRequired: []string{"zone1-0000000112"}, + }, { + name: "Many failures", + sql: []string{ + "update database_instance set gtid_errant = '729a4cc4-8680-11ed-a104-47706090afbd:1' where alias = 'zone1-0000000112'", + "update database_instance set replication_sql_thread_state = 0 where alias = 'zone1-0000000100'", + }, + instancesRequired: []string{"zone1-0000000112", "zone1-0000000100"}, + }, + } + + // We need to set InstancePollSeconds to a large value otherwise all the instances are reported as having problems since their last_checked is very old. + // Setting this value to a hundred years, we ensure that this test doesn't fail with this issue for the next hundred years. + oldVal := config.Config.InstancePollSeconds + defer func() { + config.Config.InstancePollSeconds = oldVal + }() + config.Config.InstancePollSeconds = 60 * 60 * 24 * 365 * 100 + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Each test should clear the database. The easiest way to do that is to run all the initialization commands again + defer func() { + db.ClearVTOrcDatabase() + }() + + for _, query := range append(initialSQL, tt.sql...) { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + instances, err := ReadProblemInstances("ks", "0") + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.ElementsMatch(t, tabletAliases, tt.instancesRequired) + }) + } +} + +// TestReadInstancesByCondition is used to test the functionality of readInstancesByCondition and verify its failure modes and successes. 
+func TestReadInstancesByCondition(t *testing.T) { + tests := []struct { + name string + condition string + args []any + sort string + instancesRequired []string + }{ + { + name: "All instances with no sort", + condition: "1=1", + instancesRequired: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, + }, { + name: "All instances sort by data_center descending and then alias ascending", + condition: "1=1", + sort: "data_center desc, alias asc", + instancesRequired: []string{"zone2-0000000200", "zone1-0000000100", "zone1-0000000101", "zone1-0000000112"}, + }, { + name: "Filtering by replication_depth", + condition: "replication_depth=1", + instancesRequired: []string{"zone1-0000000100", "zone1-0000000112", "zone2-0000000200"}, + }, { + name: "Filtering by exact alias", + condition: "alias='zone1-0000000100'", + instancesRequired: []string{"zone1-0000000100"}, + }, { + name: "No qualifying tablets", + condition: "replication_depth=15", + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + instances, err := readInstancesByCondition(tt.condition, tt.args, tt.sort) + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.EqualValues(t, tt.instancesRequired, tabletAliases) + }) + } +} + +// TestReadOutdatedInstanceKeys is used to test the functionality of ReadOutdatedInstanceKeys and verify its failure modes and successes. +func TestReadOutdatedInstanceKeys(t *testing.T) { + // The test is intended to be used as follows. The initial data is stored into the database. Following this, some specific queries are run that each individual test specifies to get the desired state. + tests := []struct { + name string + sql []string + instancesRequired []string + }{ + { + name: "No problems", + sql: []string{"update database_instance set last_checked = now()"}, + instancesRequired: nil, + }, { + name: "One instance is outdated", + sql: []string{ + "update database_instance set last_checked = now()", + "update database_instance set last_checked = time(now(), '-1 hour') where alias = 'zone1-0000000100'", + }, + instancesRequired: []string{"zone1-0000000100"}, + }, { + name: "One instance doesn't have mysql data", + sql: []string{ + "update database_instance set last_checked = now()", + `INSERT INTO vitess_tablet VALUES('zone1-0000000103','localhost',7706,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00','');`, + }, + instancesRequired: []string{"zone1-0000000103"}, + }, { + name: "One instance doesn't have mysql data and one is outdated", + sql: []string{ + "update database_instance set last_checked = now()", + "update database_instance set last_checked = time(now(), '-1 hour') where alias = 'zone1-0000000100'", + `INSERT INTO vitess_tablet VALUES('zone1-0000000103','localhost',7706,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00','');`, + }, + instancesRequired: []string{"zone1-0000000103", "zone1-0000000100"}, + }, + } + + // wait for the forgetAliases cache to be initialized to prevent data race. + waitForCacheInitialization() + + // We are setting InstancePollSeconds to 59 minutes, just for the test.
+ oldVal := config.Config.InstancePollSeconds + oldCache := forgetAliases + defer func() { + forgetAliases = oldCache + config.Config.InstancePollSeconds = oldVal + }() + config.Config.InstancePollSeconds = 60 * 59 + forgetAliases = cache.New(time.Minute, time.Minute) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Each test should clear the database. The easiest way to do that is to run all the initialization commands again + defer func() { + db.ClearVTOrcDatabase() + }() + + for _, query := range append(initialSQL, tt.sql...) { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + tabletAliases, err := ReadOutdatedInstanceKeys() + require.NoError(t, err) + require.ElementsMatch(t, tabletAliases, tt.instancesRequired) + }) + } +} + +// TestUpdateInstanceLastChecked is used to test the functionality of UpdateInstanceLastChecked and verify its failure modes and successes. +func TestUpdateInstanceLastChecked(t *testing.T) { + tests := []struct { + name string + tabletAlias string + partialSuccess bool + conditionToCheck string + }{ + { + name: "Verify updated last checked", + tabletAlias: "zone1-0000000100", + partialSuccess: false, + conditionToCheck: "last_checked >= now() - interval 30 second and last_check_partial_success = false", + }, { + name: "Verify partial success", + tabletAlias: "zone1-0000000100", + partialSuccess: true, + conditionToCheck: "last_checked >= now() - interval 30 second and last_check_partial_success = true", + }, { + name: "Verify no error on unknown tablet", + tabletAlias: "unknown tablet", + partialSuccess: true, + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := UpdateInstanceLastChecked(tt.tabletAlias, tt.partialSuccess) + require.NoError(t, err) + + if tt.conditionToCheck != "" { + // Verify the instance we just updated satisfies the condition specified. + instances, err := readInstancesByCondition(tt.conditionToCheck, nil, "") + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.Contains(t, tabletAliases, tt.tabletAlias) + } + }) + } +} + +// UpdateInstanceLastAttemptedCheck is used to test the functionality of UpdateInstanceLastAttemptedCheck and verify its failure modes and successes. +func TestUpdateInstanceLastAttemptedCheck(t *testing.T) { + tests := []struct { + name string + tabletAlias string + conditionToCheck string + }{ + { + name: "Verify updated last checked", + tabletAlias: "zone1-0000000100", + conditionToCheck: "last_attempted_check >= now() - interval 30 second", + }, { + name: "Verify no error on unknown tablet", + tabletAlias: "unknown tablet", + }, + } + + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := UpdateInstanceLastAttemptedCheck(tt.tabletAlias) + require.NoError(t, err) + + if tt.conditionToCheck != "" { + // Verify the instance we just updated satisfies the condition specified. 
+ instances, err := readInstancesByCondition(tt.conditionToCheck, nil, "") + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.Contains(t, tabletAliases, tt.tabletAlias) + } + }) + } +} + +// TestForgetInstanceAndInstanceIsForgotten tests the functionality of ForgetInstance and InstanceIsForgotten together. +func TestForgetInstanceAndInstanceIsForgotten(t *testing.T) { + tests := []struct { + name string + tabletAlias string + errExpected string + instanceForgotten bool + tabletsExpected []string + }{ + { + name: "Unknown tablet", + tabletAlias: "unknown-tablet", + errExpected: "ForgetInstance(): tablet unknown-tablet not found", + instanceForgotten: true, + tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, + }, { + name: "Empty tabletAlias", + tabletAlias: "", + errExpected: "ForgetInstance(): empty tabletAlias", + instanceForgotten: false, + tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, + }, { + name: "Success", + tabletAlias: "zone1-0000000112", + instanceForgotten: true, + tabletsExpected: []string{"zone1-0000000100", "zone1-0000000101", "zone2-0000000200"}, + }, + } + + // wait for the forgetAliases cache to be initialized to prevent data race. + waitForCacheInitialization() + + oldCache := forgetAliases + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + forgetAliases = oldCache + db.ClearVTOrcDatabase() + }() + forgetAliases = cache.New(time.Minute, time.Minute) + + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := ForgetInstance(tt.tabletAlias) + if tt.errExpected != "" { + require.EqualError(t, err, tt.errExpected) + } else { + require.NoError(t, err) + } + isForgotten := InstanceIsForgotten(tt.tabletAlias) + require.Equal(t, tt.instanceForgotten, isForgotten) + + instances, err := readInstancesByCondition("1=1", nil, "") + require.NoError(t, err) + var tabletAliases []string + for _, instance := range instances { + tabletAliases = append(tabletAliases, instance.InstanceAlias) + } + require.EqualValues(t, tt.tabletsExpected, tabletAliases) + }) + } +} + +func TestSnapshotTopologies(t *testing.T) { + // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. + defer func() { + db.ClearVTOrcDatabase() + }() + + for _, query := range initialSQL { + _, err := db.ExecVTOrc(query) + require.NoError(t, err) + } + + err := SnapshotTopologies() + require.NoError(t, err) + + query := "select alias from database_instance_topology_history" + var tabletAliases []string + err = db.QueryVTOrc(query, nil, func(rowMap sqlutils.RowMap) error { + tabletAliases = append(tabletAliases, rowMap.GetString("alias")) + return nil + }) + require.NoError(t, err) + + require.Equal(t, []string{"zone1-0000000100", "zone1-0000000101", "zone1-0000000112", "zone2-0000000200"}, tabletAliases) +} + +// waitForCacheInitialization waits for the cache to be initialized to prevent data race in tests +// that alter the cache or depend on its behaviour. 
+func waitForCacheInitialization() { + for { + if cacheInitializationCompleted.Load() { + return + } + time.Sleep(100 * time.Millisecond) + } +} +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) From e9d6d312c2d0688e61f7c3e2e96bc1cbfd909527 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 17:29:13 +0530 Subject: [PATCH 2/3] Cherry-pick 888df9228e94b2fad4e57af0f55ce68f8236bf25 with conflicts --- go/test/endtoend/vtorc/utils/utils.go | 3 +++ go/vt/vtorc/inst/instance_dao.go | 3 +++ go/vt/vtorc/inst/instance_dao_test.go | 3 +++ 3 files changed, 9 insertions(+) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index efdb7b7b47f..65159e68812 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -1002,4 +1002,7 @@ func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcess log.Errorf("%s", string(content)) } } +<<<<<<< HEAD +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) +======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index 75a47b161cb..c94970c559d 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -101,6 +101,9 @@ func initializeInstanceDao() { ======= forgetAliases = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) cacheInitializationCompleted.Store(true) +<<<<<<< HEAD +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) +======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) } diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index f7891418133..ae497a65fb6 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -651,4 +651,7 @@ func waitForCacheInitialization() { time.Sleep(100 * time.Millisecond) } } +<<<<<<< HEAD +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) +======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) From b20286c05cf117cd22a93adbd83a01b56dd01437 Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Tue, 18 Jul 2023 17:29:13 +0530 Subject: [PATCH 3/3] Cherry-pick 888df9228e94b2fad4e57af0f55ce68f8236bf25 with conflicts --- go/test/endtoend/vtorc/utils/utils.go | 3 +++ go/vt/vtorc/inst/instance_dao.go | 3 +++ go/vt/vtorc/inst/instance_dao_test.go | 3 +++ 3 files changed, 9 insertions(+) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 65159e68812..76d177b979e 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -1003,6 +1003,9 @@ func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcess } } <<<<<<< HEAD +<<<<<<< HEAD +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) +======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) ======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index c94970c559d..5dcdc7a7807 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -102,6 +102,9 @@ func initializeInstanceDao() { forgetAliases = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) cacheInitializationCompleted.Store(true) <<<<<<< HEAD +<<<<<<< 
HEAD +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) +======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) ======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index ae497a65fb6..c58b9e375d5 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -652,6 +652,9 @@ func waitForCacheInitialization() { } } <<<<<<< HEAD +<<<<<<< HEAD +>>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) +======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489)) ======= >>>>>>> 888df9228e (Fix flakiness in VTOrc tests (#13489))
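The patch series above guards the new DAO tests with an initialization flag: initializeInstanceDao sets cacheInitializationCompleted after creating the forgetAliases cache, and waitForCacheInitialization polls that flag before the tests touch the cache. Below is a minimal, self-contained Go sketch of that pattern; the identifiers are illustrative stand-ins, not the Vitess names, and this is a sketch of the idea rather than the project's implementation.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// initDone plays the role of cacheInitializationCompleted: a lock-free flag
// recording whether the one-time setup has finished.
var initDone atomic.Bool

// initialize stands in for initializeInstanceDao: build the caches, then flip
// the flag so that waiting goroutines can safely use them.
func initialize() {
	// ... construct caches here ...
	initDone.Store(true)
}

// waitForInitialization mirrors waitForCacheInitialization: poll the flag
// instead of racing on the cache variables themselves.
func waitForInitialization() {
	for !initDone.Load() {
		time.Sleep(100 * time.Millisecond)
	}
}

func main() {
	go initialize()
	waitForInitialization()
	fmt.Println("initialization complete")
}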