Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-16.0]: Fix flakiness in VTOrc tests (#13489) #13528

Merged
merged 2 commits into from
Jul 19, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions go/test/endtoend/cluster/vtorc_process.go
Original file line number Diff line number Diff line change
@@ -37,14 +37,15 @@ import (
// vtorc as a separate process for testing
type VTOrcProcess struct {
VtctlProcess
Port int
LogDir string
ExtraArgs []string
ConfigPath string
Config VTOrcConfiguration
WebPort int
proc *exec.Cmd
exit chan error
Port int
LogDir string
LogFileName string
ExtraArgs []string
ConfigPath string
Config VTOrcConfiguration
WebPort int
proc *exec.Cmd
exit chan error
}

type VTOrcConfiguration struct {
@@ -124,7 +125,10 @@ func (orc *VTOrcProcess) Setup() (err error) {
orc.proc.Args = append(orc.proc.Args, orc.ExtraArgs...)
orc.proc.Args = append(orc.proc.Args, "--alsologtostderr")

errFile, _ := os.Create(path.Join(orc.LogDir, fmt.Sprintf("orc-stderr-%d.txt", timeNow)))
if orc.LogFileName == "" {
orc.LogFileName = fmt.Sprintf("orc-stderr-%d.txt", timeNow)
}
errFile, _ := os.Create(path.Join(orc.LogDir, orc.LogFileName))
orc.proc.Stderr = errFile

orc.proc.Env = append(orc.proc.Env, os.Environ()...)
9 changes: 9 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
@@ -37,6 +37,7 @@ import (
// verify replication is setup
// verify that with multiple vtorc instances, we still only have 1 PlannedReparentShard call
func TestPrimaryElection(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -64,6 +65,7 @@ func TestPrimaryElection(t *testing.T) {
// verify rdonly is not elected, only replica
// verify replication is setup
func TestSingleKeyspace(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -81,6 +83,7 @@ func TestSingleKeyspace(t *testing.T) {
// verify rdonly is not elected, only replica
// verify replication is setup
func TestKeyspaceShard(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -100,6 +103,7 @@ func TestKeyspaceShard(t *testing.T) {
// 4. setup replication from non-primary, let vtorc repair
// 5. make instance A replicates from B and B from A, wait for repair
func TestVTOrcRepairs(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -216,6 +220,7 @@ func TestVTOrcRepairs(t *testing.T) {
func TestRepairAfterTER(t *testing.T) {
// test fails intermittently on CI, skip until it can be fixed.
t.SkipNow()
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -252,6 +257,7 @@ func TestSemiSync(t *testing.T) {
// stop any vtorc instance running due to a previous test.
utils.StopVTOrcs(t, clusterInfo)
newCluster := utils.SetupNewClusterSemiSync(t)
defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1)
@@ -316,6 +322,7 @@ func TestSemiSync(t *testing.T) {

// TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld
func TestVTOrcWithPrs(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -364,6 +371,7 @@ func TestVTOrcWithPrs(t *testing.T) {

// TestMultipleDurabilities tests that VTOrc works with 2 keyspaces having 2 different durability policies
func TestMultipleDurabilities(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
// Setup a normal cluster and start vtorc
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{}, 1, "")
@@ -388,6 +396,7 @@ func TestDurabilityPolicySetLater(t *testing.T) {
// stop any vtorc instance running due to a previous test.
utils.StopVTOrcs(t, clusterInfo)
newCluster := utils.SetupNewClusterSemiSync(t)
defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
keyspace := &newCluster.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
// Before starting VTOrc we explicity want to set the durability policy of the keyspace to an empty string
14 changes: 13 additions & 1 deletion go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
Original file line number Diff line number Diff line change
@@ -32,8 +32,12 @@ import (
// covers the test case master-failover from orchestrator
// Also tests that VTOrc can handle multiple failures, if the durability policies allow it
func TestDownPrimary(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{
// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
// If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source.
// If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test.
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1, "semi_sync")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
@@ -90,6 +94,7 @@ func TestDownPrimary(t *testing.T) {
// Failover should not be cross data centers, according to the configuration file
// covers part of the test case master-failover-lost-replicas from orchestrator
func TestCrossDataCenterFailure(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -135,6 +140,7 @@ func TestCrossDataCenterFailure(t *testing.T) {
// Failover should not be cross data centers, according to the configuration file
// In case of no viable candidates, we should error out
func TestCrossDataCenterFailureError(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -181,6 +187,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
// Earlier any replicas that were not able to replicate from the previous primary
// were detected by vtorc and could be configured to have their sources detached
t.Skip()
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
@@ -262,6 +269,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
// This test checks that the promotion of a tablet succeeds if it passes the promotion lag test
// covers the test case master-failover-fail-promotion-lag-minutes-success from orchestrator
func TestPromotionLagSuccess(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
ReplicationLagQuery: "select 59",
@@ -311,6 +319,7 @@ func TestPromotionLagFailure(t *testing.T) {
// Earlier vtorc used to check that the promotion lag between the new primary and the old one
// was smaller than the configured value, otherwise it would fail the promotion
t.Skip()
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 1, nil, cluster.VTOrcConfiguration{
ReplicationLagQuery: "select 61",
@@ -363,6 +372,7 @@ func TestPromotionLagFailure(t *testing.T) {
// We explicitly set one of the replicas to Prefer promotion rule.
// That is the replica which should be promoted in case of primary failure
func TestDownPrimaryPromotionRule(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
LockShardTimeoutSeconds: 5,
@@ -410,6 +420,7 @@ func TestDownPrimaryPromotionRule(t *testing.T) {
// That is the replica which should be promoted in case of primary failure
// It should also be caught up when it is promoted
func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
LockShardTimeoutSeconds: 5,
@@ -489,6 +500,7 @@ func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
// We let a replica in our own cell lag. That is the replica which should be promoted in case of primary failure
// It should also be caught up when it is promoted
func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
LockShardTimeoutSeconds: 5,
3 changes: 2 additions & 1 deletion go/test/endtoend/vtorc/readtopologyinstance/main_test.go
Original file line number Diff line number Diff line change
@@ -40,6 +40,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
defer func() {
clusterInfo.ClusterInstance.Teardown()
}()
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
oldArgs := os.Args
@@ -146,7 +147,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
assert.Equal(t, replicaInstance.ReadBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
assert.Greater(t, replicaInstance.ReadBinlogCoordinates.LogPos, int64(0))
assert.Equal(t, replicaInstance.ExecBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
assert.LessOrEqual(t, replicaInstance.ExecBinlogCoordinates.LogPos, replicaInstance.ReadBinlogCoordinates.LogPos)
assert.Greater(t, replicaInstance.ExecBinlogCoordinates.LogPos, int64(0))
assert.Contains(t, replicaInstance.RelaylogCoordinates.LogFile, fmt.Sprintf("vt-0000000%d-relay", replica.TabletUID))
assert.Greater(t, replicaInstance.RelaylogCoordinates.LogPos, int64(0))
assert.Empty(t, replicaInstance.LastIOError)
23 changes: 23 additions & 0 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
@@ -946,3 +946,26 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
successCount := successfulRecoveriesMap[recoveryName]
assert.EqualValues(t, countExpected, successCount)
}

// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test.
// This function is supposed to be called as the first defer command from the vtorc tests.
func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) {
// If the test has not failed, then we don't need to print anything.
if !t.Failed() {
return
}

log.Errorf("Printing VTOrc logs")
for _, vtorc := range clusterInstance.VTOrcProcesses {
if vtorc == nil || vtorc.LogFileName == "" {
continue
}
filePath := path.Join(vtorc.LogDir, vtorc.LogFileName)
log.Errorf("Printing file - %s", filePath)
content, err := os.ReadFile(filePath)
if err != nil {
log.Errorf("Error while reading the file - %v", err)
}
log.Errorf("%s", string(content))
}
}