Skip to content

Commit

Permalink
[release-16.0]: Fix flakiness in VTOrc tests (#13489) (#13528)
Browse files Browse the repository at this point in the history
Signed-off-by: Manan Gupta <manan@planetscale.com>
  • Loading branch information
GuptaManan100 authored Jul 19, 2023
1 parent 19db42e commit 721d2c3
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 11 deletions.
22 changes: 13 additions & 9 deletions go/test/endtoend/cluster/vtorc_process.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ import (
// vtorc as a separate process for testing
type VTOrcProcess struct {
VtctlProcess
Port int
LogDir string
ExtraArgs []string
ConfigPath string
Config VTOrcConfiguration
WebPort int
proc *exec.Cmd
exit chan error
Port int
LogDir string
LogFileName string
ExtraArgs []string
ConfigPath string
Config VTOrcConfiguration
WebPort int
proc *exec.Cmd
exit chan error
}

type VTOrcConfiguration struct {
Expand Down Expand Up @@ -124,7 +125,10 @@ func (orc *VTOrcProcess) Setup() (err error) {
orc.proc.Args = append(orc.proc.Args, orc.ExtraArgs...)
orc.proc.Args = append(orc.proc.Args, "--alsologtostderr")

errFile, _ := os.Create(path.Join(orc.LogDir, fmt.Sprintf("orc-stderr-%d.txt", timeNow)))
if orc.LogFileName == "" {
orc.LogFileName = fmt.Sprintf("orc-stderr-%d.txt", timeNow)
}
errFile, _ := os.Create(path.Join(orc.LogDir, orc.LogFileName))
orc.proc.Stderr = errFile

orc.proc.Env = append(orc.proc.Env, os.Environ()...)
Expand Down
9 changes: 9 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
// verify replication is setup
// verify that with multiple vtorc instances, we still only have 1 PlannedReparentShard call
func TestPrimaryElection(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand Down Expand Up @@ -64,6 +65,7 @@ func TestPrimaryElection(t *testing.T) {
// verify rdonly is not elected, only replica
// verify replication is setup
func TestSingleKeyspace(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand All @@ -81,6 +83,7 @@ func TestSingleKeyspace(t *testing.T) {
// verify rdonly is not elected, only replica
// verify replication is setup
func TestKeyspaceShard(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand All @@ -100,6 +103,7 @@ func TestKeyspaceShard(t *testing.T) {
// 4. setup replication from non-primary, let vtorc repair
// 5. make instance A replicates from B and B from A, wait for repair
func TestVTOrcRepairs(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand Down Expand Up @@ -216,6 +220,7 @@ func TestVTOrcRepairs(t *testing.T) {
func TestRepairAfterTER(t *testing.T) {
// test fails intermittently on CI, skip until it can be fixed.
t.SkipNow()
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand Down Expand Up @@ -252,6 +257,7 @@ func TestSemiSync(t *testing.T) {
// stop any vtorc instance running due to a previous test.
utils.StopVTOrcs(t, clusterInfo)
newCluster := utils.SetupNewClusterSemiSync(t)
defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1)
Expand Down Expand Up @@ -316,6 +322,7 @@ func TestSemiSync(t *testing.T) {

// TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld
func TestVTOrcWithPrs(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand Down Expand Up @@ -364,6 +371,7 @@ func TestVTOrcWithPrs(t *testing.T) {

// TestMultipleDurabilities tests that VTOrc works with 2 keyspaces having 2 different durability policies
func TestMultipleDurabilities(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
// Setup a normal cluster and start vtorc
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{}, 1, "")
Expand All @@ -388,6 +396,7 @@ func TestDurabilityPolicySetLater(t *testing.T) {
// stop any vtorc instance running due to a previous test.
utils.StopVTOrcs(t, clusterInfo)
newCluster := utils.SetupNewClusterSemiSync(t)
defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
keyspace := &newCluster.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
// Before starting VTOrc we explicity want to set the durability policy of the keyspace to an empty string
Expand Down
14 changes: 13 additions & 1 deletion go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,12 @@ import (
// covers the test case master-failover from orchestrator
// Also tests that VTOrc can handle multiple failures, if the durability policies allow it
func TestDownPrimary(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{
// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
// If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source.
// If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test.
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1, "semi_sync")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
Expand Down Expand Up @@ -90,6 +94,7 @@ func TestDownPrimary(t *testing.T) {
// Failover should not be cross data centers, according to the configuration file
// covers part of the test case master-failover-lost-replicas from orchestrator
func TestCrossDataCenterFailure(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand Down Expand Up @@ -135,6 +140,7 @@ func TestCrossDataCenterFailure(t *testing.T) {
// Failover should not be cross data centers, according to the configuration file
// In case of no viable candidates, we should error out
func TestCrossDataCenterFailureError(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand Down Expand Up @@ -181,6 +187,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
// Earlier any replicas that were not able to replicate from the previous primary
// were detected by vtorc and could be configured to have their sources detached
t.Skip()
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
Expand Down Expand Up @@ -262,6 +269,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
// This test checks that the promotion of a tablet succeeds if it passes the promotion lag test
// covers the test case master-failover-fail-promotion-lag-minutes-success from orchestrator
func TestPromotionLagSuccess(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
ReplicationLagQuery: "select 59",
Expand Down Expand Up @@ -311,6 +319,7 @@ func TestPromotionLagFailure(t *testing.T) {
// Earlier vtorc used to check that the promotion lag between the new primary and the old one
// was smaller than the configured value, otherwise it would fail the promotion
t.Skip()
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 1, nil, cluster.VTOrcConfiguration{
ReplicationLagQuery: "select 61",
Expand Down Expand Up @@ -363,6 +372,7 @@ func TestPromotionLagFailure(t *testing.T) {
// We explicitly set one of the replicas to Prefer promotion rule.
// That is the replica which should be promoted in case of primary failure
func TestDownPrimaryPromotionRule(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
LockShardTimeoutSeconds: 5,
Expand Down Expand Up @@ -410,6 +420,7 @@ func TestDownPrimaryPromotionRule(t *testing.T) {
// That is the replica which should be promoted in case of primary failure
// It should also be caught up when it is promoted
func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
LockShardTimeoutSeconds: 5,
Expand Down Expand Up @@ -489,6 +500,7 @@ func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
// We let a replica in our own cell lag. That is the replica which should be promoted in case of primary failure
// It should also be caught up when it is promoted
func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
LockShardTimeoutSeconds: 5,
Expand Down
3 changes: 2 additions & 1 deletion go/test/endtoend/vtorc/readtopologyinstance/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
defer func() {
clusterInfo.ClusterInstance.Teardown()
}()
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
oldArgs := os.Args
Expand Down Expand Up @@ -146,7 +147,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
assert.Equal(t, replicaInstance.ReadBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
assert.Greater(t, replicaInstance.ReadBinlogCoordinates.LogPos, int64(0))
assert.Equal(t, replicaInstance.ExecBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
assert.LessOrEqual(t, replicaInstance.ExecBinlogCoordinates.LogPos, replicaInstance.ReadBinlogCoordinates.LogPos)
assert.Greater(t, replicaInstance.ExecBinlogCoordinates.LogPos, int64(0))
assert.Contains(t, replicaInstance.RelaylogCoordinates.LogFile, fmt.Sprintf("vt-0000000%d-relay", replica.TabletUID))
assert.Greater(t, replicaInstance.RelaylogCoordinates.LogPos, int64(0))
assert.Empty(t, replicaInstance.LastIOError)
Expand Down
23 changes: 23 additions & 0 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -946,3 +946,26 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
successCount := successfulRecoveriesMap[recoveryName]
assert.EqualValues(t, countExpected, successCount)
}

// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test.
// This function is supposed to be called as the first defer command from the vtorc tests.
func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) {
// If the test has not failed, then we don't need to print anything.
if !t.Failed() {
return
}

log.Errorf("Printing VTOrc logs")
for _, vtorc := range clusterInstance.VTOrcProcesses {
if vtorc == nil || vtorc.LogFileName == "" {
continue
}
filePath := path.Join(vtorc.LogDir, vtorc.LogFileName)
log.Errorf("Printing file - %s", filePath)
content, err := os.ReadFile(filePath)
if err != nil {
log.Errorf("Error while reading the file - %v", err)
}
log.Errorf("%s", string(content))
}
}

0 comments on commit 721d2c3

Please sign in to comment.