vitessio · GuptaManan100 · Jul 27, 2023 · Jul 21, 2023 · Jul 21, 2023 · Jul 21, 2023
@@ -6,6 +6,7 @@
   - **[Breaking Changes](#breaking-changes)**
   - **[New command line flags and behavior](#new-flag)**
     - [VTOrc flag `--allow-emergency-reparent`](#new-flag-toggle-ers)
+    - [ERS sub flag `--wait-for-all-tablets`](#new-ers-subflag)
   - **[VTAdmin](#vtadmin)**
     - [Updated to node v18.16.0](#updated-node)
   - **[Deprecations and Deletions](#deprecations-and-deletions)**
@@ -34,6 +35,13 @@ reparent operations. The users that want VTOrc to fix the replication issues, bu
 should start using this flag. By default, VTOrc will be able to run `EmergencyReparentShard`. Users must set the
 flag to `false` to change this behaviour.
 
+#### <a id="new-ers-subflag"/>ERS sub flag `--wait-for-all-tablets`
+
+The `EmergencyReparentShard` command in vtctldclient has a new sub-flag, `--wait-for-all-tablets`, that makes `EmergencyReparentShard` wait
+for a response from all the tablets. Originally, `EmergencyReparentShard` was meant to be run only when the primary tablet is unreachable.
+We have since realized that there are cases where replication is broken but all the tablets are reachable. In these cases, it is advisable to
+call `EmergencyReparentShard` with `--wait-for-all-tablets` so that it doesn't ignore any of the tablets.
+
 ### <a id="vtadmin"/>VTAdmin
 
 #### <a id="updated-node"/>vtadmin-web updated to node v18.16.0 (LTS)

@@ -94,6 +94,7 @@ var emergencyReparentShardOptions = struct {
 	NewPrimaryAliasStr        string
 	IgnoreReplicaAliasStrList []string
 	PreventCrossCellPromotion bool
+	WaitForAllTablets         bool
 }{}
 
 func commandEmergencyReparentShard(cmd *cobra.Command, args []string) error {
@@ -132,6 +133,7 @@ func commandEmergencyReparentShard(cmd *cobra.Command, args []string) error {
 		IgnoreReplicas:            ignoreReplicaAliases,
 		WaitReplicasTimeout:       protoutil.DurationToProto(emergencyReparentShardOptions.WaitReplicasTimeout),
 		PreventCrossCellPromotion: emergencyReparentShardOptions.PreventCrossCellPromotion,
+		WaitForAllTablets:         emergencyReparentShardOptions.WaitForAllTablets,
 	})
 	if err != nil {
 		return err
@@ -281,6 +283,7 @@ func init() {
 	EmergencyReparentShard.Flags().DurationVar(&emergencyReparentShardOptions.WaitReplicasTimeout, "wait-replicas-timeout", topo.RemoteOperationTimeout, "Time to wait for replicas to catch up in reparenting.")
 	EmergencyReparentShard.Flags().StringVar(&emergencyReparentShardOptions.NewPrimaryAliasStr, "new-primary", "", "Alias of a tablet that should be the new primary. If not specified, the vtctld will select the best candidate to promote.")
 	EmergencyReparentShard.Flags().BoolVar(&emergencyReparentShardOptions.PreventCrossCellPromotion, "prevent-cross-cell-promotion", false, "Only promotes a new primary from the same cell as the previous primary.")
+	EmergencyReparentShard.Flags().BoolVar(&emergencyReparentShardOptions.WaitForAllTablets, "wait-for-all-tablets", false, "Should ERS wait for all the tablets to respond. Useful when all the tablets are reachable.")
 	EmergencyReparentShard.Flags().StringSliceVarP(&emergencyReparentShardOptions.IgnoreReplicaAliasStrList, "ignore-replicas", "i", nil, "Comma-separated, repeated list of replica tablet aliases to ignore during the emergency reparent.")
 	Root.AddCommand(EmergencyReparentShard)
 

@@ -156,6 +156,70 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) {
 	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
 }
 
+// TestDeletedPrimaryTablet takes the primary tablet down, deletes its tablet record,
+// and verifies that VTOrc repairs the shard by promoting the remaining replica.
+func TestDeletedPrimaryTablet(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
+	defer cluster.PanicHandler(t)
+	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{}, 1, "none")
+	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
+	shard0 := &keyspace.Shards[0]
+	// find primary from topo
+	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
+	// require (not assert): curPrimary is dereferenced below, so continuing with nil would panic.
+	require.NotNil(t, curPrimary, "should have elected a primary")
+	vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
+	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
+
+	// find the replica and rdonly tablets
+	var replica, rdonly *cluster.Vttablet
+	for _, tablet := range shard0.Vttablets {
+		// we know we have only two replica tablets, so the one that is not the primary must be the other replica
+		if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
+			replica = tablet
+		}
+		if tablet.Type == "rdonly" {
+			rdonly = tablet
+		}
+	}
+	// require (not assert): both tablets are used below, so continuing with nil would panic.
+	require.NotNil(t, replica, "could not find replica tablet")
+	require.NotNil(t, rdonly, "could not find rdonly tablet")
+
+	// check that the replication is setup correctly before we failover
+	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second)
+
+	// Disable VTOrc recoveries
+	vtOrcProcess.DisableGlobalRecoveries(t)
+	// use vtctldclient to stop replication on the replica
+	_, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias)
+	require.NoError(t, err)
+	// insert a write that is not available on the replica (only the rdonly tablet is expected to receive it).
+	utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly}, 10*time.Second)
+
+	// Make the current primary vttablet unavailable and delete its tablet record.
+	_ = curPrimary.VttabletProcess.TearDown()
+	err = curPrimary.MysqlctlProcess.Stop()
+	require.NoError(t, err)
+	// use vtctldclient to start replication on the replica again
+	_, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StartReplication", replica.Alias)
+	require.NoError(t, err)
+	err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", curPrimary.Alias)
+	require.NoError(t, err)
+	// Enable VTOrc recoveries now
+	vtOrcProcess.EnableGlobalRecoveries(t)
+
+	defer func() {
+		// we remove the tablet from our global list
+		utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
+	}()
+
+	// check that the replica gets promoted. Also verify that it has all the writes.
+	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
+	utils.CheckTabletUptoDate(t, clusterInfo, replica)
+
+	// also check that the replication is working correctly after failover
+	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
+	utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryTabletDeletedRecoveryName, 1)
+}
+
+
 // TestDeadPrimaryRecoversImmediately tests VTOrc's ability to recover immediately if the primary is dead.
 // The reason is that, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know
 // that the primary is unreachable. This helps us save a few seconds, depending on the value of the `RemoteOperationTimeout` flag.
@@ -217,7 +281,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
 	// log prefix printed at the end of analysis where we conclude we have DeadPrimary
 	t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery")
 	// log prefix printed at the end of recovery
-	t2 := extractTimeFromLog(t, logFile, "auditType:recover-dead-primary")
+	t2 := extractTimeFromLog(t, logFile, "auditType:RecoverDeadPrimary")
 	curr := time.Now().Format("2006-01-02")
 	timeLayout := "2006-01-02 15:04:05.000000"
 	timeStr1 := fmt.Sprintf("%s %s", curr, t1)

@@ -203,10 +203,8 @@ func shutdownVttablets(clusterInfo *VTOrcClusterInfo) error {
 			}
 			// Remove the tablet record for this tablet
 		}
-		err = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", vttablet.Alias)
-		if err != nil {
-			return err
-		}
+		// Ignoring error here because some tests delete tablets themselves.
+		_ = clusterInfo.ClusterInstance.VtctlclientProcess.ExecuteCommand("DeleteTablet", vttablet.Alias)
 	}
 	clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets = nil
 	return nil
@@ -304,6 +302,13 @@ func SetupVttabletsAndVTOrcs(t *testing.T, clusterInfo *VTOrcClusterInfo, numRep
 	}
 	out, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("SetKeyspaceDurabilityPolicy", keyspaceName, fmt.Sprintf("--durability-policy=%s", durability))
 	require.NoError(t, err, out)
+	// VTOrc now uses shard record too, so we need to clear that as well for correct testing.
+	_, err = clusterInfo.Ts.UpdateShardFields(context.Background(), keyspaceName, shardName, func(info *topo.ShardInfo) error {
+		info.PrimaryTermStartTime = nil
+		info.PrimaryAlias = nil
+		return nil
+	})
+	require.NoError(t, err)
 
 	// start vtorc
 	StartVTOrcs(t, clusterInfo, orcExtraArgs, config, vtorcCount)
@@ -430,8 +435,8 @@ func CheckReplication(t *testing.T, clusterInfo *VTOrcClusterInfo, primary *clus
 				time.Sleep(100 * time.Millisecond)
 				break
 			}
-			confirmReplication(t, primary, replicas, time.Until(endTime), clusterInfo.lastUsedValue)
 			clusterInfo.lastUsedValue++
+			confirmReplication(t, primary, replicas, time.Until(endTime), clusterInfo.lastUsedValue)
 			validateTopology(t, clusterInfo, true, time.Until(endTime))
 			return
 		}
@@ -442,8 +447,8 @@ func CheckReplication(t *testing.T, clusterInfo *VTOrcClusterInfo, primary *clus
 // Call this function only after CheckReplication has been executed once, since that function creates the table that this function uses.
 func VerifyWritesSucceed(t *testing.T, clusterInfo *VTOrcClusterInfo, primary *cluster.Vttablet, replicas []*cluster.Vttablet, timeToWait time.Duration) {
 	t.Helper()
-	confirmReplication(t, primary, replicas, timeToWait, clusterInfo.lastUsedValue)
 	clusterInfo.lastUsedValue++
+	confirmReplication(t, primary, replicas, timeToWait, clusterInfo.lastUsedValue)
 }
 
 func confirmReplication(t *testing.T, primary *cluster.Vttablet, replicas []*cluster.Vttablet, timeToWait time.Duration, valueToInsert int) {
@@ -478,6 +483,12 @@ func confirmReplication(t *testing.T, primary *cluster.Vttablet, replicas []*clu
 	}
 }
 
+// CheckTabletUptoDate verifies that the tablet has the most recent write,
+// i.e. the row whose id equals clusterInfo.lastUsedValue. It fails the test otherwise.
+func CheckTabletUptoDate(t *testing.T, clusterInfo *VTOrcClusterInfo, tablet *cluster.Vttablet) {
+	// Mark as a helper, like VerifyWritesSucceed, so failures are attributed to the calling test.
+	t.Helper()
+	err := checkInsertedValues(t, tablet, clusterInfo.lastUsedValue)
+	require.NoError(t, err)
+}
+
 func checkInsertedValues(t *testing.T, tablet *cluster.Vttablet, index int) error {
 	selectSQL := fmt.Sprintf("select msg from vt_ks.vt_insert_test where id=%d", index)
 	qr, err := RunSQL(t, selectSQL, tablet, "")