diff --git a/go/vt/orchestrator/db/db.go b/go/vt/orchestrator/db/db.go index 7a578d31f0a..878bce9242f 100644 --- a/go/vt/orchestrator/db/db.go +++ b/go/vt/orchestrator/db/db.go @@ -30,11 +30,25 @@ import ( var ( EmptyArgs []any + Db DB = (*vtorcDB)(nil) ) var mysqlURI string var dbMutex sync.Mutex +type DB interface { + QueryOrchestrator(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error +} + +type vtorcDB struct { +} + +var _ DB = (*vtorcDB)(nil) + +func (m *vtorcDB) QueryOrchestrator(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error { + return QueryOrchestrator(query, argsArray, onRow) +} + type DummySQLResult struct { } diff --git a/go/vt/orchestrator/inst/analysis.go b/go/vt/orchestrator/inst/analysis.go index bdc3baa82de..01f984c7723 100644 --- a/go/vt/orchestrator/inst/analysis.go +++ b/go/vt/orchestrator/inst/analysis.go @@ -30,47 +30,33 @@ type AnalysisCode string type StructureAnalysisCode string const ( - NoProblem AnalysisCode = "NoProblem" - ClusterHasNoPrimary AnalysisCode = "ClusterHasNoPrimary" - DeadPrimaryWithoutReplicas AnalysisCode = "DeadPrimaryWithoutReplicas" - DeadPrimary AnalysisCode = "DeadPrimary" - DeadPrimaryAndReplicas AnalysisCode = "DeadPrimaryAndReplicas" - DeadPrimaryAndSomeReplicas AnalysisCode = "DeadPrimaryAndSomeReplicas" - PrimaryHasPrimary AnalysisCode = "PrimaryHasPrimary" - PrimaryIsReadOnly AnalysisCode = "PrimaryIsReadOnly" - PrimarySemiSyncMustBeSet AnalysisCode = "PrimarySemiSyncMustBeSet" - PrimarySemiSyncMustNotBeSet AnalysisCode = "PrimarySemiSyncMustNotBeSet" - ReplicaIsWritable AnalysisCode = "ReplicaIsWritable" - NotConnectedToPrimary AnalysisCode = "NotConnectedToPrimary" - ConnectedToWrongPrimary AnalysisCode = "ConnectedToWrongPrimary" - ReplicationStopped AnalysisCode = "ReplicationStopped" - ReplicaSemiSyncMustBeSet AnalysisCode = "ReplicaSemiSyncMustBeSet" - ReplicaSemiSyncMustNotBeSet AnalysisCode = "ReplicaSemiSyncMustNotBeSet" - UnreachablePrimaryWithLaggingReplicas AnalysisCode = "UnreachablePrimaryWithLaggingReplicas" - UnreachablePrimary AnalysisCode = "UnreachablePrimary" - PrimarySingleReplicaNotReplicating AnalysisCode = "PrimarySingleReplicaNotReplicating" - PrimarySingleReplicaDead AnalysisCode = "PrimarySingleReplicaDead" - AllPrimaryReplicasNotReplicating AnalysisCode = "AllPrimaryReplicasNotReplicating" - AllPrimaryReplicasNotReplicatingOrDead AnalysisCode = "AllPrimaryReplicasNotReplicatingOrDead" - LockedSemiSyncPrimaryHypothesis AnalysisCode = "LockedSemiSyncPrimaryHypothesis" - LockedSemiSyncPrimary AnalysisCode = "LockedSemiSyncPrimary" - PrimaryWithoutReplicas AnalysisCode = "PrimaryWithoutReplicas" - DeadCoPrimary AnalysisCode = "DeadCoPrimary" - DeadCoPrimaryAndSomeReplicas AnalysisCode = "DeadCoPrimaryAndSomeReplicas" - UnreachableCoPrimary AnalysisCode = "UnreachableCoPrimary" - AllCoPrimaryReplicasNotReplicating AnalysisCode = "AllCoPrimaryReplicasNotReplicating" - DeadIntermediatePrimary AnalysisCode = "DeadIntermediatePrimary" - DeadIntermediatePrimaryWithSingleReplica AnalysisCode = "DeadIntermediatePrimaryWithSingleReplica" - DeadIntermediatePrimaryWithSingleReplicaFailingToConnect AnalysisCode = "DeadIntermediatePrimaryWithSingleReplicaFailingToConnect" - DeadIntermediatePrimaryAndSomeReplicas AnalysisCode = "DeadIntermediatePrimaryAndSomeReplicas" - DeadIntermediatePrimaryAndReplicas AnalysisCode = "DeadIntermediatePrimaryAndReplicas" - UnreachableIntermediatePrimaryWithLaggingReplicas AnalysisCode = "UnreachableIntermediatePrimaryWithLaggingReplicas" - 
UnreachableIntermediatePrimary AnalysisCode = "UnreachableIntermediatePrimary" - AllIntermediatePrimaryReplicasFailingToConnectOrDead AnalysisCode = "AllIntermediatePrimaryReplicasFailingToConnectOrDead" - AllIntermediatePrimaryReplicasNotReplicating AnalysisCode = "AllIntermediatePrimaryReplicasNotReplicating" - FirstTierReplicaFailingToConnectToPrimary AnalysisCode = "FirstTierReplicaFailingToConnectToPrimary" - BinlogServerFailingToConnectToPrimary AnalysisCode = "BinlogServerFailingToConnectToPrimary" - GraceFulPrimaryTakeover AnalysisCode = "GracefulPrimaryTakeover" + NoProblem AnalysisCode = "NoProblem" + ClusterHasNoPrimary AnalysisCode = "ClusterHasNoPrimary" + DeadPrimaryWithoutReplicas AnalysisCode = "DeadPrimaryWithoutReplicas" + DeadPrimary AnalysisCode = "DeadPrimary" + DeadPrimaryAndReplicas AnalysisCode = "DeadPrimaryAndReplicas" + DeadPrimaryAndSomeReplicas AnalysisCode = "DeadPrimaryAndSomeReplicas" + PrimaryHasPrimary AnalysisCode = "PrimaryHasPrimary" + PrimaryIsReadOnly AnalysisCode = "PrimaryIsReadOnly" + PrimarySemiSyncMustBeSet AnalysisCode = "PrimarySemiSyncMustBeSet" + PrimarySemiSyncMustNotBeSet AnalysisCode = "PrimarySemiSyncMustNotBeSet" + ReplicaIsWritable AnalysisCode = "ReplicaIsWritable" + NotConnectedToPrimary AnalysisCode = "NotConnectedToPrimary" + ConnectedToWrongPrimary AnalysisCode = "ConnectedToWrongPrimary" + ReplicationStopped AnalysisCode = "ReplicationStopped" + ReplicaSemiSyncMustBeSet AnalysisCode = "ReplicaSemiSyncMustBeSet" + ReplicaSemiSyncMustNotBeSet AnalysisCode = "ReplicaSemiSyncMustNotBeSet" + UnreachablePrimaryWithLaggingReplicas AnalysisCode = "UnreachablePrimaryWithLaggingReplicas" + UnreachablePrimary AnalysisCode = "UnreachablePrimary" + PrimarySingleReplicaNotReplicating AnalysisCode = "PrimarySingleReplicaNotReplicating" + PrimarySingleReplicaDead AnalysisCode = "PrimarySingleReplicaDead" + AllPrimaryReplicasNotReplicating AnalysisCode = "AllPrimaryReplicasNotReplicating" + AllPrimaryReplicasNotReplicatingOrDead AnalysisCode = "AllPrimaryReplicasNotReplicatingOrDead" + LockedSemiSyncPrimaryHypothesis AnalysisCode = "LockedSemiSyncPrimaryHypothesis" + LockedSemiSyncPrimary AnalysisCode = "LockedSemiSyncPrimary" + PrimaryWithoutReplicas AnalysisCode = "PrimaryWithoutReplicas" + BinlogServerFailingToConnectToPrimary AnalysisCode = "BinlogServerFailingToConnectToPrimary" + GraceFulPrimaryTakeover AnalysisCode = "GracefulPrimaryTakeover" ) const ( diff --git a/go/vt/orchestrator/inst/analysis_dao.go b/go/vt/orchestrator/inst/analysis_dao.go index 10c6b4a121c..4f7c243353e 100644 --- a/go/vt/orchestrator/inst/analysis_dao.go +++ b/go/vt/orchestrator/inst/analysis_dao.go @@ -363,7 +363,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ` clusters := make(map[string]*clusterAnalysis) - err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + err := db.Db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { a := ReplicationAnalysis{ Analysis: NoProblem, ProcessingNodeHostname: process.ThisHostname, @@ -579,71 +579,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Analysis = AllPrimaryReplicasNotReplicatingOrDead a.Description = "Primary is reachable but none of its replicas is replicating" // - } else /* co-primary */ if a.IsCoPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadCoPrimary - a.Description = "Co-primary cannot be reached by 
orchestrator and none of its replicas is replicating" - // - } else if a.IsCoPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadCoPrimaryAndSomeReplicas - a.Description = "Co-primary cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" - // - } else if a.IsCoPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { - a.Analysis = UnreachableCoPrimary - a.Description = "Co-primary cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" - // - } else if a.IsCoPrimary && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = AllCoPrimaryReplicasNotReplicating - a.Description = "Co-primary is reachable but none of its replicas is replicating" - // - } else /* intermediate-primary */ if !a.IsPrimary && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountReplicasFailingToConnectToPrimary == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadIntermediatePrimaryWithSingleReplicaFailingToConnect - a.Description = "Intermediate primary cannot be reached by orchestrator and its (single) replica is failing to connect" - // - } else if !a.IsPrimary && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadIntermediatePrimaryWithSingleReplica - a.Description = "Intermediate primary cannot be reached by orchestrator and its (single) replica is not replicating" - // - } else if !a.IsPrimary && !a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadIntermediatePrimary - a.Description = "Intermediate primary cannot be reached by orchestrator and none of its replicas is replicating" - // - } else if !a.IsPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = DeadIntermediatePrimaryAndSomeReplicas - a.Description = "Intermediate primary cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" - // - } else if !a.IsPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 { - a.Analysis = DeadIntermediatePrimaryAndReplicas - a.Description = "Intermediate primary cannot be reached by orchestrator and all of its replicas are unreachable" - // - } else if !a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { - a.Analysis = UnreachableIntermediatePrimaryWithLaggingReplicas - a.Description = "Intermediate primary cannot be reached by orchestrator and all of its replicas are lagging" - // - } else if !a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { - a.Analysis = UnreachableIntermediatePrimary - a.Description = "Intermediate primary cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" - // - } else if !a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicatingReplicas == 0 && - 
a.CountReplicasFailingToConnectToPrimary > 0 && a.CountReplicasFailingToConnectToPrimary == a.CountValidReplicas { - // All replicas are either failing to connect to primary (and at least one of these have to exist) - // or completely dead. - // Must have at least two replicas to reach such conclusion -- do note that the intermediate primary is still - // reachable to orchestrator, so we base our conclusion on replicas only at this point. - a.Analysis = AllIntermediatePrimaryReplicasFailingToConnectOrDead - a.Description = "Intermediate primary is reachable but all of its replicas are failing to connect" - // - } else if !a.IsPrimary && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { - a.Analysis = AllIntermediatePrimaryReplicasNotReplicating - a.Description = "Intermediate primary is reachable but none of its replicas is replicating" - // } else if a.IsBinlogServer && a.IsFailingToConnectToPrimary { a.Analysis = BinlogServerFailingToConnectToPrimary a.Description = "Binlog server is unable to connect to its primary" // - } else if a.ReplicationDepth == 1 && a.IsFailingToConnectToPrimary { - a.Analysis = FirstTierReplicaFailingToConnectToPrimary - a.Description = "1st tier replica (directly replicating from topology primary) is unable to connect to the primary" - // } // else if a.IsPrimary && a.CountReplicas == 0 { // a.Analysis = PrimaryWithoutReplicas @@ -666,14 +605,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) switch a.Analysis { case AllPrimaryReplicasNotReplicating, AllPrimaryReplicasNotReplicatingOrDead, - PrimarySingleReplicaDead, - AllCoPrimaryReplicasNotReplicating, - DeadIntermediatePrimaryWithSingleReplica, - DeadIntermediatePrimaryWithSingleReplicaFailingToConnect, - DeadIntermediatePrimaryAndReplicas, - DeadIntermediatePrimaryAndSomeReplicas, - AllIntermediatePrimaryReplicasFailingToConnectOrDead, - AllIntermediatePrimaryReplicasNotReplicating: + PrimarySingleReplicaDead: a.IsReplicasDowntimed = true a.SkippableDueToDowntime = true } diff --git a/go/vt/orchestrator/inst/analysis_dao_test.go b/go/vt/orchestrator/inst/analysis_dao_test.go new file mode 100644 index 00000000000..47b61156b81 --- /dev/null +++ b/go/vt/orchestrator/inst/analysis_dao_test.go @@ -0,0 +1,462 @@ +/* +Copyright 2022 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package inst + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + "vitess.io/vitess/go/vt/orchestrator/test" + topodatapb "vitess.io/vitess/go/vt/proto/topodata" + "vitess.io/vitess/go/vt/vtctl/reparentutil" +) + +func TestGetReplicationAnalysis(t *testing.T) { + tests := []struct { + name string + info []*test.InfoForRecoveryAnalysis + durability string + codeWanted AnalysisCode + wantErr string + }{ + { + name: "ClusterHasNoPrimary", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 1, + }}, + codeWanted: ClusterHasNoPrimary, + }, { + name: "DeadPrimary", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 0, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 0, + IsPrimary: 1, + }}, + codeWanted: DeadPrimary, + }, { + name: "DeadPrimaryWithoutReplicas", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 0, + CountReplicas: 0, + IsPrimary: 1, + }}, + codeWanted: DeadPrimaryWithoutReplicas, + }, { + name: "DeadPrimaryAndReplicas", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 0, + CountReplicas: 3, + IsPrimary: 1, + }}, + codeWanted: DeadPrimaryAndReplicas, + }, { + name: "DeadPrimaryAndSomeReplicas", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 0, + CountReplicas: 4, + CountValidReplicas: 2, + CountValidReplicatingReplicas: 0, + IsPrimary: 1, + }}, + codeWanted: DeadPrimaryAndSomeReplicas, + }, { + name: "PrimaryHasPrimary", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + IsPrimary: 0, + }}, + codeWanted: PrimaryHasPrimary, + }, { + name: "PrimaryIsReadOnly", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 1, + CountReplicas: 4, + 
CountValidReplicas: 4, + IsPrimary: 1, + ReadOnly: 1, + }}, + codeWanted: PrimaryIsReadOnly, + }, { + name: "PrimarySemiSyncMustNotBeSet", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + IsPrimary: 1, + SemiSyncPrimaryEnabled: 1, + }}, + codeWanted: PrimarySemiSyncMustNotBeSet, + }, { + name: "PrimarySemiSyncMustBeSet", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + IsPrimary: 1, + SemiSyncPrimaryEnabled: 0, + }}, + durability: "semi_sync", + codeWanted: PrimarySemiSyncMustBeSet, + }, { + name: "NotConnectedToPrimary", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 3, + CountValidOracleGTIDReplicas: 4, + CountLoggingReplicas: 2, + IsPrimary: 1, + }, { + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + LastCheckValid: 1, + ReadOnly: 1, + IsPrimary: 1, + }}, + codeWanted: NotConnectedToPrimary, + }, { + name: "ReplicaIsWritable", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 3, + CountValidOracleGTIDReplicas: 4, + CountLoggingReplicas: 2, + IsPrimary: 1, + }, { + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + SourceHost: "localhost", + SourcePort: 6708, + LastCheckValid: 1, + ReadOnly: 0, + }}, + codeWanted: ReplicaIsWritable, + }, { + name: "ConnectedToWrongPrimary", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 3, + CountValidOracleGTIDReplicas: 4, + CountLoggingReplicas: 2, + IsPrimary: 1, + }, { + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + 
MysqlPort: 6709, + }, + SourceHost: "localhost", + SourcePort: 6706, + LastCheckValid: 1, + ReadOnly: 1, + }}, + codeWanted: ConnectedToWrongPrimary, + }, { + name: "ReplicationStopped", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 3, + CountValidOracleGTIDReplicas: 4, + CountLoggingReplicas: 2, + IsPrimary: 1, + }, { + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + SourceHost: "localhost", + SourcePort: 6708, + LastCheckValid: 1, + ReadOnly: 1, + ReplicationStopped: 1, + }}, + codeWanted: ReplicationStopped, + }, + { + name: "ReplicaSemiSyncMustBeSet", + durability: "semi_sync", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 3, + CountValidOracleGTIDReplicas: 4, + CountLoggingReplicas: 2, + IsPrimary: 1, + SemiSyncPrimaryEnabled: 1, + }, { + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + PrimaryTabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + SourceHost: "localhost", + SourcePort: 6708, + LastCheckValid: 1, + ReadOnly: 1, + SemiSyncReplicaEnabled: 0, + }}, + codeWanted: ReplicaSemiSyncMustBeSet, + }, { + name: "ReplicaSemiSyncMustNotBeSet", + info: []*test.InfoForRecoveryAnalysis{{ + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + LastCheckValid: 1, + CountReplicas: 4, + CountValidReplicas: 4, + CountValidReplicatingReplicas: 3, + CountValidOracleGTIDReplicas: 4, + CountLoggingReplicas: 2, + IsPrimary: 1, + }, { + TabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_REPLICA, + MysqlHostname: "localhost", + MysqlPort: 6709, + }, + PrimaryTabletInfo: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101}, + Hostname: "localhost", + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + MysqlHostname: "localhost", + MysqlPort: 6708, + }, + SourceHost: "localhost", + SourcePort: 6708, + LastCheckValid: 1, + ReadOnly: 1, + SemiSyncReplicaEnabled: 1, + }}, + codeWanted: ReplicaSemiSyncMustNotBeSet, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.durability == "" { + 
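// Default to the "none" durability policy when a test case doesn't specify one. +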
tt.durability = "none" + } + err := reparentutil.SetDurabilityPolicy(tt.durability) + require.NoError(t, err) + + var rowMaps []sqlutils.RowMap + for _, analysis := range tt.info { + analysis.SetValuesFromTabletInfo() + rowMaps = append(rowMaps, analysis.ConvertToRowMap()) + } + db.Db = test.NewTestDB([][]sqlutils.RowMap{rowMaps}) + + got, err := GetReplicationAnalysis("", &ReplicationAnalysisHints{}) + if tt.wantErr != "" { + require.EqualError(t, err, tt.wantErr) + return + } + require.NoError(t, err) + require.Len(t, got, 1) + require.Equal(t, tt.codeWanted, got[0].Analysis) + }) + } +} diff --git a/go/vt/orchestrator/inst/instance_topology.go b/go/vt/orchestrator/inst/instance_topology.go index 15fe5f198da..9ebe8cdb033 100644 --- a/go/vt/orchestrator/inst/instance_topology.go +++ b/go/vt/orchestrator/inst/instance_topology.go @@ -929,7 +929,7 @@ func MakeCoPrimary(instanceKey *InstanceKey) (*Instance, error) { } log.Infof("Will make %+v co-primary of %+v", instanceKey, primary.Key) - var gitHint OperationGTIDHint = GTIDHintNeutral + var gitHint = GTIDHintNeutral if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), fmt.Sprintf("make co-primary of %+v", primary.Key)); merr != nil { err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) goto Cleanup diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go index d5b77fbda18..797d0977378 100644 --- a/go/vt/orchestrator/logic/topology_recovery.go +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -60,6 +60,21 @@ const ( IntermediatePrimaryRecovery RecoveryType = "IntermediatePrimaryRecovery" ) +// recoveryFunction is the code of the recovery function to be used +// this is returned from getCheckAndRecoverFunction to compare the functions returned +type recoveryFunction int + +const ( + noRecoveryFunc recoveryFunction = iota + recoverGenericProblemFunc + recoverDeadPrimaryFunc + recoverPrimaryHasPrimaryFunc + recoverLockedSemiSyncPrimaryFunc + electNewPrimaryFunc + fixPrimaryFunc + fixReplicaFunc +) + type RecoveryAcknowledgement struct { CreatedAt time.Time Owner string @@ -612,45 +627,27 @@ func replacePromotedReplicaWithCandidate(topologyRecovery *TopologyRecovery, dea return promotedReplica, nil } -// checkAndRecoverDeadPrimary checks a given analysis, decides whether to take action, and possibly takes action -// Returns true when action was taken. -func checkAndRecoverDeadPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedPrimaryRecovery) { - return false, nil, nil - } - - // We lock the shard here and then refresh the tablets information - ctx, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceKey) - if err != nil { +// recoverPrimaryHasPrimary resets the replication on the primary instance +func recoverPrimaryHasPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true) + if topologyRecovery == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. 
Will not issue another fixPrimaryHasPrimary.", analysisEntry.AnalyzedInstanceKey)) return false, nil, err } - defer unlock(&err) - - // TODO (@GuptaManan100): Refresh only the shard tablet information instead of all the tablets - RefreshTablets(true /* forceRefresh */) + log.Infof("Analysis: %v, will fix incorrect primaryship %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) - // Run a replication analysis again. We need this because vtorc works on ephemeral data to find the failure scenarios. - // That data might be old, because of a cluster operation that was run through vtctld or some other vtorc. So before we do any - // changes, we should be checking that this failure is indeed needed to be fixed. We do this after locking the shard to be sure - // that the data that we use now is up-to-date. - analysisEntries, err := inst.GetReplicationAnalysis(analysisEntry.ClusterDetails.ClusterName, &inst.ReplicationAnalysisHints{}) + // Reset replication on current primary. + _, err = inst.ResetReplicationOperation(&analysisEntry.AnalyzedInstanceKey) if err != nil { - return false, nil, err - } - - // The recovery is only required if the same instance key requires a DeadPrimary or DeadPrimaryAndSomeReplicas recovery. - recoveryRequired := false - for _, entry := range analysisEntries { - if entry.AnalyzedInstanceKey.Equals(&analysisEntry.AnalyzedInstanceKey) { - if entry.Analysis == inst.DeadPrimary || entry.Analysis == inst.DeadPrimaryAndSomeReplicas { - recoveryRequired = true - } - } + return false, topologyRecovery, err } + return true, topologyRecovery, nil +} - // No recovery is required. Some other agent already fixed the issue. - if !recoveryRequired { - log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis) +// recoverDeadPrimary checks a given analysis, decides whether to take action, and possibly takes action +// Returns true when action was taken. +func recoverDeadPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedPrimaryRecovery) { return false, nil, nil } @@ -1001,37 +998,6 @@ func RecoverDeadIntermediatePrimary(topologyRecovery *TopologyRecovery, skipProc return successorInstance, err } -// checkAndRecoverDeadIntermediatePrimary checks a given analysis, decides whether to take action, and possibly takes action -// Returns true when action was taken. -func checkAndRecoverDeadIntermediatePrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { - if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedIntermediatePrimaryRecovery) { - return false, nil, nil - } - topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) - if topologyRecovery == nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediatePrimary: found an active or recent recovery on %+v. Will not issue another RecoverDeadIntermediatePrimary.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, err - } - - // That's it! We must do recovery! 
- recoverDeadIntermediatePrimaryCounter.Inc(1) - promotedReplica, err := RecoverDeadIntermediatePrimary(topologyRecovery, skipProcesses) - if promotedReplica != nil { - // success - recoverDeadIntermediatePrimarySuccessCounter.Inc(1) - - if !skipProcesses { - // Execute post intermediate-primary-failover processes - topologyRecovery.SuccessorKey = &promotedReplica.Key - topologyRecovery.SuccessorAlias = promotedReplica.InstanceAlias - executeProcesses(config.Config.PostIntermediatePrimaryFailoverProcesses, "PostIntermediatePrimaryFailoverProcesses", topologyRecovery, false) - } - } else { - recoverDeadIntermediatePrimaryFailureCounter.Inc(1) - } - return true, topologyRecovery, err -} - // RecoverDeadCoPrimary recovers a dead co-primary, complete logic inside func RecoverDeadCoPrimary(topologyRecovery *TopologyRecovery, skipProcesses bool) (promotedReplica *inst.Instance, lostReplicas [](*inst.Instance), err error) { topologyRecovery.Type = CoPrimaryRecovery @@ -1151,54 +1117,8 @@ func RecoverDeadCoPrimary(topologyRecovery *TopologyRecovery, skipProcesses bool return promotedReplica, lostReplicas, err } -// checkAndRecoverDeadCoPrimary checks a given analysis, decides whether to take action, and possibly takes action -// Returns true when action was taken. -func checkAndRecoverDeadCoPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { - failedInstanceKey := &analysisEntry.AnalyzedInstanceKey - if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedPrimaryRecovery) { - return false, nil, nil - } - topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) - if topologyRecovery == nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverDeadCoPrimary.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, err - } - - // That's it! We must do recovery! - recoverDeadCoPrimaryCounter.Inc(1) - promotedReplica, lostReplicas, err := RecoverDeadCoPrimary(topologyRecovery, skipProcesses) - resolveRecovery(topologyRecovery, promotedReplica) - if promotedReplica == nil { - inst.AuditOperation("recover-dead-co-primary", failedInstanceKey, "Failure: no replica promoted.") - } else { - inst.AuditOperation("recover-dead-co-primary", failedInstanceKey, fmt.Sprintf("promoted: %+v", promotedReplica.Key)) - } - topologyRecovery.LostReplicas.AddInstances(lostReplicas) - if promotedReplica != nil { - if config.Config.FailPrimaryPromotionIfSQLThreadNotUpToDate && !promotedReplica.SQLThreadUpToDate() { - return false, nil, log.Errorf("Promoted replica %+v: sql thread is not up to date (relay logs still unapplied). 
Aborting promotion", promotedReplica.Key) - } - // success - recoverDeadCoPrimarySuccessCounter.Inc(1) - - if config.Config.ApplyMySQLPromotionAfterPrimaryFailover { - AuditTopologyRecovery(topologyRecovery, "- RecoverDeadPrimary: will apply MySQL changes to promoted primary") - inst.SetReadOnly(&promotedReplica.Key, false) - } - if !skipProcesses { - // Execute post intermediate-primary-failover processes - topologyRecovery.SuccessorKey = &promotedReplica.Key - topologyRecovery.SuccessorAlias = promotedReplica.InstanceAlias - executeProcesses(config.Config.PostPrimaryFailoverProcesses, "PostPrimaryFailoverProcesses", topologyRecovery, false) - } - } else { - recoverDeadCoPrimaryFailureCounter.Inc(1) - } - return true, topologyRecovery, err -} - // checkAndRecoverGenericProblem is a general-purpose recovery function -func checkAndRecoverLockedSemiSyncPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func checkAndRecoverLockedSemiSyncPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) if topologyRecovery == nil { @@ -1210,7 +1130,7 @@ func checkAndRecoverLockedSemiSyncPrimary(analysisEntry inst.ReplicationAnalysis } // checkAndRecoverGenericProblem is a general-purpose recovery function -func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { +func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { return false, nil, nil } @@ -1305,68 +1225,61 @@ func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnal return true, true, err } +// getCheckAndRecoverFunction gets the recovery function to use for the given analysis. +// It also returns a recoveryFunction which is supposed to be unique for each function that we return. +// It is used for checking the equality of the returned function. 
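+// analysisEntriesHaveSameRecovery (below) compares these codes so that checkIfAlreadyFixed can tell whether a re-run analysis still calls for the same recovery.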
func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstanceKey *inst.InstanceKey) ( - checkAndRecoverFunction func(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), + checkAndRecoverFunction func(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), + recoverFunctionCode recoveryFunction, isActionableRecovery bool, ) { switch analysisCode { // primary case inst.DeadPrimary, inst.DeadPrimaryAndSomeReplicas: if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { - return checkAndRecoverGenericProblem, false + return checkAndRecoverGenericProblem, recoverGenericProblemFunc, false } - return checkAndRecoverDeadPrimary, true + return recoverDeadPrimary, recoverDeadPrimaryFunc, true + case inst.PrimaryHasPrimary: + return recoverPrimaryHasPrimary, recoverPrimaryHasPrimaryFunc, true case inst.LockedSemiSyncPrimary: if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { - return checkAndRecoverGenericProblem, false + return checkAndRecoverGenericProblem, recoverGenericProblemFunc, false } - return checkAndRecoverLockedSemiSyncPrimary, true - // topo + return checkAndRecoverLockedSemiSyncPrimary, recoverLockedSemiSyncPrimaryFunc, true case inst.ClusterHasNoPrimary: - return electNewPrimary, true - case inst.PrimaryHasPrimary: - return fixClusterAndPrimary, true + return electNewPrimary, electNewPrimaryFunc, true case inst.PrimaryIsReadOnly, inst.PrimarySemiSyncMustBeSet, inst.PrimarySemiSyncMustNotBeSet: - return fixPrimary, true + return fixPrimary, fixPrimaryFunc, true + // replica case inst.NotConnectedToPrimary, inst.ConnectedToWrongPrimary, inst.ReplicationStopped, inst.ReplicaIsWritable, inst.ReplicaSemiSyncMustBeSet, inst.ReplicaSemiSyncMustNotBeSet: - return fixReplica, false - // intermediate primary - case inst.DeadIntermediatePrimary: - return checkAndRecoverDeadIntermediatePrimary, true - case inst.DeadIntermediatePrimaryAndSomeReplicas: - return checkAndRecoverDeadIntermediatePrimary, true - case inst.DeadIntermediatePrimaryWithSingleReplicaFailingToConnect: - return checkAndRecoverDeadIntermediatePrimary, true - case inst.AllIntermediatePrimaryReplicasFailingToConnectOrDead: - return checkAndRecoverDeadIntermediatePrimary, true - case inst.DeadIntermediatePrimaryAndReplicas: - return checkAndRecoverGenericProblem, false - // co-primary - case inst.DeadCoPrimary: - return checkAndRecoverDeadCoPrimary, true - case inst.DeadCoPrimaryAndSomeReplicas: - return checkAndRecoverDeadCoPrimary, true + return fixReplica, fixReplicaFunc, true // primary, non actionable case inst.DeadPrimaryAndReplicas: - return checkAndRecoverGenericProblem, false + return checkAndRecoverGenericProblem, recoverGenericProblemFunc, false case inst.UnreachablePrimary: - return checkAndRecoverGenericProblem, false + return checkAndRecoverGenericProblem, recoverGenericProblemFunc, false case inst.UnreachablePrimaryWithLaggingReplicas: - return checkAndRecoverGenericProblem, false + return checkAndRecoverGenericProblem, recoverGenericProblemFunc, false case inst.AllPrimaryReplicasNotReplicating: - return checkAndRecoverGenericProblem, false + return checkAndRecoverGenericProblem, recoverGenericProblemFunc, false case inst.AllPrimaryReplicasNotReplicatingOrDead: - return 
checkAndRecoverGenericProblem, false - case inst.UnreachableIntermediatePrimaryWithLaggingReplicas: - return checkAndRecoverGenericProblem, false + return checkAndRecoverGenericProblem, recoverGenericProblemFunc, false } // Right now this is mostly causing noise with no clear action. // Will revisit this in the future. // case inst.AllPrimaryReplicasStale: - // return checkAndRecoverGenericProblem, false + // return checkAndRecoverGenericProblem, recoverGenericProblemFunc, false + + return nil, noRecoveryFunc, false +} - return nil, false +// analysisEntriesHaveSameRecovery tells whether the two analysis entries have the same recovery function or not +func analysisEntriesHaveSameRecovery(prevAnalysis, newAnalysis inst.ReplicationAnalysis) bool { + _, prevRecoveryFunctionCode, _ := getCheckAndRecoverFunction(prevAnalysis.Analysis, &prevAnalysis.AnalyzedInstanceKey) + _, newRecoveryFunctionCode, _ := getCheckAndRecoverFunction(newAnalysis.Analysis, &newAnalysis.AnalyzedInstanceKey) + return prevRecoveryFunctionCode == newRecoveryFunctionCode } func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) { @@ -1381,14 +1294,10 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) { case inst.LockedSemiSyncPrimaryHypothesis: go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) go emergentlyRecordStaleBinlogCoordinates(&analysisEntry.AnalyzedInstanceKey, &analysisEntry.AnalyzedInstanceBinlogCoordinates) - case inst.UnreachableIntermediatePrimaryWithLaggingReplicas: - go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) case inst.AllPrimaryReplicasNotReplicating: go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) case inst.AllPrimaryReplicasNotReplicatingOrDead: go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) - case inst.FirstTierReplicaFailingToConnectToPrimary: - go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstancePrimaryKey, analysisEntry.Analysis) } } @@ -1398,7 +1307,7 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand atomic.AddInt64(&countPendingRecoveries, 1) defer atomic.AddInt64(&countPendingRecoveries, -1) - checkAndRecoverFunction, isActionableRecovery := getCheckAndRecoverFunction(analysisEntry.Analysis, &analysisEntry.AnalyzedInstanceKey) + checkAndRecoverFunction, _, isActionableRecovery := getCheckAndRecoverFunction(analysisEntry.Analysis, &analysisEntry.AnalyzedInstanceKey) analysisEntry.IsActionableRecovery = isActionableRecovery runEmergentOperations(&analysisEntry) @@ -1448,11 +1357,37 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses) } + // We lock the shard here and then refresh the tablets information + ctx, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceKey) + if err != nil { + return false, nil, err + } + defer unlock(&err) + + // Check if the recovery is already fixed or not. We need this because vtorc works on ephemeral data to find the failure scenarios. + // That data might be old, because of a cluster operation that was run through vtctld or some other vtorc. So before we do any + // changes, we should be checking that this failure is indeed needed to be fixed. 
We do this after locking the shard to be sure + // that the data that we use now is up-to-date. + if isActionableRecovery { + // TODO (@GuptaManan100): Refresh only the shard tablet information instead of all the tablets + RefreshTablets(true /* forceRefresh */) + alreadyFixed, err := checkIfAlreadyFixed(analysisEntry) + if err != nil { + log.Errorf("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+"skipProcesses: %v: error while trying to find if the problem is already fixed: %v", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses, err) + return false, nil, err + } + if alreadyFixed { + log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis) + return false, nil, nil + } + } + // Actually attempt recovery: if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceKey.StringCode()) { log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, isActionableRecovery, skipProcesses) } - recoveryAttempted, topologyRecovery, err = checkAndRecoverFunction(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + recoveryAttempted, topologyRecovery, err = checkAndRecoverFunction(ctx, analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) if !recoveryAttempted { return recoveryAttempted, topologyRecovery, err } @@ -1483,6 +1418,28 @@ func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, cand return recoveryAttempted, topologyRecovery, err } +// checkIfAlreadyFixed checks whether the problem that the analysis entry represents has already been fixed by another agent or not +func checkIfAlreadyFixed(analysisEntry inst.ReplicationAnalysis) (bool, error) { + // Run a replication analysis again. We will check if the problem persisted. + // TODO (@GuptaManan100): Use specific cluster name to filter the scope of replication analysis. + // Can't do this now since SuggestedClusterAlias, ClusterName, ClusterAlias aren't consistent + // and passing any one causes issues in some failures + analysisEntries, err := inst.GetReplicationAnalysis("", &inst.ReplicationAnalysisHints{}) + if err != nil { + return false, err + } + + for _, entry := range analysisEntries { + // If there is an analysis which requires the same recovery, then we should proceed with the recovery + if entry.AnalyzedInstanceKey.Equals(&analysisEntry.AnalyzedInstanceKey) && analysisEntriesHaveSameRecovery(analysisEntry, entry) { + return false, nil + } + } + + // We didn't find a replication analysis matching the original failure, which means that some other agent probably fixed it. + return true, nil +} + // CheckAndRecover is the main entry point for the recovery mechanism func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedReplicaKey *inst.InstanceKey, err error) { // Allow the analysis to run even if we don't want to recover @@ -1756,42 +1713,7 @@ func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry inst.Re } // electNewPrimary elects a new primary while none were present before. 
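+// It expects the shard lock to already be held: executeCheckAndRecoverFunction now locks the shard and passes its context in.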
-func electNewPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - // We lock the shard here and then refresh the tablets information - ctx, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceKey) - if err != nil { - return false, nil, err - } - defer unlock(&err) - - // TODO (@GuptaManan100): Refresh only the shard tablet information instead of all the tablets - RefreshTablets(true /* forceRefresh */) - - // Run a replication analysis again. We need this because vtorc works on ephemeral data to find the failure scenarios. - // That data might be old, because of a cluster operation that was run through vtctld or some other vtorc. So before we do any - // changes, we should be checking that this failure is indeed needed to be fixed. We do this after locking the shard to be sure - // that the data that we use now is up-to-date. - analysisEntries, err := inst.GetReplicationAnalysis(analysisEntry.ClusterDetails.ClusterName, &inst.ReplicationAnalysisHints{}) - if err != nil { - return false, nil, err - } - - // The recovery is only required if the same instance key requires a ClusterHasNoPrimary recovery. - recoveryRequired := false - for _, entry := range analysisEntries { - if entry.AnalyzedInstanceKey.Equals(&analysisEntry.AnalyzedInstanceKey) { - if entry.Analysis == inst.ClusterHasNoPrimary { - recoveryRequired = true - } - } - } - - // No recovery is required. Some other agent already fixed the issue. - if !recoveryRequired { - log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis) - return false, nil, nil - } - +func electNewPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false /*failIfFailedInstanceInActiveRecovery*/, true /*failIfClusterInActiveRecovery*/) if topologyRecovery == nil || err != nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another electNewPrimary.", analysisEntry.AnalyzedInstanceKey)) @@ -1839,80 +1761,8 @@ func electNewPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKe return true, topologyRecovery, err } -// fixClusterAndPrimary performs a traditional vitess PlannedReparentShard. -func fixClusterAndPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { - // We lock the shard here and then refresh the tablets information - _, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceKey) - if err != nil { - return false, nil, err - } - unlockFunctionCalled := false - defer func() { - if !unlockFunctionCalled { - unlock(&err) - } - }() - - // TODO (@GuptaManan100): Refresh only the shard tablet information instead of all the tablets - RefreshTablets(true /* forceRefresh */) - - // Run a replication analysis again. We need this because vtorc works on ephemeral data to find the failure scenarios. - // That data might be old, because of a cluster operation that was run through vtctld or some other vtorc. 
So before we do any - // changes, we should be checking that this failure is indeed needed to be fixed. We do this after locking the shard to be sure - // that the data that we use now is up-to-date. - analysisEntries, err := inst.GetReplicationAnalysis(analysisEntry.ClusterDetails.ClusterName, &inst.ReplicationAnalysisHints{}) - if err != nil { - return false, nil, err - } - - // The recovery is only required if the same instance key requires a DeadPrimary or DeadPrimaryAndSomeReplicas recovery. - recoveryRequired := false - for _, entry := range analysisEntries { - if entry.AnalyzedInstanceKey.Equals(&analysisEntry.AnalyzedInstanceKey) { - if entry.Analysis == inst.PrimaryHasPrimary { - recoveryRequired = true - } - } - } - - // No recovery is required. Some other agent already fixed the issue. - if !recoveryRequired { - log.Infof("Analysis: %v - No longer valid, some other agent must have fixed the problem.", analysisEntry.Analysis) - return false, nil, nil - } - - topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true) - if topologyRecovery == nil { - AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixClusterAndPrimary.", analysisEntry.AnalyzedInstanceKey)) - return false, nil, err - } - log.Infof("Analysis: %v, will fix incorrect primaryship %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) - - // Reset replication on current primary. This will prevent the co-primary code-path. - // TODO(sougou): this should probably done while holding a lock. - _, err = inst.ResetReplicationOperation(&analysisEntry.AnalyzedInstanceKey) - if err != nil { - return false, topologyRecovery, err - } - - unlockFunctionCalled = true - unlock(&err) - altAnalysis, err := forceAnalysisEntry(analysisEntry.ClusterDetails.ClusterName, inst.DeadPrimary, "", &analysisEntry.AnalyzedInstancePrimaryKey) - if err != nil { - return false, topologyRecovery, err - } - recoveryAttempted, topologyRecovery, err = ForceExecuteRecovery(altAnalysis, &analysisEntry.AnalyzedInstanceKey, false) - if err != nil { - return recoveryAttempted, topologyRecovery, err - } - if _, err := TabletRefresh(analysisEntry.AnalyzedInstanceKey); err != nil { - log.Errore(err) - } - return recoveryAttempted, topologyRecovery, err -} - // fixPrimary sets the primary as read-write. -func fixPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func fixPrimary(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true) if topologyRecovery == nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. 
Will not issue another fixPrimary.", analysisEntry.AnalyzedInstanceKey)) @@ -1920,15 +1770,6 @@ func fixPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *in } log.Infof("Analysis: %v, will fix primary to read-write %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) - _, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceKey) - if err != nil { - log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+ - "skipProcesses: %v: NOT detecting/recovering host, could not obtain shard lock (%v)", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses, err) - return false, topologyRecovery, err - } - defer unlock(&err) - // TODO(sougou): this code pattern has reached DRY limits. Reuse. count := inst.SemiSyncAckers(analysisEntry.AnalyzedInstanceKey) err = inst.SetSemiSyncPrimary(&analysisEntry.AnalyzedInstanceKey, count > 0) @@ -1944,7 +1785,7 @@ func fixPrimary(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *in } // fixReplica sets the replica as read-only and points it at the current primary. -func fixReplica(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { +func fixReplica(ctx context.Context, analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, false, true) if topologyRecovery == nil { AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another fixReplica.", analysisEntry.AnalyzedInstanceKey)) @@ -1952,15 +1793,6 @@ func fixReplica(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *in } log.Infof("Analysis: %v, will fix replica %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) - _, unlock, err := LockShard(context.Background(), analysisEntry.AnalyzedInstanceKey) - if err != nil { - log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+ - "skipProcesses: %v: NOT detecting/recovering host, could not obtain shard lock (%v)", - analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses, err) - return false, topologyRecovery, err - } - defer unlock(&err) - if _, err := inst.SetReadOnly(&analysisEntry.AnalyzedInstanceKey, true); err != nil { return false, topologyRecovery, err } diff --git a/go/vt/orchestrator/logic/topology_recovery_test.go b/go/vt/orchestrator/logic/topology_recovery_test.go new file mode 100644 index 00000000000..c13b14f6190 --- /dev/null +++ b/go/vt/orchestrator/logic/topology_recovery_test.go @@ -0,0 +1,87 @@ +/* +Copyright 2022 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package logic + +import ( + "testing" + "time" + + "github.com/patrickmn/go-cache" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/vt/orchestrator/inst" +) + +func TestAnalysisEntriesHaveSameRecovery(t *testing.T) { + tests := []struct { + prevAnalysisCode inst.AnalysisCode + newAnalysisCode inst.AnalysisCode + shouldBeEqual bool + }{ + { + // DeadPrimary and DeadPrimaryAndSomeReplicas have the same recovery + prevAnalysisCode: inst.DeadPrimary, + newAnalysisCode: inst.DeadPrimaryAndSomeReplicas, + shouldBeEqual: true, + }, { + // same codes will always have same recovery + prevAnalysisCode: inst.DeadPrimary, + newAnalysisCode: inst.DeadPrimary, + shouldBeEqual: true, + }, { + prevAnalysisCode: inst.PrimaryHasPrimary, + newAnalysisCode: inst.DeadPrimaryAndSomeReplicas, + shouldBeEqual: false, + }, { + prevAnalysisCode: inst.DeadPrimary, + newAnalysisCode: inst.PrimaryHasPrimary, + shouldBeEqual: false, + }, { + prevAnalysisCode: inst.LockedSemiSyncPrimary, + newAnalysisCode: inst.PrimaryHasPrimary, + shouldBeEqual: false, + }, { + prevAnalysisCode: inst.PrimaryIsReadOnly, + newAnalysisCode: inst.PrimarySemiSyncMustNotBeSet, + shouldBeEqual: true, + }, { + prevAnalysisCode: inst.PrimarySemiSyncMustBeSet, + newAnalysisCode: inst.PrimarySemiSyncMustNotBeSet, + shouldBeEqual: true, + }, { + prevAnalysisCode: inst.PrimaryIsReadOnly, + newAnalysisCode: inst.DeadPrimary, + shouldBeEqual: false, + }, { + prevAnalysisCode: inst.NotConnectedToPrimary, + newAnalysisCode: inst.ConnectedToWrongPrimary, + shouldBeEqual: true, + }, { + prevAnalysisCode: inst.ConnectedToWrongPrimary, + newAnalysisCode: inst.ReplicaIsWritable, + shouldBeEqual: true, + }, + } + emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500) + t.Parallel() + for _, tt := range tests { + t.Run(string(tt.prevAnalysisCode)+","+string(tt.newAnalysisCode), func(t *testing.T) { + res := analysisEntriesHaveSameRecovery(inst.ReplicationAnalysis{Analysis: tt.prevAnalysisCode}, inst.ReplicationAnalysis{Analysis: tt.newAnalysisCode}) + require.Equal(t, tt.shouldBeEqual, res) + }) + } +} diff --git a/go/vt/orchestrator/test/db.go b/go/vt/orchestrator/test/db.go new file mode 100644 index 00000000000..ba22e9b0d1f --- /dev/null +++ b/go/vt/orchestrator/test/db.go @@ -0,0 +1,59 @@ +/* +Copyright 2021 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/go/vt/orchestrator/test/db.go b/go/vt/orchestrator/test/db.go
new file mode 100644
index 00000000000..ba22e9b0d1f
--- /dev/null
+++ b/go/vt/orchestrator/test/db.go
@@ -0,0 +1,59 @@
+/*
+Copyright 2021 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package test
+
+import (
+	"fmt"
+
+	"vitess.io/vitess/go/vt/orchestrator/db"
+	"vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils"
+)
+
+var _ db.DB = (*DB)(nil)
+
+type DB struct {
+	rowMaps [][]sqlutils.RowMap
+}
+
+func NewTestDB(rowMaps [][]sqlutils.RowMap) *DB {
+	return &DB{
+		rowMaps: rowMaps,
+	}
+}
+
+func (t *DB) QueryOrchestrator(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error {
+	rowMaps, err := t.getRowMapsForQuery()
+	if err != nil {
+		return err
+	}
+	for _, rowMap := range rowMaps {
+		err = onRow(rowMap)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (t *DB) getRowMapsForQuery() ([]sqlutils.RowMap, error) {
+	if len(t.rowMaps) == 0 {
+		return nil, fmt.Errorf("no rows left to return. We received more queries than expected")
+	}
+	result := t.rowMaps[0]
+	t.rowMaps = t.rowMaps[1:]
+	return result, nil
+}
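GetReplicationAnalysis now reads rows through the db.Db interface, so a test can swap this fake in for the package-level variable. A minimal sketch of that swap, assuming the test restores the original afterwards (the helper name is hypothetical):

package logic

import (
	"vitess.io/vitess/go/vt/orchestrator/db"
	"vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils"
	"vitess.io/vitess/go/vt/orchestrator/test"
)

// useFakeDB swaps the global DB for a fake that serves one batch of rows per
// query, and returns a restore func suitable for defer.
func useFakeDB(batches [][]sqlutils.RowMap) func() {
	oldDB := db.Db
	db.Db = test.NewTestDB(batches)
	return func() { db.Db = oldDB }
}

In a test this reads defer useFakeDB(batches)(), with one batch queued per query the code under test is expected to issue.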
diff --git a/go/vt/orchestrator/test/recovery_analysis.go b/go/vt/orchestrator/test/recovery_analysis.go
new file mode 100644
index 00000000000..7feea9d6251
--- /dev/null
+++ b/go/vt/orchestrator/test/recovery_analysis.go
@@ -0,0 +1,165 @@
+/*
+Copyright 2021 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package test
+
+import (
+	"fmt"
+	"time"
+
+	"google.golang.org/protobuf/encoding/prototext"
+
+	"vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils"
+
+	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
+)
+
+type InfoForRecoveryAnalysis struct {
+	TabletInfo *topodatapb.Tablet
+	PrimaryTabletInfo *topodatapb.Tablet
+	PrimaryTimestamp *time.Time
+	IsPrimary int
+	IsCoPrimary int
+	Hostname string
+	Port int
+	SourceHost string
+	SourcePort int
+	DataCenter string
+	Region string
+	PhysicalEnvironment string
+	LogFile string
+	LogPos int64
+	IsStaleBinlogCoordinates int
+	SuggestedClusterAlias string
+	ClusterName string
+	ClusterAlias string
+	ClusterDomain string
+	GTIDMode string
+	LastCheckValid int
+	LastCheckPartialSuccess int
+	CountReplicas uint
+	CountValidReplicas uint
+	CountValidReplicatingReplicas uint
+	CountReplicasFailingToConnectToPrimary uint
+	CountDowntimedReplicas uint
+	ReplicationDepth uint
+	ReplicaHosts string
+	IsFailingToConnectToPrimary int
+	ReplicationStopped int
+	IsDowntimed int
+	DowntimeEndTimestamp string
+	DowntimeRemainingSeconds int
+	IsBinlogServer int
+	CountValidOracleGTIDReplicas uint
+	CountValidMariaDBGTIDReplicas uint
+	CountValidBinlogServerReplicas uint
+	SemiSyncPrimaryEnabled int
+	SemiSyncPrimaryStatus int
+	SemiSyncPrimaryWaitForReplicaCount uint
+	SemiSyncPrimaryClients uint
+	SemiSyncReplicaEnabled int
+	CountSemiSyncReplicasEnabled uint
+	CountLoggingReplicas uint
+	CountStatementBasedLoggingReplicas uint
+	CountMixedBasedLoggingReplicas uint
+	CountRowBasedLoggingReplicas uint
+	CountDistinctMajorVersionsLoggingReplicas uint
+	CountDelayedReplicas uint
+	CountLaggingReplicas uint
+	MinReplicaGTIDMode string
+	MaxReplicaGTIDMode string
+	MaxReplicaGTIDErrant string
+	ReadOnly uint
+}
+
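+// ConvertToRowMap renders the struct as the single result row that
+// GetReplicationAnalysis would otherwise read from the backend, keyed by the
+// column names of the analysis query.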
rowMap["is_co_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsCoPrimary), Valid: true} + rowMap["is_downtimed"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsDowntimed), Valid: true} + rowMap["is_failing_to_connect_to_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsFailingToConnectToPrimary), Valid: true} + rowMap["is_last_check_valid"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.LastCheckValid), Valid: true} + rowMap["is_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsPrimary), Valid: true} + rowMap["is_stale_binlog_coordinates"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsStaleBinlogCoordinates), Valid: true} + rowMap["last_check_partial_success"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.LastCheckPartialSuccess), Valid: true} + rowMap["max_replica_gtid_errant"] = sqlutils.CellData{String: info.MaxReplicaGTIDErrant, Valid: true} + rowMap["max_replica_gtid_mode"] = sqlutils.CellData{String: info.MaxReplicaGTIDMode, Valid: true} + rowMap["min_replica_gtid_mode"] = sqlutils.CellData{String: info.MinReplicaGTIDMode, Valid: true} + rowMap["physical_environment"] = sqlutils.CellData{String: info.PhysicalEnvironment, Valid: true} + rowMap["port"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.Port), Valid: true} + if info.PrimaryTabletInfo == nil { + rowMap["primary_tablet_info"] = sqlutils.CellData{Valid: false} + } else { + res, _ := prototext.Marshal(info.PrimaryTabletInfo) + rowMap["primary_tablet_info"] = sqlutils.CellData{String: string(res), Valid: true} + } + rowMap["primary_timestamp"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.PrimaryTimestamp), Valid: true} + rowMap["read_only"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.ReadOnly), Valid: true} + rowMap["region"] = sqlutils.CellData{String: info.Region, Valid: true} + rowMap["replica_hosts"] = sqlutils.CellData{String: info.ReplicaHosts, Valid: true} + rowMap["replication_depth"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.ReplicationDepth), Valid: true} + rowMap["replication_stopped"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.ReplicationStopped), Valid: true} + rowMap["semi_sync_primary_clients"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncPrimaryClients), Valid: true} + rowMap["semi_sync_primary_enabled"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncPrimaryEnabled), Valid: true} + rowMap["semi_sync_primary_status"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncPrimaryStatus), Valid: true} + rowMap["semi_sync_primary_wait_for_replica_count"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncPrimaryWaitForReplicaCount), Valid: true} + rowMap["semi_sync_replica_enabled"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncReplicaEnabled), Valid: true} + rowMap["source_host"] = sqlutils.CellData{String: info.SourceHost, Valid: true} + rowMap["source_port"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SourcePort), Valid: true} + rowMap["suggested_cluster_alias"] = sqlutils.CellData{String: info.SuggestedClusterAlias, Valid: true} + res, _ := prototext.Marshal(info.TabletInfo) + rowMap["tablet_info"] = sqlutils.CellData{String: string(res), Valid: true} + return rowMap +} + +func (info *InfoForRecoveryAnalysis) SetValuesFromTabletInfo() { + info.Hostname = info.TabletInfo.MysqlHostname + info.Port = int(info.TabletInfo.MysqlPort) + info.DataCenter = info.TabletInfo.Alias.Cell + info.ClusterName = fmt.Sprintf("%v:%d", info.TabletInfo.MysqlHostname, 
+func (info *InfoForRecoveryAnalysis) SetValuesFromTabletInfo() {
+	info.Hostname = info.TabletInfo.MysqlHostname
+	info.Port = int(info.TabletInfo.MysqlPort)
+	info.DataCenter = info.TabletInfo.Alias.Cell
+	info.ClusterName = fmt.Sprintf("%v:%d", info.TabletInfo.MysqlHostname, info.TabletInfo.MysqlPort)
+	info.ClusterAlias = fmt.Sprintf("%v:%d", info.TabletInfo.MysqlHostname, info.TabletInfo.MysqlPort)
+	info.ClusterDomain = fmt.Sprintf("%v:%d", info.TabletInfo.MysqlHostname, info.TabletInfo.MysqlPort)
+}
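Put together, these helpers make an analysis test possible without a live backend: populate InfoForRecoveryAnalysis, convert it to a row, queue the row in the fake DB, and run the analysis. A hedged end-to-end sketch (field values are illustrative, and it assumes GetReplicationAnalysis returns the slice of analyses plus an error):

package logic

import (
	"testing"

	"github.com/stretchr/testify/require"

	"vitess.io/vitess/go/vt/orchestrator/db"
	"vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils"
	"vitess.io/vitess/go/vt/orchestrator/inst"
	"vitess.io/vitess/go/vt/orchestrator/test"
	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

func TestGetReplicationAnalysisSketch(t *testing.T) {
	info := test.InfoForRecoveryAnalysis{
		TabletInfo: &topodatapb.Tablet{
			Alias:         &topodatapb.TabletAlias{Cell: "zone1", Uid: 100},
			MysqlHostname: "localhost",
			MysqlPort:     6709,
			Type:          topodatapb.TabletType_PRIMARY,
		},
		IsPrimary:      1,
		LastCheckValid: 1,
	}
	info.SetValuesFromTabletInfo()

	// One batch with one row: the fake DB answers the single analysis query.
	db.Db = test.NewTestDB([][]sqlutils.RowMap{{info.ConvertToRowMap()}})

	analyses, err := inst.GetReplicationAnalysis("", &inst.ReplicationAnalysisHints{})
	require.NoError(t, err)
	// Assertions on the resulting analysis codes would follow here.
	_ = analyses
}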