Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VTOrc: Refactor and reload of ephemeral information for remaining recovery functions #10150

Merged
merged 9 commits into from
May 4, 2022
14 changes: 14 additions & 0 deletions go/vt/orchestrator/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,25 @@ import (

var (
// EmptyArgs is an argument slice left at its zero value (nil).
// NOTE(review): presumably passed to queries that take no arguments — confirm.
EmptyArgs []any
// Db is the package-level DB handle, initialised to a typed nil *vtorcDB.
// The nil receiver is usable because (*vtorcDB).QueryOrchestrator never
// dereferences it and only forwards to the package-level function.
// NOTE(review): presumably a variable so tests can substitute another DB
// implementation — confirm.
Db DB = (*vtorcDB)(nil)
)

// mysqlURI is package-level string state.
// NOTE(review): presumably the backend connection URI; its assignment is not
// visible here — confirm.
var mysqlURI string
// dbMutex is a package-level mutex.
// NOTE(review): presumably guards database set-up state — confirm against its
// lock sites.
var dbMutex sync.Mutex

// DB is a single-method interface whose method has the same signature as the
// package-level QueryOrchestrator function, so callers can go through an
// interface value (the Db variable) instead of calling the function directly.
type DB interface {
// QueryOrchestrator takes a query string, its arguments and a row callback,
// and returns an error.
// NOTE(review): presumably onRow is invoked once per result row — confirm
// against the package-level QueryOrchestrator.
QueryOrchestrator(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error
}

// vtorcDB is a field-less DB implementation; its only method forwards to the
// package-level QueryOrchestrator function.
type vtorcDB struct {
}

// Compile-time check that *vtorcDB satisfies DB.
var _ DB = (*vtorcDB)(nil)

// QueryOrchestrator implements DB by handing its arguments, unchanged, to the
// package-level QueryOrchestrator function and returning that function's error.
// The receiver is never read, so the method is safe to call on a nil *vtorcDB.
func (*vtorcDB) QueryOrchestrator(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error {
	return QueryOrchestrator(query, argsArray, onRow)
}

// DummySQLResult is an empty struct type.
// NOTE(review): presumably a placeholder result for calls that have no real
// SQL result; its methods are outside this excerpt — confirm.
type DummySQLResult struct {
}

Expand Down
68 changes: 27 additions & 41 deletions go/vt/orchestrator/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,47 +30,33 @@ type AnalysisCode string
// StructureAnalysisCode is a distinct string-based code type.
// NOTE(review): its constants are not visible in this excerpt; presumably they
// label structure-level findings — confirm against the const block that follows.
type StructureAnalysisCode string

// AnalysisCode values; GetReplicationAnalysis stores one of these in
// ReplicationAnalysis.Analysis, starting from NoProblem.
// Each constant's string value equals its identifier, with one exception:
// GraceFulPrimaryTakeover carries the value "GracefulPrimaryTakeover"
// (lower-case "f" in the string).
const (
NoProblem AnalysisCode = "NoProblem"
ClusterHasNoPrimary AnalysisCode = "ClusterHasNoPrimary"
DeadPrimaryWithoutReplicas AnalysisCode = "DeadPrimaryWithoutReplicas"
DeadPrimary AnalysisCode = "DeadPrimary"
DeadPrimaryAndReplicas AnalysisCode = "DeadPrimaryAndReplicas"
DeadPrimaryAndSomeReplicas AnalysisCode = "DeadPrimaryAndSomeReplicas"
PrimaryHasPrimary AnalysisCode = "PrimaryHasPrimary"
PrimaryIsReadOnly AnalysisCode = "PrimaryIsReadOnly"
PrimarySemiSyncMustBeSet AnalysisCode = "PrimarySemiSyncMustBeSet"
PrimarySemiSyncMustNotBeSet AnalysisCode = "PrimarySemiSyncMustNotBeSet"
ReplicaIsWritable AnalysisCode = "ReplicaIsWritable"
NotConnectedToPrimary AnalysisCode = "NotConnectedToPrimary"
ConnectedToWrongPrimary AnalysisCode = "ConnectedToWrongPrimary"
ReplicationStopped AnalysisCode = "ReplicationStopped"
ReplicaSemiSyncMustBeSet AnalysisCode = "ReplicaSemiSyncMustBeSet"
ReplicaSemiSyncMustNotBeSet AnalysisCode = "ReplicaSemiSyncMustNotBeSet"
UnreachablePrimaryWithLaggingReplicas AnalysisCode = "UnreachablePrimaryWithLaggingReplicas"
UnreachablePrimary AnalysisCode = "UnreachablePrimary"
PrimarySingleReplicaNotReplicating AnalysisCode = "PrimarySingleReplicaNotReplicating"
PrimarySingleReplicaDead AnalysisCode = "PrimarySingleReplicaDead"
AllPrimaryReplicasNotReplicating AnalysisCode = "AllPrimaryReplicasNotReplicating"
AllPrimaryReplicasNotReplicatingOrDead AnalysisCode = "AllPrimaryReplicasNotReplicatingOrDead"
LockedSemiSyncPrimaryHypothesis AnalysisCode = "LockedSemiSyncPrimaryHypothesis"
LockedSemiSyncPrimary AnalysisCode = "LockedSemiSyncPrimary"
PrimaryWithoutReplicas AnalysisCode = "PrimaryWithoutReplicas"
DeadCoPrimary AnalysisCode = "DeadCoPrimary"
DeadCoPrimaryAndSomeReplicas AnalysisCode = "DeadCoPrimaryAndSomeReplicas"
UnreachableCoPrimary AnalysisCode = "UnreachableCoPrimary"
AllCoPrimaryReplicasNotReplicating AnalysisCode = "AllCoPrimaryReplicasNotReplicating"
DeadIntermediatePrimary AnalysisCode = "DeadIntermediatePrimary"
DeadIntermediatePrimaryWithSingleReplica AnalysisCode = "DeadIntermediatePrimaryWithSingleReplica"
DeadIntermediatePrimaryWithSingleReplicaFailingToConnect AnalysisCode = "DeadIntermediatePrimaryWithSingleReplicaFailingToConnect"
DeadIntermediatePrimaryAndSomeReplicas AnalysisCode = "DeadIntermediatePrimaryAndSomeReplicas"
DeadIntermediatePrimaryAndReplicas AnalysisCode = "DeadIntermediatePrimaryAndReplicas"
UnreachableIntermediatePrimaryWithLaggingReplicas AnalysisCode = "UnreachableIntermediatePrimaryWithLaggingReplicas"
UnreachableIntermediatePrimary AnalysisCode = "UnreachableIntermediatePrimary"
AllIntermediatePrimaryReplicasFailingToConnectOrDead AnalysisCode = "AllIntermediatePrimaryReplicasFailingToConnectOrDead"
AllIntermediatePrimaryReplicasNotReplicating AnalysisCode = "AllIntermediatePrimaryReplicasNotReplicating"
FirstTierReplicaFailingToConnectToPrimary AnalysisCode = "FirstTierReplicaFailingToConnectToPrimary"
BinlogServerFailingToConnectToPrimary AnalysisCode = "BinlogServerFailingToConnectToPrimary"
GraceFulPrimaryTakeover AnalysisCode = "GracefulPrimaryTakeover"
NoProblem AnalysisCode = "NoProblem"
ClusterHasNoPrimary AnalysisCode = "ClusterHasNoPrimary"
DeadPrimaryWithoutReplicas AnalysisCode = "DeadPrimaryWithoutReplicas"
DeadPrimary AnalysisCode = "DeadPrimary"
DeadPrimaryAndReplicas AnalysisCode = "DeadPrimaryAndReplicas"
DeadPrimaryAndSomeReplicas AnalysisCode = "DeadPrimaryAndSomeReplicas"
PrimaryHasPrimary AnalysisCode = "PrimaryHasPrimary"
PrimaryIsReadOnly AnalysisCode = "PrimaryIsReadOnly"
PrimarySemiSyncMustBeSet AnalysisCode = "PrimarySemiSyncMustBeSet"
PrimarySemiSyncMustNotBeSet AnalysisCode = "PrimarySemiSyncMustNotBeSet"
ReplicaIsWritable AnalysisCode = "ReplicaIsWritable"
NotConnectedToPrimary AnalysisCode = "NotConnectedToPrimary"
ConnectedToWrongPrimary AnalysisCode = "ConnectedToWrongPrimary"
ReplicationStopped AnalysisCode = "ReplicationStopped"
ReplicaSemiSyncMustBeSet AnalysisCode = "ReplicaSemiSyncMustBeSet"
ReplicaSemiSyncMustNotBeSet AnalysisCode = "ReplicaSemiSyncMustNotBeSet"
UnreachablePrimaryWithLaggingReplicas AnalysisCode = "UnreachablePrimaryWithLaggingReplicas"
UnreachablePrimary AnalysisCode = "UnreachablePrimary"
PrimarySingleReplicaNotReplicating AnalysisCode = "PrimarySingleReplicaNotReplicating"
PrimarySingleReplicaDead AnalysisCode = "PrimarySingleReplicaDead"
AllPrimaryReplicasNotReplicating AnalysisCode = "AllPrimaryReplicasNotReplicating"
AllPrimaryReplicasNotReplicatingOrDead AnalysisCode = "AllPrimaryReplicasNotReplicatingOrDead"
LockedSemiSyncPrimaryHypothesis AnalysisCode = "LockedSemiSyncPrimaryHypothesis"
LockedSemiSyncPrimary AnalysisCode = "LockedSemiSyncPrimary"
PrimaryWithoutReplicas AnalysisCode = "PrimaryWithoutReplicas"
BinlogServerFailingToConnectToPrimary AnalysisCode = "BinlogServerFailingToConnectToPrimary"
GraceFulPrimaryTakeover AnalysisCode = "GracefulPrimaryTakeover"
)

const (
Expand Down
72 changes: 2 additions & 70 deletions go/vt/orchestrator/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
`

clusters := make(map[string]*clusterAnalysis)
err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
err := db.Db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
a := ReplicationAnalysis{
Analysis: NoProblem,
ProcessingNodeHostname: process.ThisHostname,
Expand Down Expand Up @@ -579,71 +579,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.Analysis = AllPrimaryReplicasNotReplicatingOrDead
a.Description = "Primary is reachable but none of its replicas is replicating"
//
} else /* co-primary */ if a.IsCoPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadCoPrimary
a.Description = "Co-primary cannot be reached by orchestrator and none of its replicas is replicating"
//
} else if a.IsCoPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadCoPrimaryAndSomeReplicas
a.Description = "Co-primary cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating"
//
} else if a.IsCoPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 {
a.Analysis = UnreachableCoPrimary
a.Description = "Co-primary cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue"
//
} else if a.IsCoPrimary && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
a.Analysis = AllCoPrimaryReplicasNotReplicating
a.Description = "Co-primary is reachable but none of its replicas is replicating"
//
} else /* intermediate-primary */ if !a.IsPrimary && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountReplicasFailingToConnectToPrimary == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadIntermediatePrimaryWithSingleReplicaFailingToConnect
a.Description = "Intermediate primary cannot be reached by orchestrator and its (single) replica is failing to connect"
//
} else if !a.IsPrimary && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadIntermediatePrimaryWithSingleReplica
a.Description = "Intermediate primary cannot be reached by orchestrator and its (single) replica is not replicating"
//
} else if !a.IsPrimary && !a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadIntermediatePrimary
a.Description = "Intermediate primary cannot be reached by orchestrator and none of its replicas is replicating"
//
} else if !a.IsPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadIntermediatePrimaryAndSomeReplicas
a.Description = "Intermediate primary cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating"
//
} else if !a.IsPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 {
a.Analysis = DeadIntermediatePrimaryAndReplicas
a.Description = "Intermediate primary cannot be reached by orchestrator and all of its replicas are unreachable"
//
} else if !a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 {
a.Analysis = UnreachableIntermediatePrimaryWithLaggingReplicas
a.Description = "Intermediate primary cannot be reached by orchestrator and all of its replicas are lagging"
//
} else if !a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 {
a.Analysis = UnreachableIntermediatePrimary
a.Description = "Intermediate primary cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue"
//
} else if !a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicatingReplicas == 0 &&
a.CountReplicasFailingToConnectToPrimary > 0 && a.CountReplicasFailingToConnectToPrimary == a.CountValidReplicas {
// All replicas are either failing to connect to primary (and at least one of these have to exist)
// or completely dead.
// Must have at least two replicas to reach such conclusion -- do note that the intermediate primary is still
// reachable to orchestrator, so we base our conclusion on replicas only at this point.
a.Analysis = AllIntermediatePrimaryReplicasFailingToConnectOrDead
a.Description = "Intermediate primary is reachable but all of its replicas are failing to connect"
//
} else if !a.IsPrimary && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
a.Analysis = AllIntermediatePrimaryReplicasNotReplicating
a.Description = "Intermediate primary is reachable but none of its replicas is replicating"
//
} else if a.IsBinlogServer && a.IsFailingToConnectToPrimary {
a.Analysis = BinlogServerFailingToConnectToPrimary
a.Description = "Binlog server is unable to connect to its primary"
//
} else if a.ReplicationDepth == 1 && a.IsFailingToConnectToPrimary {
a.Analysis = FirstTierReplicaFailingToConnectToPrimary
a.Description = "1st tier replica (directly replicating from topology primary) is unable to connect to the primary"
//
}
// else if a.IsPrimary && a.CountReplicas == 0 {
// a.Analysis = PrimaryWithoutReplicas
Expand All @@ -666,14 +605,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
switch a.Analysis {
case AllPrimaryReplicasNotReplicating,
AllPrimaryReplicasNotReplicatingOrDead,
PrimarySingleReplicaDead,
AllCoPrimaryReplicasNotReplicating,
DeadIntermediatePrimaryWithSingleReplica,
DeadIntermediatePrimaryWithSingleReplicaFailingToConnect,
DeadIntermediatePrimaryAndReplicas,
DeadIntermediatePrimaryAndSomeReplicas,
AllIntermediatePrimaryReplicasFailingToConnectOrDead,
AllIntermediatePrimaryReplicasNotReplicating:
PrimarySingleReplicaDead:
a.IsReplicasDowntimed = true
a.SkippableDueToDowntime = true
}
Expand Down
Loading