From 37ed0c929d0c70b3be7bb9e3243b8ba1957cef89 Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Wed, 13 Sep 2023 11:13:11 +0100 Subject: [PATCH 01/14] vtorc/inst: add AnalysisCodes() Signed-off-by: Amir Abushareb --- go/vt/vtorc/inst/analysis.go | 35 ++++++++++++++++++++++++++++++- go/vt/vtorc/inst/analysis_test.go | 6 ++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/go/vt/vtorc/inst/analysis.go b/go/vt/vtorc/inst/analysis.go index 8707e6ba828..9519d6a3e57 100644 --- a/go/vt/vtorc/inst/analysis.go +++ b/go/vt/vtorc/inst/analysis.go @@ -25,7 +25,6 @@ import ( ) type AnalysisCode string -type StructureAnalysisCode string const ( NoProblem AnalysisCode = "NoProblem" @@ -61,6 +60,40 @@ const ( ErrantGTIDDetected AnalysisCode = "ErrantGTIDDetected" ) +// AnalysisCodes returns all analysis codes except for `NoProblem`. +func AnalysisCodes() []string { + return []string{ + string(ClusterHasNoPrimary), + string(DeadPrimaryWithoutReplicas), + string(DeadPrimary), + string(DeadPrimaryAndReplicas), + string(DeadPrimaryAndSomeReplicas), + string(PrimaryHasPrimary), + string(PrimaryIsReadOnly), + string(PrimarySemiSyncMustBeSet), + string(PrimarySemiSyncMustNotBeSet), + string(ReplicaIsWritable), + string(NotConnectedToPrimary), + string(ConnectedToWrongPrimary), + string(ReplicationStopped), + string(ReplicaSemiSyncMustBeSet), + string(ReplicaSemiSyncMustNotBeSet), + string(UnreachablePrimaryWithLaggingReplicas), + string(UnreachablePrimary), + string(PrimarySingleReplicaNotReplicating), + string(PrimarySingleReplicaDead), + string(AllPrimaryReplicasNotReplicating), + string(AllPrimaryReplicasNotReplicatingOrDead), + string(LockedSemiSyncPrimaryHypothesis), + string(LockedSemiSyncPrimary), + string(PrimaryWithoutReplicas), + string(BinlogServerFailingToConnectToPrimary), + string(GraceFulPrimaryTakeover), + } +} + +type StructureAnalysisCode string + const ( StatementAndMixedLoggingReplicasStructureWarning StructureAnalysisCode = 
"StatementAndMixedLoggingReplicasStructureWarning" StatementAndRowLoggingReplicasStructureWarning StructureAnalysisCode = "StatementAndRowLoggingReplicasStructureWarning" diff --git a/go/vt/vtorc/inst/analysis_test.go b/go/vt/vtorc/inst/analysis_test.go index 70849379a5e..866352bec7d 100644 --- a/go/vt/vtorc/inst/analysis_test.go +++ b/go/vt/vtorc/inst/analysis_test.go @@ -17,6 +17,7 @@ package inst import ( + "slices" "testing" "vitess.io/vitess/go/vt/vtorc/config" @@ -46,3 +47,8 @@ func TestGetAnalysisInstanceType(t *testing.T) { require.Equal(t, string(analysis.GetAnalysisInstanceType()), "co-primary") } } + +func TestAnalysisCodes(t *testing.T) { + require.False(t, slices.Contains(AnalysisCodes(), string(NoProblem))) + require.Equal(t, 26, len(AnalysisCodes())) +} From b7cf0145b70c309b6d07c0b618b4bbbe957430ae Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Wed, 13 Sep 2023 11:14:08 +0100 Subject: [PATCH 02/14] vtorc/logic: add detected_problems counter Signed-off-by: Amir Abushareb --- go/vt/vtorc/logic/topology_recovery.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 8bd6da048d7..545749c5cd8 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -63,6 +63,9 @@ var ( countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries") + // detectedProblemsCounter is used to count the number of detected problems. + detectedProblemsCounter = stats.NewCountersWithSingleLabel("DetectedProblems", "Count of the different detected problems", "AnalysisCode", inst.AnalysisCodes()...) + // recoveriesCounter counts the number of recoveries that VTOrc has performed recoveriesCounter = stats.NewCountersWithSingleLabel("RecoveriesCount", "Count of the different recoveries performed", "RecoveryType", actionableRecoveriesNames...) 
@@ -578,6 +581,13 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er countPendingRecoveries.Add(1) defer countPendingRecoveries.Add(-1) + // Regardless of whether there's an action to take, we have to increment + // the problems counter with the analysis entry unless it indicates there's + // no problem. + if code := analysisEntry.Analysis; code != inst.NoProblem { + detectedProblemsCounter.Add(string(code), 1) + } + checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode) analysisEntry.IsActionableRecovery = isActionableRecovery From f1b0c4abc4ca4d95e224250f771b64db97ade547 Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Thu, 14 Sep 2023 11:12:17 +0100 Subject: [PATCH 03/14] vtorc/logic: add tablet_alias, keyspace and shard to counter Signed-off-by: Amir Abushareb --- go/vt/vtorc/logic/topology_recovery.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 545749c5cd8..98f802d2151 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -64,7 +64,12 @@ var ( countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries") // detectedProblemsCounter is used to count the number of detected problems. - detectedProblemsCounter = stats.NewCountersWithSingleLabel("DetectedProblems", "Count of the different detected problems", "AnalysisCode", inst.AnalysisCodes()...) 
+ detectedProblemsCounter = stats.NewCountersWithMultiLabels("DetectedProblems", "Count of the different detected problems", []string{ + "AnalysisCode", + "TabletAlias", + "Keyspace", + "Shard", + }) // recoveriesCounter counts the number of recoveries that VTOrc has performed recoveriesCounter = stats.NewCountersWithSingleLabel("RecoveriesCount", "Count of the different recoveries performed", "RecoveryType", actionableRecoveriesNames...) @@ -585,7 +590,12 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er // the problems counter with the analysis entry unless it indicates there's // no problem. if code := analysisEntry.Analysis; code != inst.NoProblem { - detectedProblemsCounter.Add(string(code), 1) + detectedProblemsCounter.Add([]string{ + string(code), + analysisEntry.AnalyzedInstanceAlias, + analysisEntry.AnalyzedKeyspace, + analysisEntry.AnalyzedShard, + }, 1) } checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) From ea7ea6889e008ef307f26372a41b458369c1b663 Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Tue, 19 Sep 2023 09:27:26 +0100 Subject: [PATCH 04/14] vtorc/logic: AnalysisCode -> Analysis Signed-off-by: Amir Abushareb --- go/vt/vtorc/logic/topology_recovery.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 98f802d2151..9d96fdee5f2 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -65,7 +65,7 @@ var ( // detectedProblemsCounter is used to count the number of detected problems. 
detectedProblemsCounter = stats.NewCountersWithMultiLabels("DetectedProblems", "Count of the different detected problems", []string{ - "AnalysisCode", + "Analysis", "TabletAlias", "Keyspace", "Shard", From cf866ba31dbf9588f3db77e30721557002faf6de Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Tue, 19 Sep 2023 09:29:44 +0100 Subject: [PATCH 05/14] vtorc/inst: remove unused AnalysisCodes() Signed-off-by: Amir Abushareb --- go/vt/vtorc/inst/analysis.go | 32 ------------------------------- go/vt/vtorc/inst/analysis_test.go | 6 ------ 2 files changed, 38 deletions(-) diff --git a/go/vt/vtorc/inst/analysis.go b/go/vt/vtorc/inst/analysis.go index 9519d6a3e57..54500621cb9 100644 --- a/go/vt/vtorc/inst/analysis.go +++ b/go/vt/vtorc/inst/analysis.go @@ -60,38 +60,6 @@ const ( ErrantGTIDDetected AnalysisCode = "ErrantGTIDDetected" ) -// AnalysisCodes returns all analysis codes except for `NoProblem`. -func AnalysisCodes() []string { - return []string{ - string(ClusterHasNoPrimary), - string(DeadPrimaryWithoutReplicas), - string(DeadPrimary), - string(DeadPrimaryAndReplicas), - string(DeadPrimaryAndSomeReplicas), - string(PrimaryHasPrimary), - string(PrimaryIsReadOnly), - string(PrimarySemiSyncMustBeSet), - string(PrimarySemiSyncMustNotBeSet), - string(ReplicaIsWritable), - string(NotConnectedToPrimary), - string(ConnectedToWrongPrimary), - string(ReplicationStopped), - string(ReplicaSemiSyncMustBeSet), - string(ReplicaSemiSyncMustNotBeSet), - string(UnreachablePrimaryWithLaggingReplicas), - string(UnreachablePrimary), - string(PrimarySingleReplicaNotReplicating), - string(PrimarySingleReplicaDead), - string(AllPrimaryReplicasNotReplicating), - string(AllPrimaryReplicasNotReplicatingOrDead), - string(LockedSemiSyncPrimaryHypothesis), - string(LockedSemiSyncPrimary), - string(PrimaryWithoutReplicas), - string(BinlogServerFailingToConnectToPrimary), - string(GraceFulPrimaryTakeover), - } -} - type StructureAnalysisCode string const ( diff --git 
a/go/vt/vtorc/inst/analysis_test.go b/go/vt/vtorc/inst/analysis_test.go index 866352bec7d..70849379a5e 100644 --- a/go/vt/vtorc/inst/analysis_test.go +++ b/go/vt/vtorc/inst/analysis_test.go @@ -17,7 +17,6 @@ package inst import ( - "slices" "testing" "vitess.io/vitess/go/vt/vtorc/config" @@ -47,8 +46,3 @@ func TestGetAnalysisInstanceType(t *testing.T) { require.Equal(t, string(analysis.GetAnalysisInstanceType()), "co-primary") } } - -func TestAnalysisCodes(t *testing.T) { - require.False(t, slices.Contains(AnalysisCodes(), string(NoProblem))) - require.Equal(t, 26, len(AnalysisCodes())) -} From 82b4a75260b631d97faa80f7fbca2b84ca8580ed Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Wed, 20 Sep 2023 15:42:33 +0100 Subject: [PATCH 06/14] vtorc/logic: fix err scoping Signed-off-by: Amir Abushareb --- go/vt/vtorc/logic/topology_recovery.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 9d96fdee5f2..9b43ff3a9a8 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -775,17 +775,16 @@ func CheckAndRecover() { log.Error(err) return } + // intentionally iterating entries in random order for _, j := range rand.Perm(len(replicationAnalysis)) { analysisEntry := replicationAnalysis[j] go func() { - err = executeCheckAndRecoverFunction(analysisEntry) - if err != nil { + if err := executeCheckAndRecoverFunction(analysisEntry); err != nil { log.Error(err) } }() - } } From bba7fd100051579f6449ba057a7cbe7a198e2742 Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Wed, 20 Sep 2023 15:53:39 +0100 Subject: [PATCH 07/14] stats: add GetLabelName() Signed-off-by: Amir Abushareb --- go/stats/counters.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/go/stats/counters.go b/go/stats/counters.go index a4dfc0dcb1f..41f04ab4146 100644 --- a/go/stats/counters.go +++ b/go/stats/counters.go @@ -183,6 +183,11 @@ func 
NewCountersWithMultiLabels(name, help string, labels []string) *CountersWit return t } +// GetLabelName returns a label name using the provided values. +func (mc *CountersWithMultiLabels) GetLabelName(names ...string) string { + return safeJoinLabels(names, mc.combinedLabels) +} + // Labels returns the list of labels. func (mc *CountersWithMultiLabels) Labels() []string { return mc.labels From fda66ef1729289cc5bc5e3c03e881681bec0ed1d Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Wed, 20 Sep 2023 15:55:23 +0100 Subject: [PATCH 08/14] vtorc/logic: set counter to 1 when issue detected, reset when it's no longer an issue Signed-off-by: Amir Abushareb --- go/vt/vtorc/logic/topology_recovery.go | 39 ++++++++++++++++++-------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 9b43ff3a9a8..89eb555dbb5 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -586,18 +586,6 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er countPendingRecoveries.Add(1) defer countPendingRecoveries.Add(-1) - // Regardless of whether there's an action to take, we have to increment - // the problems counter with the analysis entry unless it indicates there's - // no problem. 
- if code := analysisEntry.Analysis; code != inst.NoProblem { - detectedProblemsCounter.Add([]string{ - string(code), - analysisEntry.AnalyzedInstanceAlias, - analysisEntry.AnalyzedKeyspace, - analysisEntry.AnalyzedShard, - }, 1) - } - checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode) analysisEntry.IsActionableRecovery = isActionableRecovery @@ -776,6 +764,33 @@ func CheckAndRecover() { return } + // Regardless of if the problem is solved or not we want to monitor active + // issues, we use a map of labels and set a counter to `1` for each problem + // then we reset any counter that is not present in the current analysis. + active := make(map[string][]string) + for _, e := range replicationAnalysis { + if e.Analysis != inst.NoProblem { + names := [...]string{ + string(e.Analysis), + e.AnalyzedInstanceAlias, + e.AnalyzedKeyspace, + e.AnalyzedShard, + } + + key := detectedProblemsCounter.GetLabelName(names[:]...) + active[key] = names[:] + detectedProblemsCounter.Reset(names[:]) + detectedProblemsCounter.Add(names[:], 1) + } + } + + // Reset any non-active problems. 
+ for key := range detectedProblemsCounter.Counts() { + if names, ok := active[key]; !ok { + detectedProblemsCounter.Reset(names) + } + } + // intentionally iterating entries in random order for _, j := range rand.Perm(len(replicationAnalysis)) { analysisEntry := replicationAnalysis[j] From 7371434a937be96e8f1c95e572afb75401d6f2e5 Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Thu, 21 Sep 2023 08:45:01 +0100 Subject: [PATCH 09/14] stats: move GetLabelName to GaugesWithMultiLabels Signed-off-by: Amir Abushareb --- go/stats/counters.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/go/stats/counters.go b/go/stats/counters.go index 41f04ab4146..2b6f2c39e8e 100644 --- a/go/stats/counters.go +++ b/go/stats/counters.go @@ -183,11 +183,6 @@ func NewCountersWithMultiLabels(name, help string, labels []string) *CountersWit return t } -// GetLabelName returns a label name using the provided values. -func (mc *CountersWithMultiLabels) GetLabelName(names ...string) string { - return safeJoinLabels(names, mc.combinedLabels) -} - // Labels returns the list of labels. func (mc *CountersWithMultiLabels) Labels() []string { return mc.labels @@ -373,6 +368,11 @@ func NewGaugesWithMultiLabels(name, help string, labels []string) *GaugesWithMul return t } +// GetLabelName returns a label name using the provided values. +func (mg *GaugesWithMultiLabels) GetLabelName(names ...string) string { + return safeJoinLabels(names, mg.combinedLabels) +} + // Set sets the value of a named counter. // len(names) must be equal to len(Labels). 
func (mg *GaugesWithMultiLabels) Set(names []string, value int64) { From 155a803a8ba585f9d9858692a0c1a84d80904f37 Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Thu, 21 Sep 2023 08:45:23 +0100 Subject: [PATCH 10/14] vtorc/logic: use gauge for detected problems Signed-off-by: Amir Abushareb --- go/vt/vtorc/logic/topology_recovery.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 89eb555dbb5..d6b6b4b1bae 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -63,8 +63,11 @@ var ( countPendingRecoveries = stats.NewGauge("PendingRecoveries", "Count of the number of pending recoveries") - // detectedProblemsCounter is used to count the number of detected problems. - detectedProblemsCounter = stats.NewCountersWithMultiLabels("DetectedProblems", "Count of the different detected problems", []string{ + // detectedProblems is used to track the number of detected problems. + // + // When an issue is active it will be set to 1, when it is no longer active + // it will be reset back to 0. + detectedProblems = stats.NewGaugesWithMultiLabels("DetectedProblems", "Count of the different detected problems", []string{ "Analysis", "TabletAlias", "Keyspace", @@ -777,17 +780,16 @@ func CheckAndRecover() { e.AnalyzedShard, } - key := detectedProblemsCounter.GetLabelName(names[:]...) + key := detectedProblems.GetLabelName(names[:]...) active[key] = names[:] - detectedProblemsCounter.Reset(names[:]) - detectedProblemsCounter.Add(names[:], 1) + detectedProblems.Set(names[:], 1) } } // Reset any non-active problems. 
- for key := range detectedProblemsCounter.Counts() { + for key := range detectedProblems.Counts() { if names, ok := active[key]; !ok { - detectedProblemsCounter.Reset(names) + detectedProblems.Set(names, 0) } } From 527d3224ede69cdb6cc2111047c1a9a70b1c071d Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Fri, 22 Sep 2023 08:42:33 +0100 Subject: [PATCH 11/14] stats: add (*GaugesWithMultiLabels).ResetKey Allows the caller to reset a specific key when it is obtained from the internal counters map. Signed-off-by: Amir Abushareb --- go/stats/counters.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/go/stats/counters.go b/go/stats/counters.go index 2b6f2c39e8e..e79da39c48b 100644 --- a/go/stats/counters.go +++ b/go/stats/counters.go @@ -370,7 +370,7 @@ func NewGaugesWithMultiLabels(name, help string, labels []string) *GaugesWithMul // GetLabelName returns a label name using the provided values. func (mg *GaugesWithMultiLabels) GetLabelName(names ...string) string { - return safeJoinLabels(names, mg.combinedLabels) + return safeJoinLabels(names, nil) } // Set sets the value of a named counter. @@ -382,6 +382,17 @@ func (mg *GaugesWithMultiLabels) Set(names []string, value int64) { mg.counters.set(safeJoinLabels(names, nil), value) } +// ResetKey resets a specific key. +// +// It is the equivalent of `Reset(names)` except that it expects the key to +// be obtained from the internal counters map. +// +// This is useful when you range over all internal counts and you want to reset +// specific keys. +func (mg *GaugesWithMultiLabels) ResetKey(key string) { + mg.counters.set(key, 0) +} + // GaugesFuncWithMultiLabels is a wrapper around CountersFuncWithMultiLabels // for values that go up/down for implementations (like Prometheus) that // need to differ between Counters and Gauges. 
From 696ea3eff8c214244e9fad80cb8251b68d0d94f4 Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Fri, 22 Sep 2023 08:57:19 +0100 Subject: [PATCH 12/14] vtorc/logic: use .ResetKey to reset a gauge Signed-off-by: Amir Abushareb --- go/vt/vtorc/logic/topology_recovery.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index d6b6b4b1bae..d3e73c00886 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -770,7 +770,7 @@ func CheckAndRecover() { // Regardless of if the problem is solved or not we want to monitor active // issues, we use a map of labels and set a counter to `1` for each problem // then we reset any counter that is not present in the current analysis. - active := make(map[string][]string) + active := make(map[string]struct{}) for _, e := range replicationAnalysis { if e.Analysis != inst.NoProblem { names := [...]string{ @@ -781,15 +781,15 @@ func CheckAndRecover() { } key := detectedProblems.GetLabelName(names[:]...) - active[key] = names[:] + active[key] = struct{}{} detectedProblems.Set(names[:], 1) } } // Reset any non-active problems. 
for key := range detectedProblems.Counts() { - if names, ok := active[key]; !ok { - detectedProblems.Set(names, 0) + if _, ok := active[key]; !ok { + detectedProblems.ResetKey(key) } } From c13d6b364fcf253bf03c9b5ccedab74e61e4c3ab Mon Sep 17 00:00:00 2001 From: Amir Abushareb Date: Wed, 27 Sep 2023 19:52:26 +0100 Subject: [PATCH 13/14] test/endtoend: add detected problems endtoend test Signed-off-by: Amir Abushareb --- go/test/endtoend/vtorc/general/vtorc_test.go | 35 ++++++++++++++++++++ go/test/endtoend/vtorc/utils/utils.go | 33 ++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 8013a8fb98b..151e84cd147 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -28,6 +28,7 @@ import ( "vitess.io/vitess/go/test/endtoend/cluster" "vitess.io/vitess/go/test/endtoend/vtorc/utils" "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/vtorc/inst" "vitess.io/vitess/go/vt/vtorc/logic" ) @@ -102,6 +103,7 @@ func TestKeyspaceShard(t *testing.T) { // 3. stop replication, let vtorc repair // 4. setup replication from non-primary, let vtorc repair // 5. make instance A replicates from B and B from A, wait for repair +// 6. disable recoveries and make sure the detected problems are set correctly. func TestVTOrcRepairs(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) @@ -224,6 +226,39 @@ func TestVTOrcRepairs(t *testing.T) { utils.WaitForTabletType(t, replica, "drained") }) + t.Run("Sets DetectedProblems metric correctly", func(t *testing.T) { + // Since we're using a boolean metric here, disable recoveries for now. + status, _, err := utils.MakeAPICall(t, vtOrcProcess, "/api/disable-global-recoveries") + require.NoError(t, err) + require.Equal(t, 200, status) + + // Make the current primary database read-only. 
+ _, err = utils.RunSQL(t, "set global read_only=ON", curPrimary, "") + require.NoError(t, err) + + // Wait for problems to be set. + utils.WaitForDetectedProblems(t, vtOrcProcess, + string(inst.PrimaryIsReadOnly), + curPrimary.Alias, + keyspace.Name, + shard0.Name, + 1, + ) + + // Enable recoveries. + status, _, err = utils.MakeAPICall(t, vtOrcProcess, "/api/enable-global-recoveries") + require.NoError(t, err) + assert.Equal(t, 200, status) + + // wait for detected problem to be cleared. + utils.WaitForDetectedProblems(t, vtOrcProcess, + string(inst.PrimaryIsReadOnly), + curPrimary.Alias, + keyspace.Name, + shard0.Name, + 0, + ) + }) } func TestRepairAfterTER(t *testing.T) { diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index de48b8f4781..a23662d5cc5 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -969,6 +969,39 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr assert.EqualValues(t, countExpected, successCount) } +// WaitForDetectedProblems waits until the given analysis code, alias, keyspace and shard count matches the count expected. 
+func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, code, alias, ks, shard string, expect int) { + t.Helper() + key := strings.Join([]string{code, alias, ks, shard}, ".") + timeout := 15 * time.Second + startTime := time.Now() + + for time.Since(startTime) < timeout { + vars := vtorcInstance.GetVars() + problems := vars["DetectedProblems"].(map[string]interface{}) + actual := problems[key] + if actual == expect { + return + } + time.Sleep(time.Second) + } + + vars := vtorcInstance.GetVars() + problems := vars["DetectedProblems"].(map[string]interface{}) + actual, ok := problems[key] + + assert.True(t, ok, + "The metric DetectedProblems[%s] should exist but does not (all problems: %+v)", + key, problems, + ) + + assert.EqualValues(t, expect, actual, + "The metric DetectedProblems[%s] should be %v but is %v (all problems: %+v)", + key, expect, actual, + problems, + ) +} + // WaitForTabletType waits for the tablet to reach a certain type. func WaitForTabletType(t *testing.T, tablet *cluster.Vttablet, expectedTabletType string) { t.Helper() From 7e07f06dc802813ecbed998d9e3e70fc73047a91 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Thu, 28 Sep 2023 14:53:12 +0530 Subject: [PATCH 14/14] test: use the newly introduced getIntFromValue function Signed-off-by: Manan Gupta --- go/test/endtoend/vtorc/utils/utils.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index ee85c1742db..07b5b016fcc 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -1037,7 +1037,7 @@ func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() problems := vars["DetectedProblems"].(map[string]interface{}) - actual := problems[key] + actual := getIntFromValue(problems[key]) if actual == expect { return } @@ -1047,6 +1047,7 @@ func 
WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, vars := vtorcInstance.GetVars() problems := vars["DetectedProblems"].(map[string]interface{}) actual, ok := problems[key] + actual = getIntFromValue(actual) assert.True(t, ok, "The metric DetectedProblems[%s] should exist but does not (all problems: %+v)",