Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Better analysis of UnreachableMaster #1225

Merged
merged 2 commits into from
Aug 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 16 additions & 10 deletions go/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -475,16 +475,16 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.IsReadOnly = m.GetUint("read_only") == 1

if !a.LastCheckValid {
analysisMessage := fmt.Sprintf("analysis: ClusterName: %+v, IsMaster: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, ",
a.ClusterDetails.ClusterName, a.IsMaster, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas,
analysisMessage := fmt.Sprintf("analysis: ClusterName: %+v, IsMaster: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToMaster: %+v",
a.ClusterDetails.ClusterName, a.IsMaster, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToMaster,
)
if util.ClearToLog("analysis_dao", analysisMessage) {
log.Debugf(analysisMessage)
}
}
if a.IsMaster && !a.LastCheckValid && a.CountReplicas == 0 {
a.Analysis = DeadMasterWithoutReplicas
a.Description = "Master cannot be reached by orchestrator and has no slave"
a.Description = "Master cannot be reached by orchestrator and has no replica"
//
} else if a.IsMaster && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadMaster
Expand All @@ -503,6 +503,12 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.Description = "Master cannot be reached by orchestrator and all of its replicas are lagging"
//
} else if a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 {
// partial success is here to reduce noise
a.Analysis = UnreachableMaster
a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue"
//
} else if a.IsMaster && !a.LastCheckValid && a.LastCheckPartialSuccess && a.CountReplicasFailingToConnectToMaster > 0 && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 {
// there's partial success, but also at least one replica is failing to connect to master
a.Analysis = UnreachableMaster
a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue"
//
Expand All @@ -517,11 +523,11 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
//
} else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = MasterSingleReplicaNotReplicating
a.Description = "Master is reachable but its single slave is not replicating"
a.Description = "Master is reachable but its single replica is not replicating"
//
} else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0 {
a.Analysis = MasterSingleReplicaDead
a.Description = "Master is reachable but its single slave is dead"
a.Description = "Master is reachable but its single replica is dead"
//
} else if a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = AllMasterReplicasNotReplicating
Expand Down Expand Up @@ -549,13 +555,13 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
//
} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountReplicasFailingToConnectToMaster == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadIntermediateMasterWithSingleReplicaFailingToConnect
a.Description = "Intermediate master cannot be reached by orchestrator and its (single) slave is failing to connect"
a.Description = "Intermediate master cannot be reached by orchestrator and its (single) replica is failing to connect"
//
} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
} else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadIntermediateMasterWithSingleReplica
a.Description = "Intermediate master cannot be reached by orchestrator and its (single) slave is not replicating"
a.Description = "Intermediate master cannot be reached by orchestrator and its (single) replica is not replicating"
//
} else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
} else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = DeadIntermediateMaster
a.Description = "Intermediate master cannot be reached by orchestrator and none of its replicas is replicating"
//
Expand Down Expand Up @@ -594,7 +600,7 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
//
} else if a.ReplicationDepth == 1 && a.IsFailingToConnectToMaster {
a.Analysis = FirstTierReplicaFailingToConnectToMaster
a.Description = "1st tier slave (directly replicating from topology master) is unable to connect to the master"
a.Description = "1st tier replica (directly replicating from topology master) is unable to connect to the master"
//
}
// else if a.IsMaster && a.CountReplicas == 0 {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
UPDATE database_instance SET last_seen=last_checked - interval 1 minute where port=22293;
UPDATE database_instance SET last_check_partial_success = 1 where port=22293;
UPDATE database_instance SET slave_io_running=0, last_io_error='error connecting to master' where port=22294;
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
testhost:22293 (cluster testhost:22293): UnreachableMaster
testhost:22294 (cluster testhost:22293): FirstTierReplicaFailingToConnectToMaster
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-c replication-analysis
3 changes: 2 additions & 1 deletion tests/integration/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ generate_config_file() {
cp ${tests_path}/orchestrator.conf.json ${test_config_file}
sed -i -e "s/backend-db-placeholder/${db_type}/g" ${test_config_file}
sed -i -e "s^sqlite-data-file-placeholder^${sqlite_file}^g" ${test_config_file}
touch "$test_mysql_defaults_file" # required even for sqlite because config file references the my.cnf cfg file
echo "- generate_config_file OK"
}

Expand All @@ -202,7 +203,7 @@ test_all() {
echo "- deploy_internal_db OK"

test_pattern="${1:-.}"
find $tests_path ! -path . -type d -mindepth 1 -maxdepth 1 | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do
find $tests_path -mindepth 1 -maxdepth 1 ! -path . -type d | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do
test_single "$test_name"
if [ $? -ne 0 ] ; then
echo "+ FAIL"
Expand Down