Analysis: locked semi sync master #1175

Merged · 19 commits · May 30, 2020
11 changes: 11 additions & 0 deletions docs/failure-detection.md
@@ -37,6 +37,7 @@ Observe the following list of potential failures:
* DeadMasterWithoutSlaves
* UnreachableMasterWithLaggingReplicas
* UnreachableMaster
* LockedSemiSyncMaster
* AllMasterSlavesNotReplicating
* AllMasterSlavesNotReplicatingOrDead
* DeadCoMaster
@@ -95,12 +96,22 @@ This scenario can happen when the master is overloaded. Clients would see a "Too

`orchestrator` responds to this scenario by restarting replication on all of master's immediate replicas. This will close the old client connections on those replicas and attempt to initiate new ones. These may now fail to connect, leading to a complete replication failure on all replicas. This will next lead `orchestrator` to analyze a `DeadMaster`.
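For reference, "restarting replication" on a replica boils down to bouncing its replication IO thread. A minimal sketch of the equivalent SQL (`orchestrator` does this through its own instance-management code, not literally these statements):

```sql
-- Run against each of the master's immediate replicas: closes the existing
-- client connection to the master and attempts to open a new one.
STOP SLAVE IO_THREAD;
START SLAVE IO_THREAD;
```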

### LockedSemiSyncMaster

1. The master is running with semi-sync enabled
2. The number of connected semi-sync replicas falls short of the expected `rpl_semi_sync_master_wait_for_slave_count`
3. `rpl_semi_sync_master_timeout` is high enough that the master blocks writes rather than falling back to asynchronous replication

Remediation can be to disable semi-sync on the master, or to bring up (or enable) sufficient semi-sync replicas.
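As an illustration only (not something `orchestrator` performs), inspecting and remediating this condition on the MySQL side might look roughly like the following, using the semisync plugin's variable names:

```sql
-- Inspect semi-sync state on the master:
SHOW GLOBAL VARIABLES LIKE 'rpl_semi_sync_master_%';
SHOW GLOBAL STATUS LIKE 'Rpl_semi_sync_master_clients';

-- Option 1: disable semi-sync on the master, falling back to asynchronous replication:
SET GLOBAL rpl_semi_sync_master_enabled = 0;

-- Option 2: enable semi-sync on an additional replica; the IO thread must be
-- restarted for the replica to register as a semi-sync client on the master:
SET GLOBAL rpl_semi_sync_slave_enabled = 1;
STOP SLAVE IO_THREAD;
START SLAVE IO_THREAD;
```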

At this time `orchestrator` does not invoke processes for this type of analysis.

### Failures of no interest

The following scenarios are of no interest to `orchestrator`. While the information and state are available to `orchestrator`, it does not recognize such scenarios as _failures_ per se; no detection hooks are invoked and no recoveries are attempted:

- Failure of simple replicas (_leaves_ on the replication topology graph)
  Exception: semi-sync replicas causing `LockedSemiSyncMaster`
- Replication lags, even severe.

### Visibility
1 change: 1 addition & 0 deletions go/config/config.go
@@ -56,6 +56,7 @@ const (
PseudoGTIDSchema = "_pseudo_gtid_"
PseudoGTIDIntervalSeconds = 5
PseudoGTIDExpireMinutes = 60
StaleInstanceCoordinatesExpireSeconds = 60
CheckAutoPseudoGTIDGrantsIntervalSeconds = 60
SelectTrueQuery = "select 1"
)
13 changes: 13 additions & 0 deletions go/db/generate_base.go
@@ -838,4 +838,17 @@ var generateSQLBase = []string{
`
CREATE INDEX tag_name_idx_database_instance_tags ON database_instance_tags (tag_name)
`,
`
CREATE TABLE IF NOT EXISTS database_instance_stale_binlog_coordinates (
hostname varchar(128) CHARACTER SET ascii NOT NULL,
port smallint(5) unsigned NOT NULL,
binary_log_file varchar(128) NOT NULL,
binary_log_pos bigint(20) unsigned NOT NULL,
first_seen timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (hostname, port)
) ENGINE=InnoDB DEFAULT CHARSET=ascii
`,
`
CREATE INDEX first_seen_idx_database_instance_stale_binlog_coordinates ON database_instance_stale_binlog_coordinates (first_seen)
`,
}
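The new `database_instance_stale_binlog_coordinates` table records the binlog coordinates last observed on a suspect master, keyed by host and port. A rough standalone sketch of the staleness predicate the analysis query later embeds (the real query parameterizes the interval with `ReasonableReplicationLagSeconds` and wraps the comparison in `IFNULL`; 10 seconds below is only a stand-in):

```sql
-- Masters whose current coordinates still match the previously recorded ones
-- after the window has elapsed are considered to have stale (stuck) binlogs.
SELECT di.hostname, di.port
FROM database_instance di
JOIN database_instance_stale_binlog_coordinates sc
  ON di.hostname = sc.hostname AND di.port = sc.port
WHERE di.binary_log_file = sc.binary_log_file
  AND di.binary_log_pos = sc.binary_log_pos
  AND sc.first_seen < NOW() - INTERVAL 10 SECOND;
```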
6 changes: 6 additions & 0 deletions go/inst/analysis.go
@@ -38,6 +38,8 @@ const (
MasterSingleSlaveDead = "MasterSingleSlaveDead"
AllMasterSlavesNotReplicating = "AllMasterSlavesNotReplicating"
AllMasterSlavesNotReplicatingOrDead = "AllMasterSlavesNotReplicatingOrDead"
LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis"
LockedSemiSyncMaster = "LockedSemiSyncMaster"
MasterWithoutSlaves = "MasterWithoutSlaves"
DeadCoMaster = "DeadCoMaster"
DeadCoMasterAndSomeSlaves = "DeadCoMasterAndSomeSlaves"
@@ -118,6 +120,7 @@ type ReplicationAnalysis struct {
AnalyzedInstanceDataCenter string
AnalyzedInstanceRegion string
AnalyzedInstancePhysicalEnvironment string
AnalyzedInstanceBinlogCoordinates BinlogCoordinates
IsMaster bool
IsCoMaster bool
LastCheckValid bool
@@ -143,6 +146,9 @@ type ReplicationAnalysis struct {
MariaDBGTIDImmediateTopology bool
BinlogServerImmediateTopology bool
SemiSyncMasterEnabled bool
SemiSyncMasterStatus bool
SemiSyncMasterWaitForReplicaCount uint
SemiSyncMasterClients uint
CountSemiSyncReplicasEnabled uint
CountLoggingReplicas uint
CountStatementBasedLoggingReplicas uint
50 changes: 44 additions & 6 deletions go/inst/analysis_dao.go
@@ -55,7 +55,7 @@ func initializeAnalysisDaoPostConfiguration() {
func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) {
result := []ReplicationAnalysis{}

args := sqlutils.Args(ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName)
args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName)
analysisQueryReductionClause := ``

if config.Config.ReduceReplicationAnalysisCount {
@@ -97,7 +97,17 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
MIN(master_instance.physical_environment) AS physical_environment,
MIN(master_instance.master_host) AS master_host,
MIN(master_instance.master_port) AS master_port,
MIN(master_instance.cluster_name) AS cluster_name,
MIN(master_instance.cluster_name) AS cluster_name,
MIN(master_instance.binary_log_file) AS binary_log_file,
MIN(master_instance.binary_log_pos) AS binary_log_pos,
MIN(
IFNULL(
master_instance.binary_log_file = database_instance_stale_binlog_coordinates.binary_log_file
AND master_instance.binary_log_pos = database_instance_stale_binlog_coordinates.binary_log_pos
AND database_instance_stale_binlog_coordinates.first_seen < NOW() - interval ? second,
0
)
) AS is_stale_binlog_coordinates,
MIN(IFNULL(cluster_alias.alias, master_instance.cluster_name)) AS cluster_alias,
MIN(IFNULL(cluster_domain_name.domain_name, master_instance.cluster_name)) AS cluster_domain,
MIN(
@@ -157,6 +167,12 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
MIN(
master_instance.semi_sync_master_wait_for_slave_count
) AS semi_sync_master_wait_for_slave_count,
MIN(
master_instance.semi_sync_master_clients
) AS semi_sync_master_clients,
MIN(
master_instance.semi_sync_master_status
) AS semi_sync_master_status,
SUM(
replica_instance.is_co_master
) AS count_co_master_replicas,
@@ -238,6 +254,11 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
database_instance_maintenance ON (master_instance.hostname = database_instance_maintenance.hostname
AND master_instance.port = database_instance_maintenance.port
AND database_instance_maintenance.maintenance_active = 1)
LEFT JOIN
database_instance_stale_binlog_coordinates ON (
master_instance.hostname = database_instance_stale_binlog_coordinates.hostname
AND master_instance.port = database_instance_stale_binlog_coordinates.port
)
LEFT JOIN
database_instance_downtime as master_downtime ON (master_instance.hostname = master_downtime.hostname
AND master_instance.port = master_downtime.port
@@ -278,6 +299,12 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.AnalyzedInstanceDataCenter = m.GetString("data_center")
a.AnalyzedInstanceRegion = m.GetString("region")
a.AnalyzedInstancePhysicalEnvironment = m.GetString("physical_environment")
a.AnalyzedInstanceBinlogCoordinates = BinlogCoordinates{
LogFile: m.GetString("binary_log_file"),
LogPos: m.GetInt64("binary_log_pos"),
Type: BinaryLog,
}
isStaleBinlogCoordinates := m.GetBool("is_stale_binlog_coordinates")
a.ClusterDetails.ClusterName = m.GetString("cluster_name")
a.ClusterDetails.ClusterAlias = m.GetString("cluster_alias")
a.ClusterDetails.ClusterDomain = m.GetString("cluster_domain")
@@ -308,9 +335,11 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.BinlogServerImmediateTopology = countValidBinlogServerSlaves == a.CountValidReplicas && a.CountValidReplicas > 0
a.PseudoGTIDImmediateTopology = m.GetBool("is_pseudo_gtid")
a.SemiSyncMasterEnabled = m.GetBool("semi_sync_master_enabled")
a.SemiSyncMasterStatus = m.GetBool("semi_sync_master_status")
a.CountSemiSyncReplicasEnabled = m.GetUint("count_semi_sync_replicas")
countValidSemiSyncReplicasEnabled := m.GetUint("count_valid_semi_sync_replicas")
semiSyncMasterWaitForReplicaCount := m.GetUint("semi_sync_master_wait_for_slave_count")
// countValidSemiSyncReplicasEnabled := m.GetUint("count_valid_semi_sync_replicas")
a.SemiSyncMasterWaitForReplicaCount = m.GetUint("semi_sync_master_wait_for_slave_count")
a.SemiSyncMasterClients = m.GetUint("semi_sync_master_clients")

a.MinReplicaGTIDMode = m.GetString("min_replica_gtid_mode")
a.MaxReplicaGTIDMode = m.GetString("max_replica_gtid_mode")
@@ -359,6 +388,15 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.Analysis = UnreachableMaster
a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue"
//
} else if a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients < a.SemiSyncMasterWaitForReplicaCount {
if isStaleBinlogCoordinates {
a.Analysis = LockedSemiSyncMaster
a.Description = "Semi sync master is locked since it doesn't get enough replica acknowledgements"
} else {
a.Analysis = LockedSemiSyncMasterHypothesis
a.Description = "Semi sync master seems to be locked, more samplings needed to validate"
}
//
} else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
a.Analysis = MasterSingleSlaveNotReplicating
a.Description = "Master is reachable but its single slave is not replicating"
Expand Down Expand Up @@ -517,10 +555,10 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints)
a.StructureAnalysis = append(a.StructureAnalysis, NoWriteableMasterStructureWarning)
}

if a.IsMaster && a.SemiSyncMasterEnabled && countValidSemiSyncReplicasEnabled == 0 {
if a.IsMaster && a.SemiSyncMasterEnabled && !a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients == 0 {
a.StructureAnalysis = append(a.StructureAnalysis, NoValidSemiSyncReplicasStructureWarning)
}
if a.IsMaster && a.SemiSyncMasterEnabled && countValidSemiSyncReplicasEnabled > 0 && countValidSemiSyncReplicasEnabled < semiSyncMasterWaitForReplicaCount {
if a.IsMaster && a.SemiSyncMasterEnabled && !a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients > 0 && a.SemiSyncMasterClients < a.SemiSyncMasterWaitForReplicaCount {
a.StructureAnalysis = append(a.StructureAnalysis, NotEnoughValidSemiSyncReplicasStructureWarning)
}

63 changes: 57 additions & 6 deletions go/inst/instance_dao.go
@@ -2861,19 +2861,21 @@ func RecordInstanceCoordinatesHistory() error {
}
writeFunc := func() error {
_, err := db.ExecOrchestrator(`
insert into
database_instance_coordinates_history (
insert into
database_instance_coordinates_history (
hostname, port, last_seen, recorded_timestamp,
binary_log_file, binary_log_pos, relay_log_file, relay_log_pos
)
select
hostname, port, last_seen, NOW(),
select
hostname, port, last_seen, NOW(),
binary_log_file, binary_log_pos, relay_log_file, relay_log_pos
from
database_instance
where
binary_log_file != ''
OR relay_log_file != ''
(
binary_log_file != ''
or relay_log_file != ''
)
`,
)
return log.Errore(err)
@@ -2905,6 +2907,55 @@ func GetHeuristiclyRecentCoordinatesForInstance(instanceKey *InstanceKey) (selfC
return selfCoordinates, relayLogCoordinates, err
}

// RecordStaleInstanceBinlogCoordinates records an instance's binlog coordinates as potentially stale, first discarding any previous record whose coordinates differ
func RecordStaleInstanceBinlogCoordinates(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) error {
args := sqlutils.Args(
instanceKey.Hostname, instanceKey.Port,
binlogCoordinates.LogFile, binlogCoordinates.LogPos,
)
_, err := db.ExecOrchestrator(`
delete from
database_instance_stale_binlog_coordinates
where
hostname=? and port=?
and (
binary_log_file != ?
or binary_log_pos != ?
)
`,
args...,
)
if err != nil {
return log.Errore(err)
}
_, err = db.ExecOrchestrator(`
insert ignore into
database_instance_stale_binlog_coordinates (
hostname, port, binary_log_file, binary_log_pos, first_seen
)
values (
?, ?, ?, ?, NOW()
)`,
args...)
return log.Errore(err)
}

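// ExpireStaleInstanceBinlogCoordinates deletes stale-coordinates records older than the expiry window (the larger of twice ReasonableReplicationLagSeconds and StaleInstanceCoordinatesExpireSeconds)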
func ExpireStaleInstanceBinlogCoordinates() error {
expireSeconds := config.Config.ReasonableReplicationLagSeconds * 2
if expireSeconds < config.StaleInstanceCoordinatesExpireSeconds {
expireSeconds = config.StaleInstanceCoordinatesExpireSeconds
}
writeFunc := func() error {
_, err := db.ExecOrchestrator(`
delete from database_instance_stale_binlog_coordinates
where first_seen < NOW() - INTERVAL ? SECOND
`, expireSeconds,
)
return log.Errore(err)
}
return ExecDBWriteFunc(writeFunc)
}

// GetPreviousKnownRelayLogCoordinatesForInstance returns known relay log coordinates, that are not the
// exact current coordinates
func GetPreviousKnownRelayLogCoordinatesForInstance(instance *Instance) (relayLogCoordinates *BinlogCoordinates, err error) {
1 change: 1 addition & 0 deletions go/logic/orchestrator.go
@@ -564,6 +564,7 @@ func ContinuousDiscovery() {
go inst.ExpirePoolInstances()
go inst.FlushNontrivialResolveCacheToDatabase()
go inst.ExpireInjectedPseudoGTID()
go inst.ExpireStaleInstanceBinlogCoordinates()
go process.ExpireNodesHistory()
go process.ExpireAccessTokens()
go process.ExpireAvailableNodes()
37 changes: 31 additions & 6 deletions go/logic/topology_recovery.go
@@ -1395,22 +1395,33 @@ func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candida
return true, topologyRecovery, err
}

// checkAndRecoverLockedSemiSyncMaster registers a recovery attempt for a locked semi-sync master; it does not (yet) take corrective action
func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {

topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true)
if topologyRecovery == nil {
AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey))
return false, nil, err
}

return false, nil, nil
}

// checkAndRecoverGenericProblem is a general-purpose recovery function
func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) {
return false, nil, nil
}

// Force a re-read of a topology instance; this is done because we need to substantiate a suspicion
// that we may have a failover scenario. we want to speed up reading the complete picture.
func emergentlyReadTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) {
func emergentlyReadTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) (instance *inst.Instance, err error) {
if existsInCacheError := emergencyReadTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil {
// Just recently attempted
return
return nil, nil
}
go inst.ExecuteOnTopology(func() {
inst.ReadTopologyInstance(instanceKey)
inst.AuditOperation("emergently-read-topology-instance", instanceKey, string(analysisCode))
})
instance, err = inst.ReadTopologyInstance(instanceKey)
inst.AuditOperation("emergently-read-topology-instance", instanceKey, string(analysisCode))
return instance, err
}

// Force reading of replicas of given instance. This is because we suspect the instance is dead, and want to speed up
@@ -1469,6 +1480,11 @@ func emergentlyRestartReplicationOnTopologyInstanceReplicas(instanceKey *inst.In
}
}

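// emergentlyRecordStaleBinlogCoordinates persists the instance's current binlog coordinates so a later analysis round can tell whether they have advanced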
func emergentlyRecordStaleBinlogCoordinates(instanceKey *inst.InstanceKey, binlogCoordinates *inst.BinlogCoordinates) {
err := inst.RecordStaleInstanceBinlogCoordinates(instanceKey, binlogCoordinates)
log.Errore(err)
}

// checkAndExecuteFailureDetectionProcesses tries to register for failure detection and potentially executes
// failure-detection processes.
func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (detectionRegistrationSuccess bool, processesExecutionAttempted bool, err error) {
@@ -1499,6 +1515,12 @@ func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstance
} else {
return checkAndRecoverDeadMaster, true
}
case inst.LockedSemiSyncMaster:
if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) {
return checkAndRecoverGenericProblem, false
} else {
return checkAndRecoverLockedSemiSyncMaster, true
}
// intermediate master
case inst.DeadIntermediateMaster:
return checkAndRecoverDeadIntermediateMaster, true
@@ -1546,6 +1568,9 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) {
go emergentlyReadTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
case inst.UnreachableMasterWithLaggingReplicas:
go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
case inst.LockedSemiSyncMasterHypothesis:
go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
go emergentlyRecordStaleBinlogCoordinates(&analysisEntry.AnalyzedInstanceKey, &analysisEntry.AnalyzedInstanceBinlogCoordinates)
case inst.UnreachableIntermediateMasterWithLaggingReplicas:
go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis)
case inst.AllMasterSlavesNotReplicating:
1 change: 1 addition & 0 deletions resources/public/js/cluster-analysis-shared.js
@@ -6,6 +6,7 @@ var interestingAnalysis = {
"UnreachableMasterWithStaleSlaves": true,
"UnreachableMasterWithLaggingReplicas": true,
"UnreachableMaster" : true,
"LockedSemiSyncMaster" : true,
"AllMasterSlavesNotReplicating" : true,
"AllMasterSlavesNotReplicatingOrDead" : true,
"AllMasterSlavesStale" : true,

This file was deleted.

This file was deleted.

@@ -0,0 +1,8 @@
UPDATE database_instance SET semi_sync_master_enabled=1, semi_sync_master_status=1, semi_sync_master_wait_for_slave_count=2, semi_sync_master_clients=1 where port=22293;
UPDATE database_instance SET last_seen=last_checked - interval 1 minute where port=22296;
INSERT INTO database_instance_stale_binlog_coordinates (
hostname, port, binary_log_file, binary_log_pos, first_seen
)
values (
'testhost', 22293, 'mysql-bin.000167', 137086726, CURRENT_TIMESTAMP
);
@@ -0,0 +1 @@
testhost:22293 (cluster testhost:22293): LockedSemiSyncMasterHypothesis