Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Commit

Permalink
Merge pull request #793 from github/problems-reason
Browse files Browse the repository at this point in the history
WIP: server side problem analysis
  • Loading branch information
Shlomi Noach authored Feb 10, 2019
2 parents 7bef26f + a302871 commit 981dda9
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 124 deletions.
3 changes: 3 additions & 0 deletions go/inst/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,16 @@ type Instance struct {
UnresolvedHostname string
AllowTLS bool

Problems []string

LastDiscoveryLatency time.Duration
}

// NewInstance allocates an Instance whose SlaveHosts map and Problems slice
// are initialized and empty; all other fields keep their zero values.
func NewInstance() *Instance {
	instance := new(Instance)
	instance.SlaveHosts = make(map[InstanceKey]bool)
	// Problems is deliberately a non-nil empty slice rather than nil.
	// NOTE(review): presumably so it serializes as a JSON array ([]) instead of
	// null for consumers that call array methods on it -- confirm.
	instance.Problems = make([]string, 0)
	return instance
}

Expand Down
21 changes: 18 additions & 3 deletions go/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,21 @@ func readInstanceRow(m sqlutils.RowMap) *Instance {

instance.SlaveHosts.ReadJson(slaveHostsJSON)
instance.applyFlavorName()

// problems
if !instance.IsLastCheckValid {
instance.Problems = append(instance.Problems, "last_check_invalid")
} else if !instance.IsRecentlyChecked {
instance.Problems = append(instance.Problems, "not_recently_checked")
} else if instance.ReplicationThreadsExist() && !instance.ReplicaRunning() {
instance.Problems = append(instance.Problems, "not_replicating")
} else if instance.SlaveLagSeconds.Valid && instance.SlaveLagSeconds.Int64 > int64(config.Config.ReasonableReplicationLagSeconds) {
instance.Problems = append(instance.Problems, "replication_lag")
}
if instance.GtidErrant != "" {
instance.Problems = append(instance.Problems, "errant_gtid")
}

return instance
}

Expand Down Expand Up @@ -1267,15 +1282,15 @@ func ReadProblemInstances(clusterName string) ([](*Instance), error) {
and (
(last_seen < last_checked)
or (unix_timestamp() - unix_timestamp(last_checked) > ?)
or (replication_sql_thread_state != 1)
or (replication_io_thread_state != 1)
or (replication_sql_thread_state not in (-1 ,1))
or (replication_io_thread_state not in (-1 ,1))
or (abs(cast(seconds_behind_master as signed) - cast(sql_delay as signed)) > ?)
or (abs(cast(slave_lag_seconds as signed) - cast(sql_delay as signed)) > ?)
or (gtid_errant != '')
)
`

args := sqlutils.Args(clusterName, clusterName, config.Config.InstancePollSeconds, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds)
args := sqlutils.Args(clusterName, clusterName, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds)
instances, err := readInstancesByCondition(condition, args, "")
if err != nil {
return instances, err
Expand Down
12 changes: 12 additions & 0 deletions resources/public/css/orchestrator.css
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,15 @@ body {
color: #ffffff;
}

.instance h3.label-errant {
background-color: #b0b0b0;
color: #ffffff;
}

.instance h3.label-errant .glyphicon {
color: #ffffff;
}

.instance h3.label-fatal {
background-color: #000000;
color: #ffffff;
Expand Down Expand Up @@ -233,6 +242,9 @@ body {
background-color: #000000;
}

.instance .badge.label-errant {
background-color: #b0b0b0;
}

.instance .badge.label-primary {
background-color: #428BCA;
Expand Down
27 changes: 27 additions & 0 deletions resources/public/js/cluster-analysis-shared.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,30 @@ var interestingAnalysis = {
"UnreachableIntermediateMaster" : true,
"BinlogServerFailingToConnectToMaster" : true,
};

// Lookup table keyed by instance problem code. Each entry carries:
//   badge       - CSS class used to colour the problem badge
//   description - short human-readable label for the problem
// NOTE(review): keys are expected to match the problem codes reported per
// instance (instance.Problems / instance.problem) -- confirm against the server.
var errorMapping = {
  "in_maintenance": {"badge": "label-info", "description": "In maintenance"},
  "last_check_invalid": {"badge": "label-fatal", "description": "Last check invalid"},
  "not_recently_checked": {"badge": "label-stale", "description": "Not recently checked (stale)"},
  "not_replicating": {"badge": "label-danger", "description": "Not replicating"},
  "replication_lag": {"badge": "label-warning", "description": "Replication lag"},
  "errant_gtid": {"badge": "label-errant", "description": "Errant GTID"}
};
40 changes: 4 additions & 36 deletions resources/public/js/cluster-pools.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,6 @@ $(document).ready(function() {

showLoader();

var errorMapping = {
"inMaintenanceProblem": {
"badge": "label-info",
"description": "In maintenance"
},
"lastCheckInvalidProblem": {
"badge": "label-fatal",
"description": "Last check invalid"
},
"notRecentlyCheckedProblem": {
"badge": "label-stale",
"description": "Not recently checked (stale)"
},
"notReplicatingProblem": {
"badge": "label-danger",
"description": "Not replicating"
},
"replicationLagProblem": {
"badge": "label-warning",
"description": "Replication lag"
}
};

$.get(appUrl("/api/cluster-pool-instances/" + currentClusterName()), function(clusterPoolInstances) {
$.get(appUrl("/api/problems"), function(problemInstances) {
problemInstances = problemInstances || [];
Expand Down Expand Up @@ -71,6 +48,9 @@ $(document).ready(function() {
}

function incrementPoolsProblems(instance, problemType) {
if (!problemType) {
return
}
if (typeof instance.problemHint === 'undefined') {
instance.problemHint = problemType
}
Expand All @@ -83,19 +63,7 @@ $(document).ready(function() {
});
}
problemInstances.forEach(function(instance) {
if (instance.inMaintenanceProblem()) {
incrementPoolsProblems(instance, "inMaintenanceProblem")
}
//
if (instance.lastCheckInvalidProblem()) {
incrementPoolsProblems(instance, "lastCheckInvalidProblem")
} else if (instance.notRecentlyCheckedProblem()) {
incrementPoolsProblems(instance, "notRecentlyCheckedProblem")
} else if (instance.notReplicatingProblem()) {
incrementPoolsProblems(instance, "notReplicatingProblem")
} else if (instance.replicationLagProblem()) {
incrementPoolsProblems(instance, "replicationLagProblem")
}
incrementPoolsProblems(instance, instance.problem)
});

pools.forEach(function(pool) {
Expand Down
18 changes: 4 additions & 14 deletions resources/public/js/cluster.js
Original file line number Diff line number Diff line change
Expand Up @@ -1210,18 +1210,9 @@ function Cluster() {
instanceDescription += ", " + instance.SlaveLagSeconds.Int64 + "s lag";
incrementProblems("", instanceDescription)
instanceFullNames.push(getInstanceTitle(instance.Key.Hostname, instance.Key.Port));
if (instance.inMaintenanceProblem()) {
incrementProblems("inMaintenanceProblem", instanceDescription)
}
if (instance.lastCheckInvalidProblem()) {
incrementProblems("lastCheckInvalidProblem", instanceDescription)
} else if (instance.notRecentlyCheckedProblem()) {
incrementProblems("notRecentlyCheckedProblem", instanceDescription)
} else if (instance.notReplicatingProblem()) {
incrementProblems("notReplicatingProblem", instanceDescription)
} else if (instance.replicationLagProblem()) {
incrementProblems("replicationLagProblem", instanceDescription)
}
instance.Problems.forEach(function(problem) {
incrementProblems(problem, instanceDescription)
});
});
var aggergateInstance = instances[0];
aggergateInstance.isAggregate = true;
Expand Down Expand Up @@ -1396,7 +1387,7 @@ function Cluster() {
}
wrappedContent = '<div data-tag="'+tag+'">' + content + '<div style="clear: both;"></div></div>';
if (tag === "analysis") {
$(wrappedContent).insertAfter("#cluster_info [data-tag=glyphs]")
$("#cluster_info").append(wrappedContent)
} else {
$("#cluster_info").append(wrappedContent)
}
Expand Down Expand Up @@ -1532,7 +1523,6 @@ function Cluster() {
analysisContent += "<div>" + analysisEntry.AnalyzedInstanceKey.Hostname + ":" + analysisEntry.AnalyzedInstanceKey.Port + "</div>";
var content = '<div><div class="pull-left">'+glyph+'</div><div class="pull-right">'+analysisContent+'</div></div>';
addSidebarInfoPopoverContent(content, "analysis", false);

if (analysisEntry.IsStructureAnalysis) {
return;
}
Expand Down
40 changes: 4 additions & 36 deletions resources/public/js/clusters.js
Original file line number Diff line number Diff line change
@@ -1,29 +1,6 @@
$(document).ready(function() {
showLoader();

var errorMapping = {
"inMaintenanceProblem": {
"badge": "label-info",
"description": "In maintenance"
},
"lastCheckInvalidProblem": {
"badge": "label-fatal",
"description": "Last check invalid"
},
"notRecentlyCheckedProblem": {
"badge": "label-stale",
"description": "Not recently checked (stale)"
},
"notReplicatingProblem": {
"badge": "label-danger",
"description": "Not replicating"
},
"replicationLagProblem": {
"badge": "label-warning",
"description": "Replication lag"
}
};

$.get(appUrl("/api/clusters-info"), function(clusters) {
$.get(appUrl("/api/replication-analysis"), function(replicationAnalysis) {
$.get(appUrl("/api/problems"), function(problemInstances) {
Expand Down Expand Up @@ -76,26 +53,17 @@ $(document).ready(function() {
}

// Adds one occurrence of problemType to the tally kept for clusterName in
// clustersProblems. Calls with a falsy problemType (e.g. an instance that
// reports no problem) are ignored.
function incrementClusterProblems(clusterName, problemType) {
  if (!problemType) {
    return
  }
  var currentCount = clustersProblems[clusterName][problemType];
  // A missing (undefined) or non-positive counter restarts at 1.
  clustersProblems[clusterName][problemType] = (currentCount > 0) ? currentCount + 1 : 1;
}
problemInstances.forEach(function(instance) {
if (instance.inMaintenanceProblem()) {
incrementClusterProblems(instance.ClusterName, "inMaintenanceProblem")
}
//
if (instance.lastCheckInvalidProblem()) {
incrementClusterProblems(instance.ClusterName, "lastCheckInvalidProblem")
} else if (instance.notRecentlyCheckedProblem()) {
incrementClusterProblems(instance.ClusterName, "notRecentlyCheckedProblem")
} else if (instance.notReplicatingProblem()) {
incrementClusterProblems(instance.ClusterName, "notReplicatingProblem")
} else if (instance.replicationLagProblem()) {
incrementClusterProblems(instance.ClusterName, "replicationLagProblem")
}
incrementClusterProblems(instance.ClusterName, instance.problem)
});

clusters.forEach(function(cluster) {
Expand Down
51 changes: 16 additions & 35 deletions resources/public/js/orchestrator.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,6 @@ reloadPageHint = {
port: ""
}

var errorMapping = {
"inMaintenanceProblem": {
"badge": "label-info",
"description": "In maintenance"
},
"lastCheckInvalidProblem": {
"badge": "label-fatal",
"description": "Last check invalid"
},
"notRecentlyCheckedProblem": {
"badge": "label-stale",
"description": "Not recently checked (stale)"
},
"notReplicatingProblem": {
"badge": "label-danger",
"description": "Not replicating"
},
"replicationLagProblem": {
"badge": "label-warning",
"description": "Replication lag"
}
};

function updateCountdownDisplay() {
if ($.cookie("auto-refresh") == "true") {
$("#refreshCountdown").html('<span class="glyphicon glyphicon-repeat" title="Click to pause"></span> ' + secondsTillRefresh + 's');
Expand Down Expand Up @@ -659,50 +636,54 @@ function normalizeInstance(instance) {
}

function normalizeInstanceProblem(instance) {

function instanceProblemIfExists(problemName) {
if (instance.Problems.includes(problemName)) {
return problemName
}
return null;
}
instance.inMaintenanceProblem = function() {
return instance.inMaintenance;
return instanceProblemIfExists('in_maintenance');
}
instance.lastCheckInvalidProblem = function() {
return !instance.IsLastCheckValid;
return instanceProblemIfExists('last_check_invalid');
}
instance.notRecentlyCheckedProblem = function() {
return !instance.IsRecentlyChecked;
return instanceProblemIfExists('not_recently_checked');
}
instance.notReplicatingProblem = function() {
return !instance.replicationRunning && !(instance.isMaster && !instance.isCoMaster);
return instanceProblemIfExists('not_replicating');
}
instance.replicationLagProblem = function() {
return !instance.replicationLagReasonable;
return instanceProblemIfExists('replication_lag');
}
instance.errantGTIDProblem = function() {
return (instance.GtidErrant != '');
return instanceProblemIfExists('errant_gtid');
}

instance.problem = null;
if (instance.Problems.length > 0) {
instance.problem = instance.Problems[0]; // highest priority one
}
instance.problemOrder = 0;
if (instance.inMaintenanceProblem()) {
instance.problem = "in_maintenance";
instance.problemDescription = "This instance is now under maintenance due to some pending operation.\nSee audit page";
instance.problemOrder = 1;
} else if (instance.lastCheckInvalidProblem()) {
instance.problem = "last_check_invalid";
instance.problemDescription = "Instance cannot be reached by orchestrator.\nIt might be dead or there may be a network problem";
instance.problemOrder = 2;
} else if (instance.notRecentlyCheckedProblem()) {
instance.problem = "not_recently_checked";
instance.problemDescription = "Orchestrator has not made an attempt to reach this instance for a while now.\nThis should generally not happen; consider refreshing or re-discovering this instance";
instance.problemOrder = 3;
} else if (instance.notReplicatingProblem()) {
// check replicas only; where not replicating
instance.problem = "not_replicating";
instance.problemDescription = "Replication is not running.\nEither stopped manually or is failing on I/O or SQL error.";
instance.problemOrder = 4;
} else if (instance.replicationLagProblem()) {
instance.problem = "replication_lag";
instance.problemDescription = "Replica is lagging.\nThis diagnostic is based on either Seconds_behind_master or configured ReplicationLagQuery";
instance.problemOrder = 5;
} else if (instance.errantGTIDProblem()) {
instance.problem = "Errant GTID";
instance.problemDescription = "Replica has GTID entries not found on its master";
instance.problemOrder = 6;
}
Expand Down

0 comments on commit 981dda9

Please sign in to comment.