Skip to content

Commit

Permalink
MasterFaultDetection can start after the initial cluster state has be…
Browse files Browse the repository at this point in the history
…en processed and the NodeConnectionService connect to the new master

This removes the need to explicitly connect to the master, which triggers an assertion due to the blocking operation on the cluster state thread.

Relates to elastic#22828
  • Loading branch information
bleskes committed Feb 7, 2017
1 parent 0c011cb commit 2e5290f
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -111,28 +111,10 @@ public void restart(DiscoveryNode masterNode, String reason) {
}
}

public void start(final DiscoveryNode masterNode, String reason) {
synchronized (masterNodeMutex) {
if (logger.isDebugEnabled()) {
logger.debug("[master] starting fault detection against master [{}], reason [{}]", masterNode, reason);
}
innerStart(masterNode);
}
}

private void innerStart(final DiscoveryNode masterNode) {
this.masterNode = masterNode;
this.retryCount = 0;
this.notifiedMasterFailure.set(false);

// try and connect to make sure we are connected
try {
transportService.connectToNode(masterNode);
} catch (final Exception e) {
// notify master failure (which stops also) and bail..
notifyMasterFailure(masterNode, e, "failed to perform initial connect ");
return;
}
if (masterPinger != null) {
masterPinger.stop();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -727,12 +727,6 @@ public ClusterTasksResult<LocalClusterUpdateTask> execute(ClusterState currentSt
if (shouldIgnoreOrRejectNewClusterState(logger, currentState, newClusterState)) {
return unchanged();
}

// check to see that we monitor the correct master of the cluster
if (masterFD.masterNode() == null || !masterFD.masterNode().equals(newClusterState.nodes().getMasterNode())) {
masterFD.restart(newClusterState.nodes().getMasterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]");
}

if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) {
// its a fresh update from the master as we transition from a start of not having a master to having one
logger.debug("got first state from fresh master [{}]", newClusterState.nodes().getMasterNodeId());
Expand Down Expand Up @@ -786,6 +780,10 @@ public void onFailure(String source, Exception e) {
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
try {
if (newClusterState != null) {
// check to see that we monitor the correct master of the cluster
if (masterFD.masterNode() == null || !masterFD.masterNode().equals(newClusterState.nodes().getMasterNode())) {
masterFD.restart(newClusterState.nodes().getMasterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]");
}
publishClusterState.pendingStatesQueue().markAsProcessed(newClusterState);
}
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ public void testMasterFaultDetectionConnectOnDisconnect() throws InterruptedExce
setState(clusterServiceA, state);
MasterFaultDetection masterFD = new MasterFaultDetection(settings.build(), threadPool, serviceA,
clusterServiceA);
masterFD.start(nodeB, "test");
masterFD.restart(nodeB, "test");

final String[] failureReason = new String[1];
final DiscoveryNode[] failureNode = new DiscoveryNode[1];
Expand Down Expand Up @@ -290,14 +290,14 @@ public void testMasterFaultDetectionNotSizeLimited() throws InterruptedException

MasterFaultDetection masterFDNodeA = new MasterFaultDetection(Settings.builder().put(settingsA).put(settings).build(),
threadPool, serviceA, clusterServiceA);
masterFDNodeA.start(nodeB, "test");
masterFDNodeA.restart(nodeB, "test");

final ClusterState stateNodeB = ClusterState.builder(clusterName).nodes(buildNodesForB(true)).build();
setState(clusterServiceB, stateNodeB);

MasterFaultDetection masterFDNodeB = new MasterFaultDetection(Settings.builder().put(settingsB).put(settings).build(),
threadPool, serviceB, clusterServiceB);
masterFDNodeB.start(nodeB, "test");
masterFDNodeB.restart(nodeB, "test");

// let's do a few pings
pingProbeA.awaitMinCompletedPings();
Expand Down

0 comments on commit 2e5290f

Please sign in to comment.