From 2e5290f197e01770707acc0b3d9480d0fe5439a9 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Wed, 8 Feb 2017 00:03:34 +0200 Subject: [PATCH] MasterFaultDetection can start after the initial cluster state has been processed and the NodeConnectionService connect to the new master This removes the need to explicitly connect to the master, which triggers an assertion due to the blocking operation on the cluster state thread. Relates to #22828 --- .../discovery/zen/MasterFaultDetection.java | 18 ------------------ .../discovery/zen/ZenDiscovery.java | 10 ++++------ .../discovery/ZenFaultDetectionTests.java | 6 +++--- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/MasterFaultDetection.java b/core/src/main/java/org/elasticsearch/discovery/zen/MasterFaultDetection.java index b7acfb685deae..81d4c19d33ee2 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/MasterFaultDetection.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/MasterFaultDetection.java @@ -111,28 +111,10 @@ public void restart(DiscoveryNode masterNode, String reason) { } } - public void start(final DiscoveryNode masterNode, String reason) { - synchronized (masterNodeMutex) { - if (logger.isDebugEnabled()) { - logger.debug("[master] starting fault detection against master [{}], reason [{}]", masterNode, reason); - } - innerStart(masterNode); - } - } - private void innerStart(final DiscoveryNode masterNode) { this.masterNode = masterNode; this.retryCount = 0; this.notifiedMasterFailure.set(false); - - // try and connect to make sure we are connected - try { - transportService.connectToNode(masterNode); - } catch (final Exception e) { - // notify master failure (which stops also) and bail.. - notifyMasterFailure(masterNode, e, "failed to perform initial connect "); - return; - } if (masterPinger != null) { masterPinger.stop(); } diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java index b6a023bad3598..94c46ed867094 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java @@ -727,12 +727,6 @@ public ClusterTasksResult execute(ClusterState currentSt if (shouldIgnoreOrRejectNewClusterState(logger, currentState, newClusterState)) { return unchanged(); } - - // check to see that we monitor the correct master of the cluster - if (masterFD.masterNode() == null || !masterFD.masterNode().equals(newClusterState.nodes().getMasterNode())) { - masterFD.restart(newClusterState.nodes().getMasterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]"); - } - if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) { // its a fresh update from the master as we transition from a start of not having a master to having one logger.debug("got first state from fresh master [{}]", newClusterState.nodes().getMasterNodeId()); @@ -786,6 +780,10 @@ public void onFailure(String source, Exception e) { public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { try { if (newClusterState != null) { + // check to see that we monitor the correct master of the cluster + if (masterFD.masterNode() == null || !masterFD.masterNode().equals(newClusterState.nodes().getMasterNode())) { + masterFD.restart(newClusterState.nodes().getMasterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]"); + } publishClusterState.pendingStatesQueue().markAsProcessed(newClusterState); } } catch (Exception e) { diff --git a/core/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java b/core/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java index b24c5c367b4bc..837c74a3c61b5 100644 --- a/core/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java +++ b/core/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java @@ -244,7 +244,7 @@ public void testMasterFaultDetectionConnectOnDisconnect() throws InterruptedExce setState(clusterServiceA, state); MasterFaultDetection masterFD = new MasterFaultDetection(settings.build(), threadPool, serviceA, clusterServiceA); - masterFD.start(nodeB, "test"); + masterFD.restart(nodeB, "test"); final String[] failureReason = new String[1]; final DiscoveryNode[] failureNode = new DiscoveryNode[1]; @@ -290,14 +290,14 @@ public void testMasterFaultDetectionNotSizeLimited() throws InterruptedException MasterFaultDetection masterFDNodeA = new MasterFaultDetection(Settings.builder().put(settingsA).put(settings).build(), threadPool, serviceA, clusterServiceA); - masterFDNodeA.start(nodeB, "test"); + masterFDNodeA.restart(nodeB, "test"); final ClusterState stateNodeB = ClusterState.builder(clusterName).nodes(buildNodesForB(true)).build(); setState(clusterServiceB, stateNodeB); MasterFaultDetection masterFDNodeB = new MasterFaultDetection(Settings.builder().put(settingsB).put(settings).build(), threadPool, serviceB, clusterServiceB); - masterFDNodeB.start(nodeB, "test"); + masterFDNodeB.restart(nodeB, "test"); // let's do a few pings pingProbeA.awaitMinCompletedPings();