Controlling discovery for decommissioned nodes #4590
@@ -42,6 +42,7 @@
 import org.opensearch.cluster.ClusterStateTaskListener;
 import org.opensearch.cluster.NotClusterManagerException;
 import org.opensearch.cluster.coordination.Coordinator.Mode;
+import org.opensearch.cluster.decommission.NodeDecommissionedException;
 import org.opensearch.cluster.metadata.Metadata;
 import org.opensearch.cluster.node.DiscoveryNode;
 import org.opensearch.cluster.routing.RerouteService;
@@ -57,6 +58,7 @@
 import org.opensearch.monitor.StatusInfo;
 import org.opensearch.threadpool.ThreadPool;
 import org.opensearch.threadpool.ThreadPool.Names;
+import org.opensearch.transport.RemoteTransportException;
 import org.opensearch.transport.TransportChannel;
 import org.opensearch.transport.TransportException;
 import org.opensearch.transport.TransportRequest;
@@ -78,6 +80,7 @@
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.BiConsumer;
+import java.util.function.Consumer;
 import java.util.function.Function;
 import java.util.function.LongSupplier;
 import java.util.function.Supplier;
@@ -118,6 +121,7 @@ public class JoinHelper {
     private final AtomicReference<FailedJoinAttempt> lastFailedJoinAttempt = new AtomicReference<>();

     private final Supplier<JoinTaskExecutor> joinTaskExecutorGenerator;
+    private final Consumer<Boolean> nodeCommissioned;

     JoinHelper(
         Settings settings,
@@ -130,12 +134,14 @@ public class JoinHelper {
         Function<StartJoinRequest, Join> joinLeaderInTerm,
         Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators,
         RerouteService rerouteService,
-        NodeHealthService nodeHealthService
+        NodeHealthService nodeHealthService,
+        Consumer<Boolean> nodeCommissioned
     ) {
         this.clusterManagerService = clusterManagerService;
         this.transportService = transportService;
         this.nodeHealthService = nodeHealthService;
         this.joinTimeout = JOIN_TIMEOUT_SETTING.get(settings);
+        this.nodeCommissioned = nodeCommissioned;
         this.joinTaskExecutorGenerator = () -> new JoinTaskExecutor(settings, allocationService, logger, rerouteService, transportService) {

             private final long term = currentTermSupplier.getAsLong();
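The new `Consumer<Boolean>` parameter is simply a callback through which JoinHelper reports the local node's commission status back to its owner. The sketch below illustrates that contract in isolation; the class and method names are made up for illustration and are not taken from this PR.

```java
import java.util.function.Consumer;

// Illustrative holder for the local node's commission status (names hypothetical).
// The owner keeps the flag and hands JoinHelper only a setter, so JoinHelper can flip
// it on join success or failure without depending on the owner's type.
class CommissionStatus {
    private volatile boolean localNodeCommissioned = true; // assume commissioned until told otherwise

    // This is the Consumer<Boolean> shape that would be passed to the JoinHelper constructor.
    Consumer<Boolean> asSetter() {
        return commissioned -> this.localNodeCommissioned = commissioned;
    }

    boolean isLocalNodeCommissioned() {
        return localNodeCommissioned;
    }
}
```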
@@ -342,6 +348,7 @@ public void handleResponse(Empty response) {
     pendingOutgoingJoins.remove(dedupKey);
     logger.debug("successfully joined {} with {}", destination, joinRequest);
     lastFailedJoinAttempt.set(null);
+    nodeCommissioned.accept(true);
     onCompletion.run();
 }
@@ -352,6 +359,13 @@ public void handleException(TransportException exp) {
     FailedJoinAttempt attempt = new FailedJoinAttempt(destination, joinRequest, exp);
     attempt.logNow();
     lastFailedJoinAttempt.set(attempt);
+    if (exp instanceof RemoteTransportException && (exp.getCause() instanceof NodeDecommissionedException)) {
+        logger.info(
+            "local node is decommissioned [{}]. Will not be able to join the cluster",
+            exp.getCause().getMessage()
+        );
+        nodeCommissioned.accept(false);
+    }
     onCompletion.run();
 }

Review thread on this check:

- The current flow is: […]. The above logic will not work if the current master-eligible node is not the active leader and is a candidate accumulating joins (code ref). Suggestion: […]
- In that case, can you please help me understand: when the current leader-eligible node is not the active leader and is just accumulating joins, do we just skip the validator code flow?
- Thanks for pointing this out @shwetathareja. As per your suggestions, implementing the changes.
- Made these changes. @shwetathareja, lmk if this looks ok.
- Hey, I explored more on join validators. I think we can keep it as a built-in join validator; Coordinator#handleJoinRequest validates all the built-in checks, so handleJoinRequest would be covered. Another case where keeping it as a built-in validator would help is handlePublishRequest. Ideally we would never hit that case, as it runs only in Mode.LEADER, but for any such cases in the future it would be good to have all the validators in one place.
- A built-in validator delays a check that could be done early, during handleJoinRequest itself. Can you help me understand the handlePublishRequest scenario?
- This is the snippet from handleJoinRequest of Coordinator. If you see, this method also runs the same built-in validators.
- We might never actually hit the …
- handlePublishRequest is executed on the node joining the master. But with the check inside …
- Correct. I meant the built-in validators are also executed in handleJoinRequest.
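To make the built-in join validator idea from the thread concrete, here is a rough sketch of what such a check could look like. It is only an illustration: the class name, the `ensureNodeCommissioned` method, and the decommission metadata accessors are assumptions based on this PR series, not code shown in the diff above.

```java
import org.opensearch.cluster.decommission.DecommissionAttribute;
import org.opensearch.cluster.decommission.DecommissionAttributeMetadata;
import org.opensearch.cluster.decommission.NodeDecommissionedException;
import org.opensearch.cluster.metadata.Metadata;
import org.opensearch.cluster.node.DiscoveryNode;

// Sketch of a built-in join validator (names and accessors assumed, not from the diff):
// reject a joining node whose attribute matches the decommissioned attribute recorded
// in cluster metadata.
public final class DecommissionJoinValidator {

    public static void ensureNodeCommissioned(DiscoveryNode node, Metadata metadata) {
        DecommissionAttributeMetadata decommissionMetadata = metadata.custom(DecommissionAttributeMetadata.TYPE);
        if (decommissionMetadata == null || decommissionMetadata.decommissionAttribute() == null) {
            return; // no decommission recorded, nothing to validate
        }
        DecommissionAttribute attribute = decommissionMetadata.decommissionAttribute();
        String nodeAttributeValue = node.getAttributes().get(attribute.attributeName());
        if (attribute.attributeValue().equals(nodeAttributeValue)) {
            throw new NodeDecommissionedException(
                "node [" + node.getId() + "] has the decommissioned attribute [" + attribute.attributeName() + "]"
            );
        }
    }
}
```

Registered as a built-in validator, the same check would run wherever the join validators are applied, including Coordinator#handleJoinRequest, which is the "all validators in one place" argument made in the thread.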
Review thread on resetting the local commission status:

- Every time the OpenSearch process is bounced, localNodeCommissioned will get reset to true. Now suppose there is a decommissioned master-eligible node in the cluster, and at that point there is no active leader (temporary, no quorum loss). Then this node can become a candidate, collect votes from others, and become master. It may end up abdicating later (we need to cross-check that flow again), but it would cause unnecessary churn in the cluster. To mitigate this, JoinTaskExecutor should also ensure that the node which is going to become the active leader is not decommissioned.
- Made changes for this. LMK if you think there could be a better way to do this.
- Let's do the check in the pre-voting round itself using lastAcceptedState. There is no need for the localNodeCommissioned variable check, and there is no need for a decommission check in JoinTaskExecutor for the leader.
- Okay.
- Can we have a stale lastAcceptedState, or none at all, for a newly joining master-eligible node?
- lastAcceptedState is the state last written to its disk. If its last state thinks it was decommissioned, we can let it stay out of pre-voting and let other master-eligible nodes form the cluster. Through its ping request, it would be able to join the cluster if it was not decommissioned.
- Also, lastAcceptedState is considered the source of truth to decide whether it can win the election or not (based on configuration and exclusions).
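As a rough illustration of the pre-voting suggestion, the node could consult the decommission status recorded in its lastAcceptedState before taking part in an election round. This is a sketch under the assumption that the decommission attribute metadata from this PR series is present in the persisted state; the class and method names are hypothetical.

```java
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.decommission.DecommissionAttribute;
import org.opensearch.cluster.decommission.DecommissionAttributeMetadata;
import org.opensearch.cluster.node.DiscoveryNode;

// Sketch: decide whether the local node should take part in pre-voting based on the
// last cluster state it accepted (i.e. the state last persisted to disk).
final class PreVoteCommissionCheck {

    static boolean mayParticipateInPreVoting(ClusterState lastAcceptedState, DiscoveryNode localNode) {
        DecommissionAttributeMetadata decommissionMetadata = lastAcceptedState.metadata()
            .custom(DecommissionAttributeMetadata.TYPE);
        if (decommissionMetadata == null || decommissionMetadata.decommissionAttribute() == null) {
            return true; // nothing decommissioned as far as the persisted state knows
        }
        DecommissionAttribute attribute = decommissionMetadata.decommissionAttribute();
        // Stay out of pre-voting if the persisted state says the local node carries the
        // decommissioned attribute; a fresh join attempt will re-establish the real status.
        return attribute.attributeValue().equals(localNode.getAttributes().get(attribute.attributeName())) == false;
    }
}
```

If that persisted state turns out to be stale (for example, the node was recommissioned while it was down), the node simply sits out of pre-voting and re-establishes its real status through a regular ping/join attempt, as described in the thread.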