Controlling discovery for decommissioned nodes #4590

Merged
Commits (46)
a015a81
Controlling discovery for decommissioned nodes
imRishN Sep 26, 2022
0a3f262
Fix spotless check
imRishN Sep 26, 2022
24fcaaf
Add changelog
imRishN Sep 26, 2022
c9e0f5b
Empty-Commit
imRishN Sep 26, 2022
bb0547d
Add basic UT
imRishN Sep 26, 2022
2ea4dc9
Add Consumer instead of listener
imRishN Sep 28, 2022
2582c4b
Remove UT
imRishN Sep 28, 2022
70b7d48
Refactor
imRishN Sep 28, 2022
a30b83d
Merge remote-tracking branch 'upstream/main' into decommission/contro…
imRishN Sep 28, 2022
da2fa40
Fix spotless check
imRishN Sep 28, 2022
61bb893
Improve logging msg
imRishN Sep 28, 2022
d872065
Fix spotless check
imRishN Sep 28, 2022
bee48b4
Add log msg in join helper
imRishN Sep 30, 2022
d02f9ae
Update peer finder interval to 2 min during decommission
imRishN Sep 30, 2022
6c0ce6c
Move flag to coordinator
imRishN Sep 30, 2022
01f2d70
Merge remote-tracking branch 'upstream/main' into decommission/contro…
imRishN Sep 30, 2022
8f82360
Merge remote-tracking branch 'upstream/main' into decommission/contro…
imRishN Oct 2, 2022
1c497b7
Add UT
imRishN Oct 3, 2022
e4d1354
Fix spotless check
imRishN Oct 3, 2022
1ac690d
Prevent join at join execute task and at coordinator. Add UTs
imRishN Oct 4, 2022
f448d78
Merge remote-tracking branch 'upstream/main' into decommission/contro…
imRishN Oct 4, 2022
adc4a8c
Move validator appropriately
imRishN Oct 5, 2022
69f2784
Merge remote-tracking branch 'upstream/main' into decommission/contro…
imRishN Oct 5, 2022
c42586b
Fix spotless check
imRishN Oct 5, 2022
a53e578
Make method pkg private
imRishN Oct 5, 2022
edff382
Ensure decommissioned node don't become leader
imRishN Oct 5, 2022
46c49b0
Add static helper for decommission flow
imRishN Oct 6, 2022
63f6c00
Updates in pre voting round
imRishN Oct 6, 2022
b8cf3fe
Move commission check
imRishN Oct 6, 2022
6a4c16f
Move commission check
imRishN Oct 6, 2022
50a6e67
Move helpers to Service
imRishN Oct 6, 2022
c9db6b1
Fix executor
imRishN Oct 6, 2022
bb6f573
Remove UT
imRishN Oct 6, 2022
8dd4457
Fix spotless check
imRishN Oct 6, 2022
19c524d
Minor
imRishN Oct 6, 2022
c5b1c0d
Merge remote-tracking branch 'upstream/main' into decommission/contro…
imRishN Oct 6, 2022
41cc099
Add built in join validator
imRishN Oct 6, 2022
260641a
Fix
imRishN Oct 6, 2022
8e161a6
Add UT for join
imRishN Oct 7, 2022
4a932b7
Merge remote-tracking branch 'upstream/main' into decommission/contro…
imRishN Oct 7, 2022
05d073e
Fix spotless check
imRishN Oct 7, 2022
3b6c1b7
Changes in coordinator
imRishN Oct 7, 2022
8b94b8d
Add UT for coordinator
imRishN Oct 7, 2022
165b961
Fix spotless check
imRishN Oct 7, 2022
e4e1b5c
Add test for execute method of task
imRishN Oct 7, 2022
0b3f106
Empty-Commit
imRishN Oct 7, 2022
Files changed
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -47,6 +47,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- [Remote Store] Change behaviour in replica recovery for remote translog enabled indices ([#4318](https://github.com/opensearch-project/OpenSearch/pull/4318))
- Unmute test RelocationIT.testRelocationWhileIndexingRandom ([#4580](https://github.com/opensearch-project/OpenSearch/pull/4580))
- Add DecommissionService and helper to execute awareness attribute decommissioning ([#4084](https://github.com/opensearch-project/OpenSearch/pull/4084))
- Controlling discovery for decommissioned nodes ([#4590](https://github.com/opensearch-project/OpenSearch/pull/4590))


### Deprecated
Coordinator.java
@@ -208,19 +208,6 @@ public Coordinator(
this.onJoinValidators = JoinTaskExecutor.addBuiltInJoinValidators(onJoinValidators);
this.singleNodeDiscovery = DiscoveryModule.isSingleNodeDiscovery(settings);
this.electionStrategy = electionStrategy;
this.joinHelper = new JoinHelper(
settings,
allocationService,
clusterManagerService,
transportService,
this::getCurrentTerm,
this::getStateForClusterManagerService,
this::handleJoinRequest,
this::joinLeaderInTerm,
this.onJoinValidators,
rerouteService,
nodeHealthService
);
this.persistedStateSupplier = persistedStateSupplier;
this.noClusterManagerBlockService = new NoClusterManagerBlockService(settings, clusterSettings);
this.lastKnownLeader = Optional.empty();
@@ -244,6 +231,20 @@ public Coordinator(
new HandshakingTransportAddressConnector(settings, transportService),
configuredHostsResolver
);
this.joinHelper = new JoinHelper(
settings,
allocationService,
clusterManagerService,
transportService,
this::getCurrentTerm,
this::getStateForClusterManagerService,
this::handleJoinRequest,
this::joinLeaderInTerm,
this.onJoinValidators,
rerouteService,
nodeHealthService,
peerFinder.nodeCommissionedListener()
);
this.publicationHandler = new PublicationTransportHandler(
transportService,
namedWriteableRegistry,
@@ -1451,6 +1452,11 @@ public void run() {
return;
}

if (peerFinder.localNodeDecommissioned()) {
logger.debug("skip prevoting as local node is decommissioned");
return;
}

if (prevotingRound != null) {
prevotingRound.close();
}
JoinHelper.java
@@ -42,6 +42,7 @@
import org.opensearch.cluster.ClusterStateTaskListener;
import org.opensearch.cluster.NotClusterManagerException;
import org.opensearch.cluster.coordination.Coordinator.Mode;
import org.opensearch.cluster.decommission.NodeDecommissionedException;
import org.opensearch.cluster.metadata.Metadata;
import org.opensearch.cluster.node.DiscoveryNode;
import org.opensearch.cluster.routing.RerouteService;
@@ -57,6 +58,7 @@
import org.opensearch.monitor.StatusInfo;
import org.opensearch.threadpool.ThreadPool;
import org.opensearch.threadpool.ThreadPool.Names;
import org.opensearch.transport.RemoteTransportException;
import org.opensearch.transport.TransportChannel;
import org.opensearch.transport.TransportException;
import org.opensearch.transport.TransportRequest;
@@ -112,6 +114,7 @@ public class JoinHelper {

private final TimeValue joinTimeout; // only used for Zen1 joining
private final NodeHealthService nodeHealthService;
private final ActionListener<Void> nodeCommissionedListener;

private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>());

@@ -130,12 +133,14 @@ public class JoinHelper {
Function<StartJoinRequest, Join> joinLeaderInTerm,
Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators,
RerouteService rerouteService,
NodeHealthService nodeHealthService
NodeHealthService nodeHealthService,
ActionListener<Void> nodeCommissionedListener
) {
this.clusterManagerService = clusterManagerService;
this.transportService = transportService;
this.nodeHealthService = nodeHealthService;
this.joinTimeout = JOIN_TIMEOUT_SETTING.get(settings);
this.nodeCommissionedListener = nodeCommissionedListener;
this.joinTaskExecutorGenerator = () -> new JoinTaskExecutor(settings, allocationService, logger, rerouteService, transportService) {

private final long term = currentTermSupplier.getAsLong();
@@ -342,6 +347,7 @@ public void handleResponse(Empty response) {
pendingOutgoingJoins.remove(dedupKey);
logger.debug("successfully joined {} with {}", destination, joinRequest);
lastFailedJoinAttempt.set(null);
nodeCommissionedListener.onResponse(null);
onCompletion.run();
}

@@ -352,6 +358,10 @@ public void handleException(TransportException exp) {
FailedJoinAttempt attempt = new FailedJoinAttempt(destination, joinRequest, exp);
attempt.logNow();
lastFailedJoinAttempt.set(attempt);
if (exp instanceof RemoteTransportException && (exp.getCause() instanceof NodeDecommissionedException)) {
(Inline review thread on this change; imRishN later marked it resolved.)

shwetathareja (Member):

The current flow is:

  1. The node sends the JoinRequest to the active leader.
  2. The active leader sends the ValidateJoinRequest.
  3. The node checks for decommissioning and fails the request if it has the decommissioned attribute.

The above logic will not work if the current master-eligible node is not the active leader and is a candidate accumulating joins (code ref):

 if (stateForJoinValidation.nodes().isLocalNodeElectedClusterManager()) {
    .....
   sendValidateJoinRequest(stateForJoinValidation, joinRequest, joinCallback);
 } else {
    processJoinRequest(joinRequest, joinCallback);
   }

Suggestion:
Ideally the decommissioning check should be executed at:

  1. handleJoinRequest in the Coordinator. This is a lightweight check, since the DiscoveryNode sends its attribute information during the join, so it rejects joins from a decommissioned node early on.
  2. The JoinTaskExecutor's submitted task as well, as that ensures correctness: it will never allow a node with the decommissioned attribute to join the cluster. (A sketch of such a check follows below.)
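
Note: a minimal sketch of the kind of check suggested here, written against the ensureNodeCommissioned(DiscoveryNode, Metadata) helper that appears later in this thread. The decommission-metadata accessors are assumptions for illustration, not the PR's exact code.

// Hedged sketch of the suggested commission check; only the ensureNodeCommissioned
// signature is taken from this PR, the metadata accessors are illustrative.
public static void ensureNodeCommissioned(DiscoveryNode node, Metadata metadata) {
    DecommissionAttributeMetadata decommissionMetadata = metadata.custom(DecommissionAttributeMetadata.TYPE);
    if (decommissionMetadata != null && decommissionMetadata.decommissionAttribute() != null) {
        DecommissionAttribute attribute = decommissionMetadata.decommissionAttribute();
        // reject the join when the joining node carries the decommissioned attribute value
        if (attribute.attributeValue().equals(node.getAttributes().get(attribute.attributeName()))) {
            throw new NodeDecommissionedException(
                "node [{}] has the decommissioned attribute [{}]",
                node,
                attribute.attributeName()
            );
        }
    }
}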

imRishN (Member, Author):

In that case, can you please help me understand: when the current leader-eligible node is not the active leader and is just accumulating joins, do we simply skip the validator code flow?

imRishN (Member, Author), Oct 4, 2022:

> The above logic will not work if the current master-eligible node is not the active leader and is a candidate accumulating joins

Thanks for pointing this out, @shwetathareja. I'm implementing the changes as per your suggestions.

imRishN (Member, Author):

Made these changes.

  1. A check in Coordinator#handleJoinRequest (when the state doesn't have STATE_NOT_RECOVERED_BLOCK).
  2. A check in the execute method of the join task (see the sketch after this comment).
  3. Removed it from the built-in validators.

@shwetathareja, lmk if this looks ok.
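
Note: for point 2, a rough sketch of the execute-time guard; the surrounding loop paraphrases the join task executor's general shape (joiningNodes, nodesBuilder, results are assumed names) and is not the PR's verbatim code.

// Hedged sketch: fail a join during JoinTaskExecutor.execute(...) when the joining node is
// decommissioned, so such a node never makes it into the cluster state even if the early
// handleJoinRequest check was bypassed (e.g. joins accumulated while still a candidate).
for (final Task joinTask : joiningNodes) {
    final DiscoveryNode node = joinTask.node();
    try {
        ensureNodeCommissioned(node, currentState.metadata());
        nodesBuilder.add(node);
        results.success(joinTask);
    } catch (IllegalStateException | NodeDecommissionedException e) {
        results.failure(joinTask, e);
    }
}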

imRishN (Member, Author):

Hey, I explored join validators some more:

> During handleJoinRequest in the Coordinator. This is a lightweight check since the DiscoveryNode sends attribute information during the join. This will reject any joins from a decommissioned node early on.

I think we can keep it as a built-in join validator, since Coordinator#handleJoinRequest runs all of these validators anyway, so the join-request path is still covered. Another case where keeping it as a built-in validator would help is handlePublishRequest. Ideally we would never hit that case, as it runs only in Mode.Leader, but for any such cases in the future it would be good to have all the validators in one place (see the sketch below).
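
Note: a rough sketch of what keeping the check next to the other built-in validators could look like; this paraphrases the shape of JoinTaskExecutor.addBuiltInJoinValidators rather than quoting the PR, and the neighbouring compatibility checks are assumptions about the existing validator.

// Hedged sketch: register the commission check alongside the existing built-in join
// validators, so every caller of onJoinValidators (handleJoinRequest, handlePublishRequest)
// exercises it without a separate code path.
public static Collection<BiConsumer<DiscoveryNode, ClusterState>> addBuiltInJoinValidators(
    Collection<BiConsumer<DiscoveryNode, ClusterState>> onJoinValidators
) {
    final Collection<BiConsumer<DiscoveryNode, ClusterState>> validators = new ArrayList<>();
    validators.add((node, state) -> {
        ensureNodesCompatibility(node.getVersion(), state.getNodes());
        ensureIndexCompatibility(node.getVersion(), state.getMetadata());
        ensureNodeCommissioned(node, state.getMetadata());
    });
    validators.addAll(onJoinValidators);
    return Collections.unmodifiableCollection(validators);
}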

shwetathareja (Member):

The built-in validator delays a check that can be done early on, during handleJoinRequest itself. Can you help me understand the handlePublishRequest scenario?

imRishN (Member, Author):

transportService.connectToNode(joinRequest.getSourceNode(), ActionListener.wrap(ignore -> {
            final ClusterState stateForJoinValidation = getStateForClusterManagerService();

            if (stateForJoinValidation.nodes().isLocalNodeElectedClusterManager()) {
                onJoinValidators.forEach(a -> a.accept(joinRequest.getSourceNode(), stateForJoinValidation));
                if (stateForJoinValidation.getBlocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) {
                    // we do this in a couple of places including the cluster update thread. This one here is really just best effort
                    // to ensure we fail as fast as possible.
                    JoinTaskExecutor.ensureMajorVersionBarrier(
                        joinRequest.getSourceNode().getVersion(),
                        stateForJoinValidation.getNodes().getMinNodeVersion()
                    );
                    // we are checking source node commission status here to reject any join request coming from a decommissioned node
                    // even before executing the join task to fail fast
                    JoinTaskExecutor.ensureNodeCommissioned(joinRequest.getSourceNode(), stateForJoinValidation.metadata());
                }
                sendValidateJoinRequest(stateForJoinValidation, joinRequest, joinCallback);
            } else {
                processJoinRequest(joinRequest, joinCallback);
            }
        }, joinCallback::onFailure));

This is the snippet from Coordinator#handleJoinRequest. As you can see, this method also runs the same built-in validators via onJoinValidators.forEach(a -> a.accept(joinRequest.getSourceNode(), stateForJoinValidation)), so the concern about checking early is already addressed here.

imRishN (Member, Author):

We might never actually hit the handlePublishRequest scenario, as it runs only in Leader mode. What I was trying to say is that plugins put their join validators in onJoinValidators, the Coordinator adds all the other validators to the same collection, and then both handlePublishRequest and handleJoinRequest validate with whatever onJoinValidators holds. Hence, keeping all the validators centrally in onJoinValidators might be good.

shwetathareja (Member):

handlePublishRequest is executed on the node joining the master, but with the check inside handleJoinRequest the master will reject such joins upfront.

imRishN (Member, Author), Oct 7, 2022:

Correct. I meant that the built-in validators are also executed in handleJoinRequest.

(End of review thread; the diff continues below.)

logger.info("local node is decommissioned. Will not be able to join the cluster");
nodeCommissionedListener.onFailure(exp);
}
onCompletion.run();
}

ClusterSettings.java
@@ -534,6 +534,7 @@ public void apply(Settings value, Settings current, Settings previous) {
PersistentTasksClusterService.CLUSTER_TASKS_ALLOCATION_RECHECK_INTERVAL_SETTING,
EnableAssignmentDecider.CLUSTER_TASKS_ALLOCATION_ENABLE_SETTING,
PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING,
PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_DURING_DECOMMISSION_SETTING,
PeerFinder.DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING,
ClusterFormationFailureHelper.DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING,
ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING,
35 changes: 34 additions & 1 deletion server/src/main/java/org/opensearch/discovery/PeerFinder.java
@@ -84,14 +84,23 @@ public abstract class PeerFinder {
Setting.Property.NodeScope
);

// the time between attempts to find all peers when node is in decommissioned state, default set to 3 minutes
public static final Setting<TimeValue> DISCOVERY_FIND_PEERS_INTERVAL_DURING_DECOMMISSION_SETTING = Setting.timeSetting(
"discovery.find_peers_interval_during_decommission",
TimeValue.timeValueMinutes(3L),
TimeValue.timeValueMillis(1000),
Setting.Property.NodeScope
);

public static final Setting<TimeValue> DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING = Setting.timeSetting(
"discovery.request_peers_timeout",
TimeValue.timeValueMillis(3000),
TimeValue.timeValueMillis(1),
Setting.Property.NodeScope
);

private final TimeValue findPeersInterval;
private final Settings settings;
private TimeValue findPeersInterval;
private final TimeValue requestPeersTimeout;

private final Object mutex = new Object();
@@ -101,6 +110,7 @@ public abstract class PeerFinder {

private volatile long currentTerm;
private boolean active;
private boolean localNodeDecommissioned = false;
private DiscoveryNodes lastAcceptedNodes;
private final Map<TransportAddress, Peer> peersByAddress = new LinkedHashMap<>();
private Optional<DiscoveryNode> leader = Optional.empty();
@@ -112,6 +122,7 @@ public PeerFinder(
TransportAddressConnector transportAddressConnector,
ConfiguredHostsResolver configuredHostsResolver
) {
this.settings = settings;
findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(settings);
requestPeersTimeout = DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(settings);
this.transportService = transportService;
@@ -128,6 +139,28 @@ public PeerFinder(
);
}

public ActionListener<Void> nodeCommissionedListener() {
return new ActionListener<Void>() {
@Override
public void onResponse(Void unused) {
localNodeDecommissioned = false;
findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(settings);
logger.info("updated findPeersInterval to [{}] as node is commissioned", findPeersInterval);
}

@Override
public void onFailure(Exception e) {
localNodeDecommissioned = true;
findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_DURING_DECOMMISSION_SETTING.get(settings);
logger.info("updated findPeersInterval to [{}] as node is decommissioned", findPeersInterval);
}
};
}

public boolean localNodeDecommissioned() {
return localNodeDecommissioned;
}

public void activate(final DiscoveryNodes lastAcceptedNodes) {
logger.trace("activating with {}", lastAcceptedNodes);

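Note on the PeerFinder change above: discovery.find_peers_interval_during_decommission is a node-scope setting, so it can be tuned like the existing find-peers interval. A hedged sketch with illustrative values (only the setting keys come from this diff; the PR's default for the decommission interval is 3 minutes):

// Illustrative only: keep the normal peer-finding cadence at 1s, but back off to 2m once
// the local node learns it has been decommissioned.
Settings nodeSettings = Settings.builder()
    .put(PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING.getKey(), "1s")
    .put(PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_DURING_DECOMMISSION_SETTING.getKey(), "2m")
    .build();
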
JoinHelperTests.java
@@ -33,6 +33,7 @@

import org.apache.logging.log4j.Level;
import org.opensearch.Version;
import org.opensearch.action.ActionListener;
import org.opensearch.action.ActionListenerResponseHandler;
import org.opensearch.action.support.PlainActionFuture;
import org.opensearch.cluster.ClusterName;
@@ -55,6 +56,7 @@
import java.util.Optional;
import java.util.concurrent.atomic.AtomicReference;

import static org.mockito.Mockito.mock;
import static org.opensearch.monitor.StatusInfo.Status.HEALTHY;
import static org.opensearch.monitor.StatusInfo.Status.UNHEALTHY;
import static org.opensearch.node.Node.NODE_NAME_SETTING;
@@ -90,7 +92,8 @@ public void testJoinDeduplication() {
startJoinRequest -> { throw new AssertionError(); },
Collections.emptyList(),
(s, p, r) -> {},
() -> new StatusInfo(HEALTHY, "info")
() -> new StatusInfo(HEALTHY, "info"),
mock(ActionListener.class)
);
transportService.start();

@@ -230,7 +233,8 @@ private void assertJoinValidationRejectsMismatchedClusterUUID(String actionName,
startJoinRequest -> { throw new AssertionError(); },
Collections.emptyList(),
(s, p, r) -> {},
null
null,
mock(ActionListener.class)
); // registers request handler
transportService.start();
transportService.acceptIncomingRequests();
@@ -284,7 +288,8 @@ public void testJoinFailureOnUnhealthyNodes() {
startJoinRequest -> { throw new AssertionError(); },
Collections.emptyList(),
(s, p, r) -> {},
() -> nodeHealthServiceStatus.get()
() -> nodeHealthServiceStatus.get(),
mock(ActionListener.class)
);
transportService.start();

PeerFinderTests.java
@@ -807,6 +807,14 @@ public void testReconnectsToDisconnectedNodes() {
assertFoundPeers(rebootedOtherNode);
}

public void testNodeCommissioning() {
peerFinder.nodeCommissionedListener().onFailure(new Exception("unit-test"));
assertTrue(peerFinder.localNodeDecommissioned());

peerFinder.nodeCommissionedListener().onResponse(null);
assertFalse(peerFinder.localNodeDecommissioned());
}

private void respondToRequests(Function<DiscoveryNode, PeersResponse> responseFactory) {
final CapturedRequest[] capturedRequests = capturingTransport.getCapturedRequestsAndClear();
for (final CapturedRequest capturedRequest : capturedRequests) {