From d7a1eaa7f51d0972d10c0df1d3cd77d6b755dd41 Mon Sep 17 00:00:00 2001 From: Andrei Dan Date: Mon, 21 Sep 2020 19:12:00 +0100 Subject: [PATCH] ILM: allow check-migration step to continue if tier setting unset (#62636) This allows the `check-migration` step to move past the allocation check if the tier routing settings are manually unset. This helps a user unblock ILM in case a tier is removed (ie. if the warm tier is decommissioned this will allow users to resume the ILM policies stuck in `check-migration` waiting for the warm nodes to become available and the managed index to allocate. this allows the index to allocate on the other available tiers) --- .../core/ilm/DataTierMigrationRoutedStep.java | 22 ++++- .../ilm/DataTierMigrationRoutedStepTests.java | 56 +++++++++++- .../xpack/ilm/DataTiersMigrationsTests.java | 91 +++++++++++++++++++ 3 files changed, 160 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java index 1e3b284ec1072..fc8ecf107f15d 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStep.java @@ -13,6 +13,7 @@ import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodeRole; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; @@ -26,8 +27,9 @@ import java.util.Set; import static org.elasticsearch.cluster.node.DiscoveryNodeRole.DATA_ROLE; -import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_INCLUDE_SETTING; +import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_PREFER_SETTING; import static org.elasticsearch.xpack.core.ilm.AllocationRoutedStep.getPendingAllocations; +import static org.elasticsearch.xpack.core.ilm.step.info.AllocationInfo.waitingForActiveShardsAllocationInfo; /** * Checks whether all shards have been correctly routed in response to updating the allocation rules for an index in order @@ -71,11 +73,23 @@ public Result isConditionMet(Index index, ClusterState clusterState) { logger.debug("[{}] lifecycle action for index [{}] executed but index no longer exists", getKey().getAction(), index.getName()); return new Result(false, null); } - String destinationTier = INDEX_ROUTING_INCLUDE_SETTING.get(idxMeta.getSettings()); + String destinationTier = INDEX_ROUTING_PREFER_SETTING.get(idxMeta.getSettings()); if (ActiveShardCount.ALL.enoughShardsActive(clusterState, index.getName()) == false) { - logger.debug("[{}] migration of index [{}] to the [{}] tier cannot progress, as not all shards are active", + if (Strings.isEmpty(destinationTier)) { + logger.debug("[{}] lifecycle action for index [{}] cannot make progress because not all shards are active", + getKey().getAction(), index.getName()); + } else { + logger.debug("[{}] migration of index [{}] to the [{}] tier cannot progress, as not all shards are active", getKey().getAction(), index.getName(), destinationTier); - return new Result(false, AllocationInfo.waitingForActiveShardsAllocationInfo(idxMeta.getNumberOfReplicas())); + } + return new Result(false, waitingForActiveShardsAllocationInfo(idxMeta.getNumberOfReplicas())); + } + + if (Strings.isEmpty(destinationTier)) { + logger.debug("index [{}] has no data tier routing setting configured and all its shards are active. considering the [{}] " + + "step condition met and continuing to the next step", index.getName(), getKey().getName()); + // the user removed the tier routing setting and all the shards are active so we'll cary on + return new Result(true, null); } int allocationPendingAllShards = getPendingAllocations(index, ALLOCATION_DECIDERS, clusterState); diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java index 13ae30adba4f6..6518706544aa7 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/DataTierMigrationRoutedStepTests.java @@ -30,7 +30,7 @@ import java.util.Set; import static java.util.Collections.emptyMap; -import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_INCLUDE_SETTING; +import static org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider.INDEX_ROUTING_PREFER; import static org.elasticsearch.xpack.core.ilm.step.info.AllocationInfo.waitingForActiveShardsAllocationInfo; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.nullValue; @@ -95,7 +95,7 @@ public void testExecuteWithUnassignedShard() { public void testExecuteWithPendingShards() { IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10)) - .settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM)) + .settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM)) .numberOfShards(1).numberOfReplicas(0).build(); Index index = indexMetadata.getIndex(); IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index) @@ -122,7 +122,7 @@ public void testExecuteWithPendingShards() { public void testExecuteWithPendingShardsAndTargetRoleNotPresentInCluster() { IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10)) - .settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM)) + .settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM)) .numberOfShards(1).numberOfReplicas(0).build(); Index index = indexMetadata.getIndex(); IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index) @@ -159,7 +159,7 @@ public void testExecuteIndexMissing() { public void testExecuteIsComplete() { IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10)) - .settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM)) + .settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM)) .numberOfShards(1).numberOfReplicas(0).build(); Index index = indexMetadata.getIndex(); IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index) @@ -181,7 +181,7 @@ public void testExecuteIsComplete() { public void testExecuteWithGenericDataNodes() { IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10)) - .settings(settings(Version.CURRENT).put(INDEX_ROUTING_INCLUDE_SETTING.getKey(), DataTier.DATA_WARM)) + .settings(settings(Version.CURRENT).put(INDEX_ROUTING_PREFER, DataTier.DATA_WARM)) .numberOfShards(1).numberOfReplicas(0).build(); Index index = indexMetadata.getIndex(); IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index) @@ -200,6 +200,52 @@ public void testExecuteWithGenericDataNodes() { assertThat(result.getInfomationContext(), is(nullValue())); } + public void testExecuteForIndexWithoutTierRoutingInformationWaitsForReplicasToBeActive() { + IndexMetadata indexMetadata = IndexMetadata.builder(randomAlphaOfLengthBetween(5, 10)) + .settings(settings(Version.CURRENT)) + .numberOfShards(1).numberOfReplicas(1).build(); + Index index = indexMetadata.getIndex(); + { + IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index) + .addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node1", true, ShardRoutingState.STARTED)) + .addReplica(); + + ClusterState clusterState = + ClusterState.builder(ClusterState.EMPTY_STATE).metadata(Metadata.builder().put(indexMetadata, true).build()) + .nodes(DiscoveryNodes.builder() + .add(newNode("node1", Collections.singleton(DataTier.DATA_HOT_NODE_ROLE))) + ) + .routingTable(RoutingTable.builder().add(indexRoutingTable).build()) + .build(); + DataTierMigrationRoutedStep step = createRandomInstance(); + Result expectedResult = new Result(false, waitingForActiveShardsAllocationInfo(1)); + + Result result = step.isConditionMet(index, clusterState); + assertThat(result.isComplete(), is(false)); + assertThat(result.getInfomationContext(), is(expectedResult.getInfomationContext())); + } + + { + IndexRoutingTable.Builder indexRoutingTable = IndexRoutingTable.builder(index) + .addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node1", true, ShardRoutingState.STARTED)) + .addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node2", false, ShardRoutingState.STARTED)); + + ClusterState clusterState = + ClusterState.builder(ClusterState.EMPTY_STATE).metadata(Metadata.builder().put(indexMetadata, true).build()) + .nodes(DiscoveryNodes.builder() + .add(newNode("node1", Collections.singleton(DataTier.DATA_HOT_NODE_ROLE))) + .add(newNode("node2", Collections.singleton(DataTier.DATA_WARM_NODE_ROLE))) + ) + .routingTable(RoutingTable.builder().add(indexRoutingTable).build()) + .build(); + DataTierMigrationRoutedStep step = createRandomInstance(); + + Result result = step.isConditionMet(index, clusterState); + assertThat(result.isComplete(), is(true)); + assertThat(result.getInfomationContext(), is(nullValue())); + } + } + private DiscoveryNode newNode(String nodeId, Set roles) { return new DiscoveryNode(nodeId, buildNewFakeTransportAddress(), emptyMap(), roles, Version.CURRENT); } diff --git a/x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java b/x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java index 7b8f9353cd654..da9d66b78f439 100644 --- a/x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java +++ b/x-pack/plugin/ilm/src/internalClusterTest/java/org/elasticsearch/xpack/ilm/DataTiersMigrationsTests.java @@ -6,11 +6,16 @@ package org.elasticsearch.xpack.ilm; +import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest; +import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainResponse; import org.elasticsearch.action.admin.indices.create.CreateIndexResponse; +import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; +import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.xpack.cluster.routing.allocation.DataTierAllocationDecider; import org.elasticsearch.xpack.core.DataTier; import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin; import org.elasticsearch.xpack.core.XPackSettings; @@ -30,6 +35,7 @@ import java.util.Collections; import java.util.Locale; import java.util.Map; +import java.util.concurrent.TimeUnit; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; @@ -138,4 +144,89 @@ public void testIndexDataTierMigration() throws Exception { assertThat(indexLifecycleExplainResponse.getStep(), is("complete")); }); } + + public void testUserOptsOutOfTierMigration() throws Exception { + internalCluster().startMasterOnlyNodes(1, Settings.EMPTY); + logger.info("starting hot data node"); + internalCluster().startNode(hotNode(Settings.EMPTY)); + + Phase hotPhase = new Phase("hot", TimeValue.ZERO, Collections.emptyMap()); + Phase warmPhase = new Phase("warm", TimeValue.ZERO, Collections.emptyMap()); + Phase coldPhase = new Phase("cold", TimeValue.ZERO, Collections.emptyMap()); + LifecyclePolicy lifecyclePolicy = new LifecyclePolicy(policy, Map.of("hot", hotPhase, "warm", warmPhase, "cold", coldPhase)); + PutLifecycleAction.Request putLifecycleRequest = new PutLifecycleAction.Request(lifecyclePolicy); + PutLifecycleAction.Response putLifecycleResponse = client().execute(PutLifecycleAction.INSTANCE, putLifecycleRequest).get(); + assertAcked(putLifecycleResponse); + + Settings settings = Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 1).put(LifecycleSettings.LIFECYCLE_NAME, policy).build(); + CreateIndexResponse res = client().admin().indices().prepareCreate(managedIndex).setSettings(settings).get(); + assertTrue(res.isAcknowledged()); + + assertBusy(() -> { + ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); + ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, + explainRequest).get(); + + IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); + assertThat(indexLifecycleExplainResponse.getPhase(), is("warm")); + assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME)); + }); + + Settings removeTierRoutingSetting = Settings.builder().putNull(DataTierAllocationDecider.INDEX_ROUTING_PREFER).build(); + UpdateSettingsRequest updateSettingsRequest = new UpdateSettingsRequest(managedIndex).settings(removeTierRoutingSetting); + assertAcked(client().admin().indices().updateSettings(updateSettingsRequest).actionGet()); + + assertBusy(() -> { + ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); + ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, + explainRequest).get(); + + IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); + assertThat(indexLifecycleExplainResponse.getPhase(), is("warm")); + assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME)); + assertReplicaIsUnassigned(); + }, 30, TimeUnit.SECONDS); + + internalCluster().startNode(coldNode(Settings.EMPTY)); + + // the index should successfully allocate + ensureGreen(managedIndex); + + // the index is successfully allocated but the migrate action from the cold phase re-configured the tier migration setting to the + // cold tier so ILM is stuck in `check-migration` in the cold phase this time + // we have 2 options to resume the ILM execution: + // 1. start another cold node so both the primary and replica can relocate to the cold nodes + // 2. remove the tier routing setting from the index again (we're doing this below) + assertBusy(() -> { + ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); + ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, + explainRequest).get(); + + IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); + assertThat(indexLifecycleExplainResponse.getPhase(), is("cold")); + assertThat(indexLifecycleExplainResponse.getStep(), is(DataTierMigrationRoutedStep.NAME)); + }); + + // remove the tier routing setting again + assertAcked(client().admin().indices().updateSettings(updateSettingsRequest).actionGet()); + + // wait for lifecycle to complete in the cold phase + assertBusy(() -> { + ExplainLifecycleRequest explainRequest = new ExplainLifecycleRequest().indices(managedIndex); + ExplainLifecycleResponse explainResponse = client().execute(ExplainLifecycleAction.INSTANCE, + explainRequest).get(); + + IndexLifecycleExplainResponse indexLifecycleExplainResponse = explainResponse.getIndexResponses().get(managedIndex); + assertThat(indexLifecycleExplainResponse.getPhase(), is("cold")); + assertThat(indexLifecycleExplainResponse.getStep(), is("complete")); + }, 30, TimeUnit.SECONDS); + } + + private void assertReplicaIsUnassigned() { + ClusterAllocationExplainRequest explainReplicaShard = + new ClusterAllocationExplainRequest().setIndex(managedIndex).setPrimary(false).setShard(0); + ClusterAllocationExplainResponse response = client().admin().cluster().allocationExplain(explainReplicaShard).actionGet(); + assertThat(response.getExplanation().getShardState(), is(ShardRoutingState.UNASSIGNED)); + } }