From 99d935c0a208073acca12892bbea5b5938d7b663 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Tue, 1 Sep 2020 13:24:29 +0200 Subject: [PATCH] Fix Concurrent Snapshot Create+Delete + Delete Index (#61770) (#61774) We had a bug here were we put a `null` value into the shard assignment mapping when reassigning work after a snapshot delete had gone through. This only affects partial snaphots but essentially dead-locks the snapshot process. Closes #61762 --- .../snapshots/ConcurrentSnapshotsIT.java | 29 ++++++++++++++++++- .../cluster/SnapshotsInProgress.java | 7 +++++ .../snapshots/SnapshotsService.java | 17 +++++++---- 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java index 8c8f2b1f9a1fe..d5fec93d3c365 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java @@ -819,6 +819,28 @@ public void testMultipleSnapshotsQueuedAfterDelete() throws Exception { assertAcked(deleteFuture.get()); } + public void testMultiplePartialSnapshotsQueuedAfterDelete() throws Exception { + final String masterNode = internalCluster().startMasterOnlyNode(); + internalCluster().startDataOnlyNode(); + final String repoName = "test-repo"; + createRepository(repoName, "mock"); + createIndexWithContent("index-one"); + createIndexWithContent("index-two"); + createNSnapshots(repoName, randomIntBetween(1, 5)); + + final ActionFuture deleteFuture = startAndBlockOnDeleteSnapshot(repoName, "*"); + final ActionFuture snapshotThree = startFullSnapshot(repoName, "snapshot-three", true); + final ActionFuture snapshotFour = startFullSnapshot(repoName, "snapshot-four", true); + awaitNSnapshotsInProgress(2); + + assertAcked(client().admin().indices().prepareDelete("index-two")); + unblockNode(repoName, masterNode); + + assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL)); + assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL)); + assertAcked(deleteFuture.get()); + } + public void testQueuedSnapshotsWaitingForShardReady() throws Exception { internalCluster().startMasterOnlyNode(); internalCluster().startDataOnlyNodes(2); @@ -1238,8 +1260,13 @@ private ActionFuture startFullSnapshotFromMasterClient(S } private ActionFuture startFullSnapshot(String repoName, String snapshotName) { + return startFullSnapshot(repoName, snapshotName, false); + } + + private ActionFuture startFullSnapshot(String repoName, String snapshotName, boolean partial) { logger.info("--> creating full snapshot [{}] to repo [{}]", snapshotName, repoName); - return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true).execute(); + return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true) + .setPartial(partial).execute(); } private void awaitClusterState(Predicate statePredicate) throws Exception { diff --git a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java index 5ffe67f5dd14f..3be6ce510602f 100644 --- a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java +++ b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java @@ -345,6 +345,13 @@ public static class ShardSnapshotStatus { public static final ShardSnapshotStatus UNASSIGNED_QUEUED = new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.QUEUED, null); + /** + * Shard snapshot status for shards that could not be snapshotted because their index was deleted from before the shard snapshot + * started. + */ + public static final ShardSnapshotStatus MISSING = + new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null); + private final ShardState state; @Nullable diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index 347eaa1c6ca50..9c482021a3918 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -2192,9 +2192,17 @@ private SnapshotsInProgress updatedSnapshotsInProgress(ClusterState currentState final ImmutableOpenMap.Builder updatedAssignmentsBuilder = ImmutableOpenMap.builder(entry.shards()); for (ShardId shardId : canBeUpdated) { - final boolean added = reassignedShardIds.add(shardId); - assert added; - updatedAssignmentsBuilder.put(shardId, shardAssignments.get(shardId)); + final ShardSnapshotStatus updated = shardAssignments.get(shardId); + if (updated == null) { + // We don't have a new assignment for this shard because its index was concurrently deleted + assert currentState.routingTable().hasIndex(shardId.getIndex()) == false : + "Missing assignment for [" + shardId + "]"; + updatedAssignmentsBuilder.put(shardId, ShardSnapshotStatus.MISSING); + } else { + final boolean added = reassignedShardIds.add(shardId); + assert added; + updatedAssignmentsBuilder.put(shardId, updated); + } } snapshotEntries.add(entry.withShards(updatedAssignmentsBuilder.build())); changed = true; @@ -2284,8 +2292,7 @@ private static ImmutableOpenMap