Fix Concurrent Snapshot Create+Delete + Delete Index (#61770) (#61774)
We had a bug here where we put a `null` value into the shard
assignment mapping when reassigning work after a snapshot delete
had gone through. This only affects partial snapshots but essentially
deadlocks the snapshot process.

Closes #61762
original-brownbear authored Sep 1, 2020
1 parent 13b9536 commit 99d935c
Showing 3 changed files with 47 additions and 6 deletions.
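
A minimal standalone sketch of the problem and the fix (the names and types below are illustrative stand-ins, not the actual Elasticsearch classes): when queued shard work is reassigned after a snapshot delete finishes, a shard whose index was deleted in the meantime has no new assignment, so the map lookup returns null. Storing that null left the shard permanently unfinished; the fix records such shards as MISSING so the partial snapshot can still finalize.

import java.util.HashMap;
import java.util.Map;

// Illustrative stand-ins only; not the real SnapshotsInProgress types.
enum ShardState { QUEUED, MISSING }

class ReassignAfterDeleteSketch {
    public static void main(String[] args) {
        // New shard assignments computed from the current cluster state.
        // A shard of a concurrently deleted index simply has no entry here.
        Map<String, ShardState> shardAssignments = new HashMap<>();
        shardAssignments.put("index-one[0]", ShardState.QUEUED);
        // no entry for "index-two[0]" -- index-two was deleted

        Map<String, ShardState> updatedAssignments = new HashMap<>();
        for (String shardId : new String[] { "index-one[0]", "index-two[0]" }) {
            ShardState updated = shardAssignments.get(shardId);
            if (updated == null) {
                // Before the fix, the null itself was stored, so the shard never
                // reached a completed state and the snapshot could not finalize.
                updatedAssignments.put(shardId, ShardState.MISSING);
            } else {
                updatedAssignments.put(shardId, updated);
            }
        }
        // index-one[0] stays QUEUED; index-two[0] is marked MISSING instead of null.
        System.out.println(updatedAssignments);
    }
}
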
@@ -819,6 +819,28 @@ public void testMultipleSnapshotsQueuedAfterDelete() throws Exception {
         assertAcked(deleteFuture.get());
     }
 
+    public void testMultiplePartialSnapshotsQueuedAfterDelete() throws Exception {
+        final String masterNode = internalCluster().startMasterOnlyNode();
+        internalCluster().startDataOnlyNode();
+        final String repoName = "test-repo";
+        createRepository(repoName, "mock");
+        createIndexWithContent("index-one");
+        createIndexWithContent("index-two");
+        createNSnapshots(repoName, randomIntBetween(1, 5));
+
+        final ActionFuture<AcknowledgedResponse> deleteFuture = startAndBlockOnDeleteSnapshot(repoName, "*");
+        final ActionFuture<CreateSnapshotResponse> snapshotThree = startFullSnapshot(repoName, "snapshot-three", true);
+        final ActionFuture<CreateSnapshotResponse> snapshotFour = startFullSnapshot(repoName, "snapshot-four", true);
+        awaitNSnapshotsInProgress(2);
+
+        assertAcked(client().admin().indices().prepareDelete("index-two"));
+        unblockNode(repoName, masterNode);
+
+        assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
+        assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
+        assertAcked(deleteFuture.get());
+    }
+
     public void testQueuedSnapshotsWaitingForShardReady() throws Exception {
         internalCluster().startMasterOnlyNode();
         internalCluster().startDataOnlyNodes(2);
@@ -1238,8 +1260,13 @@ private ActionFuture<CreateSnapshotResponse> startFullSnapshotFromMasterClient(S
     }
 
     private ActionFuture<CreateSnapshotResponse> startFullSnapshot(String repoName, String snapshotName) {
+        return startFullSnapshot(repoName, snapshotName, false);
+    }
+
+    private ActionFuture<CreateSnapshotResponse> startFullSnapshot(String repoName, String snapshotName, boolean partial) {
         logger.info("--> creating full snapshot [{}] to repo [{}]", snapshotName, repoName);
-        return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true).execute();
+        return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true)
+            .setPartial(partial).execute();
     }
 
     private void awaitClusterState(Predicate<ClusterState> statePredicate) throws Exception {
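
The new three-argument startFullSnapshot helper above shows how a partial snapshot is requested through the client API used in this diff. A sketch of the equivalent call from within the same test class (repository and snapshot names are placeholders), assuming an index is deleted while the snapshot runs, as in the new test:

CreateSnapshotResponse response = client().admin().cluster()
    .prepareCreateSnapshot("test-repo", "snapshot-partial")
    .setPartial(true)              // allow the snapshot to succeed with missing shards
    .setWaitForCompletion(true)    // resolve only once the snapshot has finished
    .get();
// When an index is deleted mid-snapshot, the partial snapshot completes as PARTIAL instead of hanging.
assertThat(response.getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
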
@@ -345,6 +345,13 @@ public static class ShardSnapshotStatus {
         public static final ShardSnapshotStatus UNASSIGNED_QUEUED =
             new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.QUEUED, null);
 
+        /**
+         * Shard snapshot status for shards that could not be snapshotted because their index was deleted before the shard snapshot
+         * started.
+         */
+        public static final ShardSnapshotStatus MISSING =
+            new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null);
+
         private final ShardState state;
 
         @Nullable
@@ -2192,9 +2192,17 @@ private SnapshotsInProgress updatedSnapshotsInProgress(ClusterState currentState
                 final ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> updatedAssignmentsBuilder =
                     ImmutableOpenMap.builder(entry.shards());
                 for (ShardId shardId : canBeUpdated) {
-                    final boolean added = reassignedShardIds.add(shardId);
-                    assert added;
-                    updatedAssignmentsBuilder.put(shardId, shardAssignments.get(shardId));
+                    final ShardSnapshotStatus updated = shardAssignments.get(shardId);
+                    if (updated == null) {
+                        // We don't have a new assignment for this shard because its index was concurrently deleted
+                        assert currentState.routingTable().hasIndex(shardId.getIndex()) == false :
+                            "Missing assignment for [" + shardId + "]";
+                        updatedAssignmentsBuilder.put(shardId, ShardSnapshotStatus.MISSING);
+                    } else {
+                        final boolean added = reassignedShardIds.add(shardId);
+                        assert added;
+                        updatedAssignmentsBuilder.put(shardId, updated);
+                    }
                 }
                 snapshotEntries.add(entry.withShards(updatedAssignmentsBuilder.build()));
                 changed = true;
@@ -2284,8 +2292,7 @@ private static ImmutableOpenMap<ShardId, SnapshotsInProgress.ShardSnapshotStatus
             IndexMetadata indexMetadata = metadata.index(indexName);
             if (indexMetadata == null) {
                 // The index was deleted before we managed to start the snapshot - mark it as missing.
-                builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0),
-                    new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null));
+                builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0), ShardSnapshotStatus.MISSING);
             } else {
                 final IndexRoutingTable indexRoutingTable = routingTable.index(indexName);
                 for (int i = 0; i < indexMetadata.getNumberOfShards(); i++) {
