Skip to content

Commit

Permalink
Check latest repoData before applying workaround for missing shardGen…
Browse files Browse the repository at this point in the history
… file (elastic#112979)

It is expected that the old master may attempt to read a shardGen file
that is deleted by the new master. This PR checks the latest repo data
before applying the workaround (or throwing AssertionError) for missing
shardGen files.

Relates: elastic#112337 Resolves: elastic#112811
(cherry picked from commit 99b5ed8)

# Conflicts:
#	muted-tests.yml
  • Loading branch information
ywangd committed Sep 19, 2024
1 parent 9f0ae87 commit 364f54c
Showing 1 changed file with 27 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,13 @@ public void cloneShardSnapshot(
existingSnapshots = tuple.v1();
} else {
newGen = ShardGeneration.newGeneration();
existingSnapshots = buildBlobStoreIndexShardSnapshots(index, Collections.emptySet(), shardContainer, shardGeneration).v1();
existingSnapshots = buildBlobStoreIndexShardSnapshots(
index,
shardNum,
Collections.emptySet(),
shardContainer,
shardGeneration
).v1();
existingShardGen = shardGeneration;
}
SnapshotFiles existingTargetFiles = null;
Expand Down Expand Up @@ -1309,6 +1315,7 @@ protected void doRun() throws Exception {
newGen = -1L;
blobStoreIndexShardSnapshots = buildBlobStoreIndexShardSnapshots(
indexId,
shardId,
originalShardBlobs,
shardContainer,
originalRepositoryData.shardGenerations().getShardGen(indexId, shardId)
Expand Down Expand Up @@ -3203,6 +3210,7 @@ private void doSnapshotShard(SnapshotShardContext context) {
snapshotStatus.ensureNotAborted();
Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> tuple = buildBlobStoreIndexShardSnapshots(
context.indexId(),
shardId.id(),
blobs,
shardContainer,
generation
Expand Down Expand Up @@ -3848,14 +3856,15 @@ public BlobStoreIndexShardSnapshots getBlobStoreIndexShardSnapshots(IndexId inde
blobs = shardContainer.listBlobsByPrefix(OperationPurpose.SNAPSHOT_METADATA, SNAPSHOT_INDEX_PREFIX).keySet();
}

return buildBlobStoreIndexShardSnapshots(indexId, blobs, shardContainer, shardGen).v1();
return buildBlobStoreIndexShardSnapshots(indexId, shardId, blobs, shardContainer, shardGen).v1();
}

/**
* Loads all available snapshots in the repository using the given {@code generation} or falling back to trying to determine it from
* the given list of blobs in the shard container.
*
* @param indexId {@link IndexId} identifying the corresponding index
* @param shardId The 0-based shard id, see also {@link ShardId#id()}
* @param blobs list of blobs in repository
* @param generation shard generation or {@code null} in case there was no shard generation tracked in the {@link RepositoryData} for
* this shard because its snapshot was created in a version older than
Expand All @@ -3864,6 +3873,7 @@ public BlobStoreIndexShardSnapshots getBlobStoreIndexShardSnapshots(IndexId inde
*/
private Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> buildBlobStoreIndexShardSnapshots(
IndexId indexId,
int shardId,
Set<String> blobs,
BlobContainer shardContainer,
@Nullable ShardGeneration generation
Expand All @@ -3883,6 +3893,21 @@ private Tuple<BlobStoreIndexShardSnapshots, ShardGeneration> buildBlobStoreIndex
generation
);
} catch (NoSuchFileException noSuchFileException) {
// Master may have concurrently mutated the shard generation. This can happen when master fails over
// which is "expected". We do not need to apply the following workaround for missing file in this case.
final RepositoryData currentRepositoryData;
try {
final long latestGeneration = latestIndexBlobId();
currentRepositoryData = getRepositoryData(latestGeneration);
} catch (Exception e) {
noSuchFileException.addSuppressed(e);
throw noSuchFileException;
}
final ShardGeneration latestShardGen = currentRepositoryData.shardGenerations().getShardGen(indexId, shardId);
if (latestShardGen == null || latestShardGen.equals(generation) == false) {
throw noSuchFileException;
}

// This shouldn't happen (absent an external force deleting blobs from the repo) but in practice we've found bugs in the way
// we manipulate shard generation UUIDs under concurrent snapshot load which can lead to incorrectly deleting a referenced
// shard-level `index-UUID` blob during finalization. We definitely want to treat this as a test failure (see the `assert`
Expand Down

0 comments on commit 364f54c

Please sign in to comment.