Skip to content

Commit

Permalink
Reset starting seqno if fail to read last commit (#45106)
Browse files Browse the repository at this point in the history
Previously, if the metadata snapshot was empty (either no commit was found or
an error occurred), we would not compute the starting sequence number and would
use -2 to opt out of operation-based recovery. With #43463, we have a starting
sequence number before reading the last commit. Thus, we need to reset
it if we fail to snapshot the store.

Closes #45072
  • Loading branch information
dnhatn authored Aug 1, 2019
1 parent 5322b00 commit 77720e8
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -309,25 +309,6 @@ public RecoveryResponse read(StreamInput in) throws IOException {
}
}

/**
 * Reads the store metadata of the shard behind the given recovery target, falling back to
 * {@link Store.MetadataSnapshot#EMPTY} when the metadata cannot be read.
 *
 * @param logger         the logger used to report why the fallback was taken
 * @param recoveryTarget the target of the recovery
 * @return the store metadata snapshot, or the empty snapshot if no index was found or listing the local files failed
 */
private static Store.MetadataSnapshot getStoreMetadataSnapshot(final Logger logger, final RecoveryTarget recoveryTarget) {
    Store.MetadataSnapshot snapshot;
    try {
        snapshot = recoveryTarget.indexShard().snapshotStoreMetadata();
    } catch (final org.apache.lucene.index.IndexNotFoundException notFound) {
        // An empty shard folder is an expected situation, so it is only traced and the exception itself is not logged.
        logger.trace("{} shard folder empty, recovering all files", recoveryTarget);
        snapshot = Store.MetadataSnapshot.EMPTY;
    } catch (final IOException ioFailure) {
        // Any other I/O failure is treated as "no local files", but is surfaced as a warning.
        logger.warn("error while listing local files, recovering as if there are none", ioFailure);
        snapshot = Store.MetadataSnapshot.EMPTY;
    }
    return snapshot;
}

/**
* Prepare the start recovery request.
*
Expand All @@ -343,7 +324,24 @@ public static StartRecoveryRequest getStartRecoveryRequest(Logger logger, Discov
final StartRecoveryRequest request;
logger.trace("{} collecting local files for [{}]", recoveryTarget.shardId(), recoveryTarget.sourceNode());

final Store.MetadataSnapshot metadataSnapshot = getStoreMetadataSnapshot(logger, recoveryTarget);
Store.MetadataSnapshot metadataSnapshot;
try {
metadataSnapshot = recoveryTarget.indexShard().snapshotStoreMetadata();
} catch (final org.apache.lucene.index.IndexNotFoundException e) {
// happens on an empty folder. no need to log
assert startingSeqNo == UNASSIGNED_SEQ_NO : startingSeqNo;
logger.trace("{} shard folder empty, recovering all files", recoveryTarget);
metadataSnapshot = Store.MetadataSnapshot.EMPTY;
} catch (final IOException e) {
if (startingSeqNo != UNASSIGNED_SEQ_NO) {
logger.warn(new ParameterizedMessage("error while listing local files, resetting the starting sequence number from {} " +
"to unassigned and recovering as if there are none", startingSeqNo), e);
startingSeqNo = UNASSIGNED_SEQ_NO;
} else {
logger.warn("error while listing local files, recovering as if there are none", e);
}
metadataSnapshot = Store.MetadataSnapshot.EMPTY;
}
logger.trace("{} local file count [{}]", recoveryTarget.shardId(), metadataSnapshot.size());
request = new StartRecoveryRequest(
recoveryTarget.shardId(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -258,4 +258,24 @@ public void testClosedIndexSkipsLocalRecovery() throws Exception {
assertThat(replica.getLastKnownGlobalCheckpoint(), equalTo(UNASSIGNED_SEQ_NO));
closeShards(replica);
}

/**
 * Checks that a start-recovery request built for a shard whose store was marked as corrupted
 * (after the local recovery step already produced a starting sequence number) carries an
 * unassigned starting sequence number and an empty metadata snapshot.
 */
public void testResetStartingSeqNoIfLastCommitCorrupted() throws Exception {
    // Start a shard and put some data into it.
    IndexShard shard = newStartedShard(false);
    populateRandomData(shard);

    // Two fake nodes used for the recovery state and the recovery request.
    final DiscoveryNode firstNode = new DiscoveryNode(
        "foo", buildNewFakeTransportAddress(), Collections.emptyMap(), Collections.emptySet(), Version.CURRENT);
    final DiscoveryNode secondNode = new DiscoveryNode(
        "foo", buildNewFakeTransportAddress(), Collections.emptyMap(), Collections.emptySet(), Version.CURRENT);

    // Re-initialise the shard with a peer recovery source and move it into the recovering state.
    shard = reinitShard(shard, ShardRoutingHelper.initWithSameId(shard.routingEntry(), RecoverySource.PeerRecoverySource.INSTANCE));
    final RecoveryState recoveryState = new RecoveryState(shard.routingEntry(), firstNode, secondNode);
    shard.markAsRecovering("peer recovery", recoveryState);
    shard.prepareForIndexRecovery();

    // Compute the starting sequence number first, then corrupt the store so the metadata snapshot can no longer be taken.
    final long startingSeqNo = shard.recoverLocallyUpToGlobalCheckpoint();
    shard.store().markStoreCorrupted(new IOException("simulated"));

    final RecoveryTarget recoveryTarget = new RecoveryTarget(shard, null, null);
    final StartRecoveryRequest startRequest =
        PeerRecoveryTargetService.getStartRecoveryRequest(logger, secondNode, recoveryTarget, startingSeqNo);

    // The request must not reuse the previously computed sequence number and must report no local files.
    assertThat(startRequest.startingSeqNo(), equalTo(UNASSIGNED_SEQ_NO));
    assertThat(startRequest.metadataSnapshot().size(), equalTo(0));

    recoveryTarget.decRef();
    closeShards(shard);
}
}

0 comments on commit 77720e8

Please sign in to comment.