-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Remote Store] Add support for refresh level durability #5253
Changes from all commits
834552e
6bf909c
a4c22be
b7f0939
e66a36f
2dd797a
74abcec
7628322
26b80de
3b88f2f
97ecc22
25179b4
bfb5d04
499a425
9562461
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,12 +19,16 @@ | |
import org.apache.lucene.store.FilterDirectory; | ||
import org.apache.lucene.store.IOContext; | ||
import org.apache.lucene.store.IndexInput; | ||
import org.apache.lucene.store.IndexOutput; | ||
import org.opensearch.common.concurrent.GatedCloseable; | ||
import org.opensearch.index.engine.EngineException; | ||
import org.opensearch.index.engine.InternalEngine; | ||
import org.opensearch.index.seqno.SequenceNumbers; | ||
import org.opensearch.index.store.RemoteSegmentStoreDirectory; | ||
|
||
import java.io.IOException; | ||
import java.util.Collection; | ||
import java.util.Collections; | ||
import java.util.Comparator; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
|
@@ -34,6 +38,8 @@ | |
import java.util.concurrent.atomic.AtomicBoolean; | ||
import java.util.stream.Collectors; | ||
|
||
import static org.opensearch.index.seqno.SequenceNumbers.LOCAL_CHECKPOINT_KEY; | ||
|
||
/** | ||
* RefreshListener implementation to upload newly created segment files to the remote store | ||
* | ||
|
@@ -44,6 +50,7 @@ public final class RemoteStoreRefreshListener implements ReferenceManager.Refres | |
static final Set<String> EXCLUDE_FILES = Set.of("write.lock"); | ||
// Visible for testing | ||
static final int LAST_N_METADATA_FILES_TO_KEEP = 10; | ||
static final String SEGMENT_INFO_SNAPSHOT_FILENAME_PREFIX = "segment_infos_snapshot_filename"; | ||
|
||
private final IndexShard indexShard; | ||
private final Directory storeDirectory; | ||
|
@@ -88,46 +95,67 @@ public void afterRefresh(boolean didRefresh) { | |
this.remoteDirectory.init(); | ||
} | ||
try { | ||
String lastCommittedLocalSegmentFileName = SegmentInfos.getLastCommitSegmentsFileName(storeDirectory); | ||
if (!remoteDirectory.containsFile( | ||
lastCommittedLocalSegmentFileName, | ||
getChecksumOfLocalFile(lastCommittedLocalSegmentFileName) | ||
)) { | ||
// if a new segments_N file is present in local that is not uploaded to remote store yet, it | ||
// is considered as a first refresh post commit. A cleanup of stale commit files is triggered. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why just first? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is to avoid triggering deletes post each refresh. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. got it maybe add that in the comments |
||
// This is done to avoid triggering deletes after each refresh. | ||
// Ideally, we want this to be done in async flow. (GitHub issue #4315) | ||
if (isRefreshAfterCommit()) { | ||
deleteStaleCommits(); | ||
} | ||
|
||
String segmentInfoSnapshotFilename = null; | ||
try (GatedCloseable<SegmentInfos> segmentInfosGatedCloseable = indexShard.getSegmentInfosSnapshot()) { | ||
SegmentInfos segmentInfos = segmentInfosGatedCloseable.get(); | ||
Collection<String> refreshedLocalFiles = segmentInfos.files(true); | ||
|
||
List<String> segmentInfosFiles = refreshedLocalFiles.stream() | ||
Collection<String> localSegmentsPostRefresh = segmentInfos.files(true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed the variable name from |
||
|
||
List<String> segmentInfosFiles = localSegmentsPostRefresh.stream() | ||
.filter(file -> file.startsWith(IndexFileNames.SEGMENTS)) | ||
.collect(Collectors.toList()); | ||
Optional<String> latestSegmentInfos = segmentInfosFiles.stream() | ||
.max(Comparator.comparingLong(IndexFileNames::parseGeneration)); | ||
.max(Comparator.comparingLong(SegmentInfos::generationFromSegmentsFileName)); | ||
|
||
if (latestSegmentInfos.isPresent()) { | ||
refreshedLocalFiles.addAll(SegmentInfos.readCommit(storeDirectory, latestSegmentInfos.get()).files(true)); | ||
// SegmentInfosSnapshot is a snapshot of reader's view of segments and may not contain | ||
// all the segments from last commit if they are merged away but not yet committed. | ||
// Each metadata file in the remote segment store represents a commit and the following | ||
// statement makes sure that each metadata file will always contain all the segments from the last commit + refreshed | ||
// segments. | ||
localSegmentsPostRefresh.addAll( | ||
SegmentInfos.readCommit(storeDirectory, latestSegmentInfos.get()).files(true) | ||
); | ||
segmentInfosFiles.stream() | ||
.filter(file -> !file.equals(latestSegmentInfos.get())) | ||
.forEach(refreshedLocalFiles::remove); | ||
.forEach(localSegmentsPostRefresh::remove); | ||
|
||
boolean uploadStatus = uploadNewSegments(refreshedLocalFiles); | ||
boolean uploadStatus = uploadNewSegments(localSegmentsPostRefresh); | ||
if (uploadStatus) { | ||
segmentInfoSnapshotFilename = uploadSegmentInfosSnapshot(latestSegmentInfos.get(), segmentInfos); | ||
localSegmentsPostRefresh.add(segmentInfoSnapshotFilename); | ||
|
||
remoteDirectory.uploadMetadata( | ||
refreshedLocalFiles, | ||
localSegmentsPostRefresh, | ||
storeDirectory, | ||
indexShard.getOperationPrimaryTerm(), | ||
segmentInfos.getGeneration() | ||
); | ||
localSegmentChecksumMap.keySet() | ||
.stream() | ||
.filter(file -> !refreshedLocalFiles.contains(file)) | ||
.filter(file -> !localSegmentsPostRefresh.contains(file)) | ||
.collect(Collectors.toSet()) | ||
.forEach(localSegmentChecksumMap::remove); | ||
} | ||
} | ||
} catch (EngineException e) { | ||
logger.warn("Exception while reading SegmentInfosSnapshot", e); | ||
} finally { | ||
try { | ||
if (segmentInfoSnapshotFilename != null) { | ||
storeDirectory.deleteFile(segmentInfoSnapshotFilename); | ||
} | ||
} catch (IOException e) { | ||
logger.warn("Exception while deleting: " + segmentInfoSnapshotFilename, e); | ||
} | ||
} | ||
} catch (IOException e) { | ||
// We don't want to fail refresh if upload of new segments fails. The missed segments will be re-tried | ||
|
@@ -141,6 +169,39 @@ public void afterRefresh(boolean didRefresh) { | |
} | ||
} | ||
|
||
private boolean isRefreshAfterCommit() throws IOException { | ||
String lastCommittedLocalSegmentFileName = SegmentInfos.getLastCommitSegmentsFileName(storeDirectory); | ||
return (lastCommittedLocalSegmentFileName != null | ||
&& !remoteDirectory.containsFile(lastCommittedLocalSegmentFileName, getChecksumOfLocalFile(lastCommittedLocalSegmentFileName))); | ||
} | ||
|
||
String uploadSegmentInfosSnapshot(String latestSegmentsNFilename, SegmentInfos segmentInfosSnapshot) throws IOException { | ||
// We use lastRefreshedCheckpoint as local checkpoint for the SegmentInfosSnapshot. This is better than using | ||
// getProcessedLocalCheckpoint() as processedCheckpoint can advance between reading the value and setting up | ||
// in SegmentInfos.userData. This could lead to data loss as, during recovery, translog will be replayed based on | ||
// LOCAL_CHECKPOINT_KEY. | ||
// lastRefreshedCheckpoint is updated after refresh listeners are executed; this means InternalEngine.lastRefreshedCheckpoint() | ||
// will return the checkpoint of the last refresh, but that does not impact correctness as duplicate sequence numbers | ||
// will not be replayed. | ||
assert indexShard.getEngine() instanceof InternalEngine : "Expected shard with InternalEngine, got: " | ||
+ indexShard.getEngine().getClass(); | ||
final long lastRefreshedCheckpoint = ((InternalEngine) indexShard.getEngine()).lastRefreshedCheckpoint(); | ||
|
||
Map<String, String> userData = segmentInfosSnapshot.getUserData(); | ||
userData.put(LOCAL_CHECKPOINT_KEY, String.valueOf(lastRefreshedCheckpoint)); | ||
userData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(lastRefreshedCheckpoint)); | ||
segmentInfosSnapshot.setUserData(userData, false); | ||
|
||
long commitGeneration = SegmentInfos.generationFromSegmentsFileName(latestSegmentsNFilename); | ||
String segmentInfoSnapshotFilename = SEGMENT_INFO_SNAPSHOT_FILENAME_PREFIX + "__" + commitGeneration; | ||
try (IndexOutput indexOutput = storeDirectory.createOutput(segmentInfoSnapshotFilename, IOContext.DEFAULT)) { | ||
segmentInfosSnapshot.write(indexOutput); | ||
} | ||
storeDirectory.sync(Collections.singleton(segmentInfoSnapshotFilename)); | ||
remoteDirectory.copyFrom(storeDirectory, segmentInfoSnapshotFilename, segmentInfoSnapshotFilename, IOContext.DEFAULT, true); | ||
return segmentInfoSnapshotFilename; | ||
} | ||
|
||
// Visible for testing | ||
boolean uploadNewSegments(Collection<String> localFiles) throws IOException { | ||
AtomicBoolean uploadSuccess = new AtomicBoolean(true); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,8 @@ | |
import org.apache.lucene.index.NoMergePolicy; | ||
import org.apache.lucene.index.SegmentInfos; | ||
import org.apache.lucene.search.Sort; | ||
import org.apache.lucene.store.BufferedChecksumIndexInput; | ||
import org.apache.lucene.store.ChecksumIndexInput; | ||
import org.apache.lucene.store.Directory; | ||
import org.apache.lucene.store.FilterDirectory; | ||
import org.apache.lucene.store.IOContext; | ||
|
@@ -76,6 +78,8 @@ | |
import java.util.stream.Collectors; | ||
|
||
import static org.opensearch.common.unit.TimeValue.timeValueMillis; | ||
import static org.opensearch.index.seqno.SequenceNumbers.LOCAL_CHECKPOINT_KEY; | ||
import static org.opensearch.index.shard.RemoteStoreRefreshListener.SEGMENT_INFO_SNAPSHOT_FILENAME_PREFIX; | ||
|
||
/** | ||
* This package private utility class encapsulates the logic to recover an index shard from either an existing index on | ||
|
@@ -463,9 +467,30 @@ private void recoverFromRemoteStore(IndexShard indexShard) throws IndexShardReco | |
for (String file : storeDirectory.listAll()) { | ||
storeDirectory.deleteFile(file); | ||
} | ||
String segmentInfosSnapshotFilename = null; | ||
for (String file : remoteDirectory.listAll()) { | ||
storeDirectory.copyFrom(remoteDirectory, file, file, IOContext.DEFAULT); | ||
if (file.startsWith(SEGMENT_INFO_SNAPSHOT_FILENAME_PREFIX)) { | ||
segmentInfosSnapshotFilename = file; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would we need the latest file here. The list wouldn't guarantee sorted in the order we need, right ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When we create instance of |
||
} | ||
} | ||
|
||
if (segmentInfosSnapshotFilename != null) { | ||
try ( | ||
ChecksumIndexInput indexInput = new BufferedChecksumIndexInput( | ||
storeDirectory.openInput(segmentInfosSnapshotFilename, IOContext.DEFAULT) | ||
) | ||
) { | ||
SegmentInfos infosSnapshot = SegmentInfos.readCommit( | ||
store.directory(), | ||
indexInput, | ||
Long.parseLong(segmentInfosSnapshotFilename.split("__")[1]) | ||
); | ||
long processedLocalCheckpoint = Long.parseLong(infosSnapshot.getUserData().get(LOCAL_CHECKPOINT_KEY)); | ||
store.commitSegmentInfos(infosSnapshot, processedLocalCheckpoint, processedLocalCheckpoint); | ||
} | ||
} | ||
|
||
// This creates an empty translog for now | ||
// ToDo: Add code to restore from remote translog | ||
bootstrap(indexShard, store); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to change this behavior when remote txlog is not enabled ? We will need to fail refresh in that case. Since it might be async , it will be tricky for customer to see the failures though .
Alternatively we need to call out that refresh level/commit level durability is best effort and provide some visibility in what is durably stored at any given point of time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, as of now, commit/refresh level durability is best efforts and we need to combine it with remote translog to provide request level durability.
As I think more, failing refresh/commit on segment upload failure would be useful in following cases:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we would have to model the refresh listener as a fail-fast blocking listener. We could add a condition to fail based on whether operation level durability is enabled
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense. With remote translog enabled, we can change the way segments are uploaded.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does a failure impact staleness of data on replicas independent of operation level durability
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Bukhtawar Are you referring to remote store integration with Segment replication?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we track this as an open item
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Created issue: #5578