Commit 80a0654

Operation routing changes to use binary search algo for child shard resolution

vikasvb90 committed Dec 9, 2024
1 parent 9136d86 commit 80a0654
Showing 37 changed files with 866 additions and 599 deletions.
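As background for the diff below: the identifiers this commit introduces (ShardRange, SplitShardsMetadata.getChildShardsOfParent, the generateShardId changes) suggest that each child shard produced by a split owns a contiguous slice of the parent's routing-hash space, so resolving an operation to a child shard becomes a binary search over sorted ranges rather than a scan over a set of child ids. The sketch below illustrates that idea only; the ShardRange fields, the resolver class, and the sorting step are assumptions for illustration, not the actual OpenSearch implementation.

```java
import java.util.Arrays;
import java.util.Comparator;

// Hypothetical stand-in for org.opensearch.cluster.metadata.ShardRange:
// each child shard is assumed to own [start, end] of the parent's hash space.
final class ShardRange {
    final int shardId;
    final int start; // inclusive lower bound of owned hash range (assumed field)
    final int end;   // inclusive upper bound (assumed field)

    ShardRange(int shardId, int start, int end) {
        this.shardId = shardId;
        this.start = start;
        this.end = end;
    }
}

final class ChildShardResolver {
    /**
     * Resolve the child shard owning routingHash by binary search over
     * ranges sorted by lower bound: O(log n) per operation, versus the
     * O(n) scan a set or list of child shard ids would need.
     */
    static int resolveChildShard(ShardRange[] sortedRanges, int routingHash) {
        int lo = 0, hi = sortedRanges.length - 1;
        while (lo <= hi) {
            int mid = (lo + hi) >>> 1;
            ShardRange r = sortedRanges[mid];
            if (routingHash < r.start) {
                hi = mid - 1;
            } else if (routingHash > r.end) {
                lo = mid + 1;
            } else {
                return r.shardId;
            }
        }
        throw new IllegalStateException("No child shard owns hash " + routingHash);
    }

    public static void main(String[] args) {
        // A parent shard split into three children covering the full hash space.
        ShardRange[] children = new ShardRange[] {
            new ShardRange(3, Integer.MIN_VALUE, -1),
            new ShardRange(4, 0, 1_000_000),
            new ShardRange(5, 1_000_001, Integer.MAX_VALUE),
        };
        Arrays.sort(children, Comparator.comparingInt(r -> r.start));
        System.out.println(resolveChildShard(children, 42)); // prints 4
    }
}
```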
@@ -200,7 +200,7 @@ public void testRolloverWithIndexSettings() throws Exception {
final ClusterState state = client().admin().cluster().prepareState().get().getState();
final IndexMetadata oldIndex = state.metadata().index("test_index-2");
final IndexMetadata newIndex = state.metadata().index("test_index-000003");
- assertThat(newIndex.getNumberOfServingShards(), equalTo(1));
+ assertThat(newIndex.getNumberOfShards(), equalTo(1));
assertThat(newIndex.getNumberOfReplicas(), equalTo(0));
assertTrue(newIndex.getAliases().containsKey("test_alias"));
assertTrue(newIndex.getAliases().containsKey("extra_alias"));
@@ -318,7 +318,7 @@ public void testRolloverWithIndexSettingsBalancedWithUseZoneForReplicaDefaultCou

final ClusterState state = client().admin().cluster().prepareState().get().getState();
final IndexMetadata newIndex = state.metadata().index("test_index-000003");
- assertThat(newIndex.getNumberOfServingShards(), equalTo(3));
+ assertThat(newIndex.getNumberOfShards(), equalTo(3));
assertThat(newIndex.getNumberOfReplicas(), equalTo(2));
manageReplicaSettingForDefaultReplica(false);
randomIndexTemplate();
@@ -348,7 +348,7 @@ public void testRolloverWithIndexSettingsWithoutPrefix() throws Exception {
final ClusterState state = client().admin().cluster().prepareState().get().getState();
final IndexMetadata oldIndex = state.metadata().index("test_index-2");
final IndexMetadata newIndex = state.metadata().index("test_index-000003");
- assertThat(newIndex.getNumberOfServingShards(), equalTo(1));
+ assertThat(newIndex.getNumberOfShards(), equalTo(1));
assertThat(newIndex.getNumberOfReplicas(), equalTo(0));
assertTrue(newIndex.getAliases().containsKey("test_alias"));
assertTrue(newIndex.getAliases().containsKey("extra_alias"));
@@ -180,7 +180,7 @@ private Map<String, long[]> assertAndCapturePrimaryTerms(Map<String, long[]> pre
for (final IndexMetadata indexMetadata : state.metadata().indices().values()) {
final String index = indexMetadata.getIndex().getName();
final long[] previous = previousTerms.get(index);
- final long[] current = IntStream.range(0, indexMetadata.getNumberOfServingShards() + indexMetadata.getNumOfNonServingShards())
+ final long[] current = IntStream.range(0, indexMetadata.getNumberOfShards())
.mapToLong(indexMetadata::primaryTerm).toArray();
if (previous == null) {
result.put(index, current);
@@ -16,6 +16,7 @@
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.health.ClusterHealthStatus;
import org.opensearch.cluster.metadata.IndexMetadata;
+ import org.opensearch.cluster.metadata.ShardRange;
import org.opensearch.common.Priority;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.unit.TimeValue;
@@ -27,9 +28,11 @@
import org.opensearch.test.BackgroundIndexer;
import org.opensearch.test.OpenSearchIntegTestCase;

+ import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
+ import java.util.stream.Collectors;

import static org.hamcrest.Matchers.*;
import static org.opensearch.index.query.QueryBuilders.matchAllQuery;
@@ -48,7 +51,8 @@ private Set<Integer> triggerSplitAndGetChildShardIds(int parentShardId, int numb
assertAcked(response);
ClusterState clusterState = client().admin().cluster().prepareState().get().getState();
IndexMetadata indexMetadata = clusterState.metadata().index("test");
- return new HashSet<>(indexMetadata.getChildShardIds(parentShardId));
+ ShardRange[] shards = indexMetadata.getSplitShardsMetadata().getChildShardsOfParent(parentShardId);
+ return Arrays.stream(shards).map(ShardRange::getShardId).collect(Collectors.toSet());
}

private void waitForSplit(int numberOfSplits, Set<Integer> childShardIds, int parentShardId) throws Exception {
@@ -89,10 +93,12 @@ private void assertClusterHealth() {
private void verifyAfterSplit(long totalIndexedDocs, Set<String> ids, int parentShardId, Set<Integer> childShardIds) throws InterruptedException {
ClusterState clusterState = internalCluster().clusterManagerClient().admin().cluster().prepareState().get().getState();
IndexMetadata indexMetadata = clusterState.metadata().index("test");
- assertTrue(indexMetadata.isParentShard(parentShardId));
- assertEquals(childShardIds, new HashSet<>(indexMetadata.getChildShardIds(parentShardId)));
+ assertNotNull(indexMetadata.getSplitShardsMetadata().getChildShardsOfParent(parentShardId));
+ ShardRange[] shards = indexMetadata.getSplitShardsMetadata().getChildShardsOfParent(parentShardId);
+ Set<Integer> currentShardIds = Arrays.stream(shards).map(ShardRange::getShardId).collect(Collectors.toSet());
+ assertEquals(childShardIds, currentShardIds);
Set<Integer> newServingChildShardIds = new HashSet<>();
- for (int shardId : indexMetadata.getServingShardIds()) {
+ for (int shardId : currentShardIds) {
assertTrue(parentShardId != shardId);
if (childShardIds.contains(shardId)) newServingChildShardIds.add(shardId);
}
@@ -429,7 +429,8 @@ private Map<ShardId, IndexShardSnapshotStatus> snapshotShards(
IndexId indexId = repositoryData.resolveIndexId(index);
IndexMetadata indexMetadata = repository.getSnapshotIndexMetaData(repositoryData, snapshotInfo.snapshotId(), indexId);
if (indexMetadata != null) {
- for (int i : indexMetadata.getServingShardIds()) {
+ int numberOfShards = indexMetadata.getNumberOfShards();
+ for (int i = 0; i < numberOfShards; i++) {
ShardId shardId = new ShardId(indexMetadata.getIndex(), i);
SnapshotShardFailure shardFailure = findShardFailure(snapshotInfo.shardFailures(), shardId);
if (shardFailure != null) {
@@ -135,9 +135,7 @@ protected ShardsIterator shards(ClusterState clusterState, PitSegmentsRequest re
null,
null,
null,
- -1L,
- null,
- null
+ -1L
)
);
}
@@ -230,9 +228,7 @@ public PitAwareShardRouting(
RecoverySource recoverySource,
UnassignedInfo unassignedInfo,
AllocationId allocationId,
- long expectedShardSize,
- ShardId[] childShardIds,
- ShardId splittingShardId
+ long expectedShardSize
) {
super(
shardId,
@@ -244,8 +240,9 @@
unassignedInfo,
allocationId,
expectedShardSize,
- childShardIds,
- splittingShardId
+ null,
+ null,
+ null
);
this.pitId = pitId;
}
@@ -879,7 +879,7 @@ public static Translog.Location performOnReplica(BulkShardRequest request, Index
// split recovery where after all child shards are added to replication tracker, bulk
// operations are replicated to all child primaries.
int computedShardId = OperationRouting.generateShardId(indexMetadata, item.request().id(),
- item.request().routing(), (shardId) -> true);
+ item.request().routing(), true);
discardOperation = computedShardId != replica.shardId().id();
}

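The hunk above keeps the replica-side guard that discards bulk items not owned by this child shard: computedShardId is recomputed per item and compared with the local shard id. A minimal sketch of that filter, reusing the hypothetical resolver from the example near the top of this page:

```java
// Sketch only: ShardRange and ChildShardResolver are the hypothetical types
// defined in the earlier example, not the real OpenSearch classes.
final class ReplicaSplitFilter {
    /**
     * During split recovery each child primary receives the parent's full bulk
     * stream, so an item is kept only when hash resolution lands on this child.
     */
    static boolean shouldDiscard(int thisChildShardId, ShardRange[] sortedChildRanges, int routingHash) {
        return ChildShardResolver.resolveChildShard(sortedChildRanges, routingHash) != thisChildShardId;
    }
}
```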
@@ -49,6 +49,7 @@
import org.opensearch.cluster.block.ClusterBlockLevel;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
+ import org.opensearch.cluster.metadata.ShardRange;
import org.opensearch.cluster.node.DiscoveryNode;
import org.opensearch.cluster.node.DiscoveryNodes;
import org.opensearch.cluster.routing.GroupShardsIterator;
@@ -1441,10 +1442,12 @@ static List<SearchShardIterator> getLocalLocalShardsIteratorFromPointInTime(
final ShardId shardId = entry.getKey();
IndexMetadata indexMetadata = clusterState.metadata().getIndexSafe(shardId.getIndex());
final List<ShardId> allShardIds;
- if (indexMetadata.isNonServingShard(shardId.id()) && indexMetadata.isParentShard(shardId.id())) {
- List<Integer> childShardIDs = indexMetadata.getSplitMetadata(shardId.id()).getChildShards();
+ if (indexMetadata.getSplitShardsMetadata().isEmptyParentShard(shardId.id())) {
+ ShardRange[] childShards = indexMetadata.getSplitShardsMetadata().getChildShardsOfParent(shardId.id());
allShardIds = new ArrayList<>();
- childShardIDs.forEach(childShardId -> allShardIds.add(new ShardId(shardId.getIndex(), childShardId)));
+ for (ShardRange childShard : childShards) {
+ allShardIds.add(new ShardId(shardId.getIndex(), childShard.getShardId()));
+ }
} else {
allShardIds = List.of(shardId);
}
@@ -590,7 +590,7 @@ void runWithPrimaryShardReference(final PrimaryShardReference primaryShardRefere
primaryShardReference.close(); // release shard operation lock as soon as possible
if (primaryShardReference.routingEntry().splitting()) {
// This means shard was being split and was in relocation handoff stage when replication op on primary arrived.
- // Write ops specifically will now get retried and will be routed to respective child shards.
+ // Write ops specifically will now get retried and will be routed to respective child shards by the coordinator.
throw new PrimaryShardSplitException("Primary shard is already split. Cannot perform replication operation on parent primary.");
}

@@ -1071,8 +1071,16 @@ protected void doRun() {
: "request waitForActiveShards must be set in resolveRequest";

ShardRouting primary = null;
- if (indexMetadata.isParentShard(request.shardId().id()) && indexMetadata.isNonServingShard(request.shardId.id())) {
- throw new PrimaryShardSplitException("Primary shard is already split. Cannot perform replication operation on parent primary.");
+ if (indexMetadata.getSplitShardsMetadata().isEmptyParentShard(request.shardId().id())) {
+ if (state.version() < request.routedBasedOnClusterVersion()) {
+ // This will get retried on the coordinator. The entire request will be re-driven on the respective child shards.
+ // Since we are throwing a custom exception, the coordinator will re-drive it explicitly on the child shards
+ // even if it is also stale and yet to receive an update from the cluster manager.
+ throw new PrimaryShardSplitException("Primary shard is already split. Cannot perform replication operation on parent primary.");
+ } else {
+ finishAsFailed(new IndexNotFoundException(request.shardId().getIndex()));
+ return;
+ }
} else {
IndexRoutingTable indexRoutingTable = state.getRoutingTable().index(request.shardId().getIndex());
IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(request.shardId().id());
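The new branch above distinguishes a retryable failure from a terminal one by comparing cluster-state versions. A self-contained distillation of that decision rule follows; the exception types here are simplified stand-ins for the real PrimaryShardSplitException and IndexNotFoundException, defined only so the sketch compiles on its own:

```java
// Simplified stand-ins for the real exception classes used in the hunk above.
class PrimaryShardSplitException extends RuntimeException {
    PrimaryShardSplitException(String message) { super(message); }
}

class ParentIndexNotFoundException extends RuntimeException {
    ParentIndexNotFoundException(String index) { super("no such index [" + index + "]"); }
}

final class EmptyParentShardDecision {
    /**
     * If the local cluster state is older than the state the coordinator
     * routed on, throw the retryable split exception so the coordinator
     * re-drives the operation on the child shards; otherwise the parent
     * shard is definitively gone and the failure is terminal.
     */
    static RuntimeException failureFor(long localStateVersion, long routedOnStateVersion, String indexName) {
        if (localStateVersion < routedOnStateVersion) {
            return new PrimaryShardSplitException(
                "Primary shard is already split. Cannot perform replication operation on parent primary."
            );
        }
        return new ParentIndexNotFoundException(indexName);
    }
}
```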
@@ -370,7 +370,7 @@ public String toString() {
.append(indexMetadata.getAliasesVersion())
.append("]\n");
for (int shard = 0; shard < indexMetadata.getNumberOfShards(); shard++) {
- if (indexMetadata.isServingShard(shard)) {
+ if (indexMetadata.getSplitShardsMetadata().isEmptyParentShard(shard) == false) {
sb.append(TAB).append(TAB).append(shard).append(": ");
sb.append("p_term [").append(indexMetadata.primaryTerm(shard)).append("], ");
sb.append("isa_ids ").append(indexMetadata.inSyncAllocationIds(shard)).append("\n");
server/src/main/java/org/opensearch/cluster/DiffableUtils.java (25 changes: 0 additions & 25 deletions)
@@ -566,29 +566,4 @@ public Set<String> read(StreamInput in, K key) throws IOException {
return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(in.readStringArray())));
}
}
-
- /**
- * Implementation of ValueSerializer that serializes immutable sets
- *
- * @param <K> type of map key
- *
- * @opensearch.internal
- */
- public static class IntegerSetValueSerializer<K> extends NonDiffableValueSerializer<K, Set<Integer>> {
- private static final IntegerSetValueSerializer INSTANCE = new IntegerSetValueSerializer();
-
- public static <K> IntegerSetValueSerializer<K> getInstance() {
- return INSTANCE;
- }
-
- @Override
- public void write(Set<Integer> value, StreamOutput out) throws IOException {
- out.writeCollection(value, StreamOutput::writeVInt);
- }
-
- @Override
- public Set<Integer> read(StreamInput in, K key) throws IOException {
- return Collections.unmodifiableSet(in.readSet(StreamInput::readVInt));
- }
- }
}