Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not cancel ongoing recovery for noop copy on broken node #48265

Merged
merged 14 commits into from
Nov 1, 2019
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,7 @@ assert getByAllocationId(failedShard.shardId(), failedShard.allocationId().getId
assert replicaShard != null : "failed to re-resolve " + routing + " when failing replicas";
UnassignedInfo primaryFailedUnassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.PRIMARY_FAILED,
"primary failed while replica initializing", null, 0, unassignedInfo.getUnassignedTimeInNanos(),
unassignedInfo.getUnassignedTimeInMillis(), false, AllocationStatus.NO_ATTEMPT);
unassignedInfo.getUnassignedTimeInMillis(), false, AllocationStatus.NO_ATTEMPT, false);
failShard(logger, replicaShard, primaryFailedUnassignedInfo, indexMetaData, routingChangesObserver);
}
}
Expand Down Expand Up @@ -873,7 +873,7 @@ public void ignoreShard(ShardRouting shard, AllocationStatus allocationStatus, R
UnassignedInfo newInfo = new UnassignedInfo(currInfo.getReason(), currInfo.getMessage(), currInfo.getFailure(),
currInfo.getNumFailedAllocations(), currInfo.getUnassignedTimeInNanos(),
currInfo.getUnassignedTimeInMillis(), currInfo.isDelayed(),
allocationStatus);
allocationStatus, false);
ShardRouting updatedShard = shard.updateUnassigned(newInfo, shard.recoverySource());
changes.unassignedInfoUpdated(shard, newInfo);
shard = updatedShard;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package org.elasticsearch.cluster.routing;

import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.routing.allocation.decider.Decision;
Expand Down Expand Up @@ -214,6 +215,7 @@ public String value() {
private final Exception failure;
private final int failedAllocations;
private final AllocationStatus lastAllocationStatus; // result of the last allocation attempt for this shard
private final boolean wasCancelledForNoopRecovery;

/**
* creates an UnassignedInfo object based on **current** time
Expand All @@ -223,20 +225,22 @@ public String value() {
**/
public UnassignedInfo(Reason reason, String message) {
this(reason, message, null, reason == Reason.ALLOCATION_FAILED ? 1 : 0, System.nanoTime(), System.currentTimeMillis(), false,
AllocationStatus.NO_ATTEMPT);
AllocationStatus.NO_ATTEMPT, false);
}

/**
* @param reason the cause for making this shard unassigned. See {@link Reason} for more information.
* @param message more information about cause.
* @param failure the shard level failure that caused this shard to be unassigned, if exists.
* @param unassignedTimeNanos the time to use as the base for any delayed re-assignment calculation
* @param unassignedTimeMillis the time of unassignment used to display to in our reporting.
* @param delayed if allocation of this shard is delayed due to INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.
* @param lastAllocationStatus the result of the last allocation attempt for this shard
* @param reason the cause for making this shard unassigned. See {@link Reason} for more information.
* @param message more information about cause.
* @param failure the shard level failure that caused this shard to be unassigned, if exists.
* @param unassignedTimeNanos the time to use as the base for any delayed re-assignment calculation
* @param unassignedTimeMillis the time of unassignment used to display to in our reporting.
* @param delayed if allocation of this shard is delayed due to INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.
* @param lastAllocationStatus the result of the last allocation attempt for this shard
* @param wasCancelledForNoopRecovery if allocation of this shard was cancelled in favour of a noop recovery
*/
public UnassignedInfo(Reason reason, @Nullable String message, @Nullable Exception failure, int failedAllocations,
long unassignedTimeNanos, long unassignedTimeMillis, boolean delayed, AllocationStatus lastAllocationStatus) {
long unassignedTimeNanos, long unassignedTimeMillis, boolean delayed, AllocationStatus lastAllocationStatus,
boolean wasCancelledForNoopRecovery) {
this.reason = Objects.requireNonNull(reason);
this.unassignedTimeMillis = unassignedTimeMillis;
this.unassignedTimeNanos = unassignedTimeNanos;
Expand All @@ -245,10 +249,13 @@ public UnassignedInfo(Reason reason, @Nullable String message, @Nullable Excepti
this.failure = failure;
this.failedAllocations = failedAllocations;
this.lastAllocationStatus = Objects.requireNonNull(lastAllocationStatus);
this.wasCancelledForNoopRecovery = wasCancelledForNoopRecovery;
assert (failedAllocations > 0) == (reason == Reason.ALLOCATION_FAILED) :
"failedAllocations: " + failedAllocations + " for reason " + reason;
assert !(message == null && failure != null) : "provide a message if a failure exception is provided";
assert !(delayed && reason != Reason.NODE_LEFT) : "shard can only be delayed if it is unassigned due to a node leaving";
assert wasCancelledForNoopRecovery == false || reason == Reason.ALLOCATION_FAILED :
"cancelled status was not reset for reason [" + reason + "]";
}

public UnassignedInfo(StreamInput in) throws IOException {
Expand All @@ -262,6 +269,11 @@ public UnassignedInfo(StreamInput in) throws IOException {
this.failure = in.readException();
this.failedAllocations = in.readVInt();
this.lastAllocationStatus = AllocationStatus.readFrom(in);
if (in.getVersion().onOrAfter(Version.V_8_0_0)) {
this.wasCancelledForNoopRecovery = in.readBoolean();
} else {
this.wasCancelledForNoopRecovery = false;
}
}

public void writeTo(StreamOutput out) throws IOException {
Expand All @@ -273,6 +285,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeException(failure);
out.writeVInt(failedAllocations);
lastAllocationStatus.writeTo(out);
if (out.getVersion().onOrAfter(Version.V_8_0_0)) {
out.writeBoolean(wasCancelledForNoopRecovery);
}
}

/**
Expand Down Expand Up @@ -347,6 +362,13 @@ public AllocationStatus getLastAllocationStatus() {
return lastAllocationStatus;
}

/**
* @return true if the allocation of this shard was cancelled in favour of a noop recovery
*/
public boolean wasCancelledForNoopRecovery() {
return wasCancelledForNoopRecovery;
}

/**
* Calculates the delay left based on current time (in nanoseconds) and the delay defined by the index settings.
* Only relevant if shard is effectively delayed (see {@link #isDelayed()})
Expand Down Expand Up @@ -432,6 +454,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field("details", details);
}
builder.field("allocation_status", lastAllocationStatus.value());
builder.field("cancelled_for_noop", wasCancelledForNoopRecovery);
builder.endObject();
return builder;
}
Expand Down Expand Up @@ -459,13 +482,16 @@ public boolean equals(Object o) {
if (reason != that.reason) {
return false;
}
if (message != null ? !message.equals(that.message) : that.message != null) {
if (Objects.equals(message, that.message) == false) {
return false;
}
if (lastAllocationStatus != that.lastAllocationStatus) {
return false;
}
return !(failure != null ? !failure.equals(that.failure) : that.failure != null);
if (Objects.equals(failure, that.failure) == false) {
return false;
}
return wasCancelledForNoopRecovery == that.wasCancelledForNoopRecovery;
}

@Override
Expand All @@ -477,6 +503,7 @@ public int hashCode() {
result = 31 * result + (message != null ? message.hashCode() : 0);
result = 31 * result + (failure != null ? failure.hashCode() : 0);
result = 31 * result + lastAllocationStatus.hashCode();
result = 31 * result + Boolean.hashCode(wasCancelledForNoopRecovery);
return result;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,17 @@ public ClusterState applyFailedShards(final ClusterState clusterState, final Lis
shardToFail.shardId(), shardToFail, failedShard);
}
int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
boolean wasCancelledForNoopRecovery;
if (failedShard.unassignedInfo() != null) {
wasCancelledForNoopRecovery = failedShard.unassignedInfo().wasCancelledForNoopRecovery()
|| failedShard.unassignedInfo().getReason() == UnassignedInfo.Reason.REALLOCATED_REPLICA;
} else {
wasCancelledForNoopRecovery = false;
}
String message = "failed shard on node [" + shardToFail.currentNodeId() + "]: " + failedShardEntry.getMessage();
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, message,
failedShardEntry.getFailure(), failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false,
AllocationStatus.NO_ATTEMPT);
AllocationStatus.NO_ATTEMPT, wasCancelledForNoopRecovery);
if (failedShardEntry.markAsStale()) {
allocation.removeAllocationId(failedShard);
}
Expand Down Expand Up @@ -289,7 +296,7 @@ private void removeDelayMarkers(RoutingAllocation allocation) {
if (newComputedLeftDelayNanos == 0) {
unassignedIterator.updateUnassigned(new UnassignedInfo(unassignedInfo.getReason(), unassignedInfo.getMessage(),
unassignedInfo.getFailure(), unassignedInfo.getNumFailedAllocations(), unassignedInfo.getUnassignedTimeInNanos(),
unassignedInfo.getUnassignedTimeInMillis(), false, unassignedInfo.getLastAllocationStatus()),
unassignedInfo.getUnassignedTimeInMillis(), false, unassignedInfo.getLastAllocationStatus(), false),
shardRouting.recoverySource(), allocation.changes());
}
}
Expand All @@ -308,7 +315,7 @@ private void resetFailedAllocationCounter(RoutingAllocation allocation) {
UnassignedInfo.Reason.MANUAL_ALLOCATION : unassignedInfo.getReason(), unassignedInfo.getMessage(),
unassignedInfo.getFailure(), 0, unassignedInfo.getUnassignedTimeInNanos(),
unassignedInfo.getUnassignedTimeInMillis(), unassignedInfo.isDelayed(),
unassignedInfo.getLastAllocationStatus()), shardRouting.recoverySource(), allocation.changes());
unassignedInfo.getLastAllocationStatus(), false), shardRouting.recoverySource(), allocation.changes());
}
}

Expand Down Expand Up @@ -421,7 +428,7 @@ private void disassociateDeadNodes(RoutingAllocation allocation) {
final IndexMetaData indexMetaData = allocation.metaData().getIndexSafe(shardRouting.index());
boolean delayed = INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(indexMetaData.getSettings()).nanos() > 0;
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "node_left [" + node.nodeId() + "]",
null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis(), delayed, AllocationStatus.NO_ATTEMPT);
null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis(), delayed, AllocationStatus.NO_ATTEMPT, false);
allocation.routingNodes().failShard(logger, shardRouting, unassignedInfo, indexMetaData, allocation.changes());
}
// its a dead node, remove it, note, its important to remove it *after* we apply failed shard
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ private void failAllocationOfNewPrimaries(RoutingAllocation allocation) {
if (shardRouting.primary() && unassignedInfo.getLastAllocationStatus() == AllocationStatus.NO_ATTEMPT) {
unassignedIterator.updateUnassigned(new UnassignedInfo(unassignedInfo.getReason(), unassignedInfo.getMessage(),
unassignedInfo.getFailure(), unassignedInfo.getNumFailedAllocations(), unassignedInfo.getUnassignedTimeInNanos(),
unassignedInfo.getUnassignedTimeInMillis(), unassignedInfo.isDelayed(), AllocationStatus.DECIDERS_NO),
unassignedInfo.getUnassignedTimeInMillis(), unassignedInfo.isDelayed(), AllocationStatus.DECIDERS_NO, false),
shardRouting.recoverySource(), allocation.changes());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ public RerouteExplanation execute(RoutingAllocation allocation, boolean explain)
", " + shardRouting.unassignedInfo().getMessage();
unassignedInfoToUpdate = new UnassignedInfo(UnassignedInfo.Reason.FORCED_EMPTY_PRIMARY, unassignedInfoMessage,
shardRouting.unassignedInfo().getFailure(), 0, System.nanoTime(), System.currentTimeMillis(), false,
shardRouting.unassignedInfo().getLastAllocationStatus());
shardRouting.unassignedInfo().getLastAllocationStatus(), false);
}

initializeUnassignedShard(allocation, routingNodes, routingNode, shardRouting, unassignedInfoToUpdate,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ public void processExistingRecoveries(RoutingAllocation allocation) {
continue;
}

// if we already cancelled this shard before, then we should not cancel it again
if (shard.unassignedInfo() != null && shard.unassignedInfo().wasCancelledForNoopRecovery()) {
logger.trace("{} was cancelled for a noop recovery before", shard);
continue;
}

AsyncShardFetch.FetchResult<NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation);
if (shardStores.hasData() == false) {
logger.trace("{}: fetching new stores for initializing shard", shard);
Expand Down Expand Up @@ -110,7 +116,7 @@ && canPerformOperationBasedRecovery(primaryStore, shardStores, currentNode) == f
"existing allocation of replica to [" + currentNode + "] cancelled, can perform a noop recovery on ["+
nodeWithHighestMatch + "]",
null, 0, allocation.getCurrentNanoTime(), System.currentTimeMillis(), false,
UnassignedInfo.AllocationStatus.NO_ATTEMPT);
UnassignedInfo.AllocationStatus.NO_ATTEMPT, false);
// don't cancel shard in the loop as it will cause a ConcurrentModificationException
shardCancellationActions.add(() -> routingNodes.failShard(logger, shard, unassignedInfo,
metaData.getIndexSafe(shard.index()), allocation.changes()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ public void testSerialization() throws Exception {
UnassignedInfo.Reason reason = RandomPicks.randomFrom(random(), UnassignedInfo.Reason.values());
UnassignedInfo meta = reason == UnassignedInfo.Reason.ALLOCATION_FAILED ?
new UnassignedInfo(reason, randomBoolean() ? randomAlphaOfLength(4) : null, null,
randomIntBetween(1, 100), System.nanoTime(), System.currentTimeMillis(), false, AllocationStatus.NO_ATTEMPT):
randomIntBetween(1, 100), System.nanoTime(), System.currentTimeMillis(), false, AllocationStatus.NO_ATTEMPT,
randomBoolean()):
new UnassignedInfo(reason, randomBoolean() ? randomAlphaOfLength(4) : null);
BytesStreamOutput out = new BytesStreamOutput();
meta.writeTo(out);
Expand All @@ -94,6 +95,7 @@ public void testSerialization() throws Exception {
assertThat(read.getMessage(), equalTo(meta.getMessage()));
assertThat(read.getDetails(), equalTo(meta.getDetails()));
assertThat(read.getNumFailedAllocations(), equalTo(meta.getNumFailedAllocations()));
assertThat(read.wasCancelledForNoopRecovery(), equalTo(meta.wasCancelledForNoopRecovery()));
}

public void testIndexCreated() {
Expand Down Expand Up @@ -296,7 +298,7 @@ public void testFailedShard() {
public void testRemainingDelayCalculation() throws Exception {
final long baseTime = System.nanoTime();
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "test", null, 0, baseTime,
System.currentTimeMillis(), randomBoolean(), AllocationStatus.NO_ATTEMPT);
System.currentTimeMillis(), randomBoolean(), AllocationStatus.NO_ATTEMPT, false);
final long totalDelayNanos = TimeValue.timeValueMillis(10).nanos();
final Settings indexSettings = Settings.builder()
.put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), TimeValue.timeValueNanos(totalDelayNanos)).build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public void testCanAllocatePrimaryExistingInRestoreInProgress() {
UnassignedInfo currentInfo = primary.unassignedInfo();
UnassignedInfo newInfo = new UnassignedInfo(currentInfo.getReason(), currentInfo.getMessage(), new IOException("i/o failure"),
currentInfo.getNumFailedAllocations(), currentInfo.getUnassignedTimeInNanos(),
currentInfo.getUnassignedTimeInMillis(), currentInfo.isDelayed(), currentInfo.getLastAllocationStatus());
currentInfo.getUnassignedTimeInMillis(), currentInfo.isDelayed(), currentInfo.getLastAllocationStatus(), false);
primary = primary.updateUnassigned(newInfo, primary.recoverySource());

IndexRoutingTable indexRoutingTable = routingTable.index("test");
Expand Down
Loading