Skip to content

Commit

Permalink
Stricter failure handling in TransportGetSnapshotsAction (#107191)
Browse files Browse the repository at this point in the history
Today if there's a failure during a multi-repo get-snapshots request
then we record a per-repository failure but allow the rest of the
request to proceed. This is trappy for clients, it means that they must
always remember to check the `failures` response field or else risk
missing some results. It's also a pain for the implementation because it
means we have to collect the per-repository results separately first
before adding them to the final results set just in case the last one
triggers a failure.

This commit drops this leniency and bubbles all failures straight up to
the top level.
  • Loading branch information
DaveCTurner authored Jul 3, 2024
1 parent 69ee6d7 commit c5e8173
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 44 deletions.
17 changes: 17 additions & 0 deletions docs/changelog/107191.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
pr: 107191
summary: Stricter failure handling in multi-repo get-snapshots request handling
area: Snapshot/Restore
type: bug
issues: []
highlight:
title: Stricter failure handling in multi-repo get-snapshots request handling
body: |
If a multi-repo get-snapshots request encounters a failure in one of the
targeted repositories then earlier versions of Elasticsearch would proceed
as if the faulty repository did not exist, except for a per-repository
failure report in a separate section of the response body. This makes it
impossible to paginate the results properly in the presence of failures. In
versions 8.15.0 and later this API's failure handling behaviour has been
made stricter, reporting an overall failure if any targeted repository's
contents cannot be listed.
notable: true
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,8 @@

import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.in;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.is;

public class GetSnapshotsIT extends AbstractSnapshotIntegTestCase {
Expand Down Expand Up @@ -314,6 +312,7 @@ public void testExcludePatterns() throws Exception {
assertThat(
clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, matchAllPattern())
.setSnapshots("non-existing*", otherPrefixSnapshot1, "-o*")
.setIgnoreUnavailable(true)
.get()
.getSnapshots(),
empty()
Expand Down Expand Up @@ -586,12 +585,17 @@ public void testRetrievingSnapshotsWhenRepositoryIsMissing() throws Exception {
final List<String> snapshotNames = createNSnapshots(repoName, randomIntBetween(1, 10));
snapshotNames.sort(String::compareTo);

final GetSnapshotsResponse response = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName, missingRepoName)
final var oneRepoFuture = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName, missingRepoName)
.setSort(SnapshotSortKey.NAME)
.get();
assertThat(response.getSnapshots().stream().map(info -> info.snapshotId().getName()).toList(), equalTo(snapshotNames));
assertTrue(response.getFailures().containsKey(missingRepoName));
assertThat(response.getFailures().get(missingRepoName), instanceOf(RepositoryMissingException.class));
.setIgnoreUnavailable(randomBoolean())
.execute();
expectThrows(RepositoryMissingException.class, oneRepoFuture::actionGet);

final var multiRepoFuture = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName, missingRepoName)
.setSort(SnapshotSortKey.NAME)
.setIgnoreUnavailable(randomBoolean())
.execute();
expectThrows(RepositoryMissingException.class, multiRepoFuture::actionGet);
}

// Create a snapshot that is guaranteed to have a unique start time and duration for tests around ordering by either.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.oneOf;
Expand Down Expand Up @@ -395,16 +394,13 @@ public void testGetSnapshotsMultipleRepos() throws Exception {
}

logger.info("--> specify all snapshot names with ignoreUnavailable=false");
GetSnapshotsResponse getSnapshotsResponse2 = client.admin()
final var failingFuture = client.admin()
.cluster()
.prepareGetSnapshots(TEST_REQUEST_TIMEOUT, randomFrom("_all", "repo*"))
.setIgnoreUnavailable(false)
.setSnapshots(snapshotList.toArray(new String[0]))
.get();

for (String repo : repoList) {
assertThat(getSnapshotsResponse2.getFailures().get(repo), instanceOf(SnapshotMissingException.class));
}
.execute();
expectThrows(SnapshotMissingException.class, failingFuture::actionGet);

logger.info("--> specify all snapshot names with ignoreUnavailable=true");
GetSnapshotsResponse getSnapshotsResponse3 = client.admin()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.search.sort.SortOrder;
Expand Down Expand Up @@ -220,13 +219,6 @@ public String[] policies() {
return policies;
}

public boolean isSingleRepositoryRequest() {
return repositories.length == 1
&& repositories[0] != null
&& "_all".equals(repositories[0]) == false
&& Regex.isSimpleMatchPattern(repositories[0]) == false;
}

/**
* Returns the names of the snapshots.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.ChunkedToXContentObject;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.UpdateForV9;
import org.elasticsearch.snapshots.SnapshotInfo;
import org.elasticsearch.xcontent.ToXContent;

Expand All @@ -33,6 +34,7 @@ public class GetSnapshotsResponse extends ActionResponse implements ChunkedToXCo

private final List<SnapshotInfo> snapshots;

@UpdateForV9 // always empty, can be dropped
private final Map<String, ElasticsearchException> failures;

@Nullable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

package org.elasticsearch.action.admin.cluster.snapshots.get;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionType;
import org.elasticsearch.action.support.ActionFilters;
Expand Down Expand Up @@ -120,10 +119,14 @@ protected void masterOperation(
) {
assert task instanceof CancellableTask : task + " not cancellable";

final var resolvedRepositories = ResolvedRepositories.resolve(state, request.repositories());
if (resolvedRepositories.hasMissingRepositories()) {
throw new RepositoryMissingException(String.join(", ", resolvedRepositories.missing()));
}

new GetSnapshotsOperation(
(CancellableTask) task,
ResolvedRepositories.resolve(state, request.repositories()),
request.isSingleRepositoryRequest() == false,
resolvedRepositories.repositoryMetadata(),
request.snapshots(),
request.ignoreUnavailable(),
request.policies(),
Expand Down Expand Up @@ -151,7 +154,6 @@ private class GetSnapshotsOperation {

// repositories
private final List<RepositoryMetadata> repositories;
private final boolean isMultiRepoRequest;

// snapshots selection
private final SnapshotNamePredicate snapshotNamePredicate;
Expand Down Expand Up @@ -179,7 +181,6 @@ private class GetSnapshotsOperation {
private final GetSnapshotInfoExecutor getSnapshotInfoExecutor;

// results
private final Map<String, ElasticsearchException> failuresByRepository = ConcurrentCollections.newConcurrentMap();
private final Queue<List<SnapshotInfo>> allSnapshotInfos = ConcurrentCollections.newQueue();

/**
Expand All @@ -195,8 +196,7 @@ private class GetSnapshotsOperation {

GetSnapshotsOperation(
CancellableTask cancellableTask,
ResolvedRepositories resolvedRepositories,
boolean isMultiRepoRequest,
List<RepositoryMetadata> repositories,
String[] snapshots,
boolean ignoreUnavailable,
String[] policies,
Expand All @@ -211,8 +211,7 @@ private class GetSnapshotsOperation {
boolean indices
) {
this.cancellableTask = cancellableTask;
this.repositories = resolvedRepositories.repositoryMetadata();
this.isMultiRepoRequest = isMultiRepoRequest;
this.repositories = repositories;
this.ignoreUnavailable = ignoreUnavailable;
this.sortBy = sortBy;
this.order = order;
Expand All @@ -232,10 +231,6 @@ private class GetSnapshotsOperation {
threadPool.info(ThreadPool.Names.SNAPSHOT_META).getMax(),
cancellableTask::isCancelled
);

for (final var missingRepo : resolvedRepositories.missing()) {
failuresByRepository.put(missingRepo, new RepositoryMissingException(missingRepo));
}
}

void getMultipleReposSnapshotInfo(ActionListener<GetSnapshotsResponse> listener) {
Expand All @@ -249,6 +244,10 @@ void getMultipleReposSnapshotInfo(ActionListener<GetSnapshotsResponse> listener)
continue;
}

if (listeners.isFailing()) {
return;
}

SubscribableListener

.<RepositoryData>newForked(repositoryDataListener -> {
Expand All @@ -261,14 +260,7 @@ void getMultipleReposSnapshotInfo(ActionListener<GetSnapshotsResponse> listener)

.<Void>andThen((l, repositoryData) -> loadSnapshotInfos(repoName, repositoryData, l))

.addListener(listeners.acquire().delegateResponse((l, e) -> {
if (isMultiRepoRequest && e instanceof ElasticsearchException elasticsearchException) {
failuresByRepository.put(repoName, elasticsearchException);
l.onResponse(null);
} else {
l.onFailure(e);
}
}));
.addListener(listeners.acquire());
}
}
})
Expand Down Expand Up @@ -503,7 +495,7 @@ private GetSnapshotsResponse buildResponse() {
}
return new GetSnapshotsResponse(
snapshotInfos,
failuresByRepository,
null,
remaining > 0 ? sortBy.encodeAfterQueryParam(snapshotInfos.get(snapshotInfos.size() - 1)) : null,
totalCount.get(),
remaining
Expand Down

0 comments on commit c5e8173

Please sign in to comment.