Skip to content

Commit

Permalink
Protect NodeConnectionsService from stale conns (#101988)
Browse files Browse the repository at this point in the history
A call to `ConnectionTarget#connect` which happens strictly after all
calls that close connections should leave us connected to the target.
However concurrent calls to `ConnectionTarget#connect` can overlap, and
today this means that a connection returned from an earlier call may
overwrite one from a later call. The trouble is that the earlier
connection attempt may yield a closed connection (it was concurrent with
the disconnections) so we must not let it supersede the newer one.

With this commit we prevent concurrent connection attempts, which avoids
earlier attempts from overwriting the connections resulting from later
attempts.

Backport of #92558
When combined with #101910, closes #100493
  • Loading branch information
DaveCTurner authored Nov 9, 2023
1 parent 7d975ab commit 5ab8599
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 93 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/92558.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 92558
summary: Protect `NodeConnectionsService` from stale conns
area: Network
type: bug
issues:
- 92029
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
Expand Down Expand Up @@ -219,12 +220,19 @@ public void reconnectToNodes(DiscoveryNodes discoveryNodes, Runnable onCompletio
});
}

// placeholder listener for a fire-and-forget connection attempt
private static final ActionListener<Void> NOOP = ActionListener.wrap(r -> {}, e -> {});

private class ConnectionTarget {
private final DiscoveryNode discoveryNode;

private final AtomicInteger consecutiveFailureCount = new AtomicInteger();
private final AtomicReference<Releasable> connectionRef = new AtomicReference<>();

// all access to these fields is synchronized
private ActionListener<Void> pendingListener;
private boolean connectionInProgress;

ConnectionTarget(DiscoveryNode discoveryNode) {
this.discoveryNode = discoveryNode;
}
Expand All @@ -235,57 +243,101 @@ private void setConnectionRef(Releasable connectionReleasable) {

Runnable connect(ActionListener<Void> listener) {
return () -> {
final boolean alreadyConnected = transportService.nodeConnected(discoveryNode);
registerListener(listener);
doConnect();
};
}

if (alreadyConnected) {
logger.trace("refreshing connection to {}", discoveryNode);
} else {
logger.debug("connecting to {}", discoveryNode);
private synchronized void registerListener(ActionListener<Void> listener) {
if (listener == null) {
pendingListener = pendingListener == null ? NOOP : pendingListener;
} else if (pendingListener == null || pendingListener == NOOP) {
pendingListener = listener;
} else if (pendingListener instanceof ListenableFuture<?>) {
((ListenableFuture<Void>) pendingListener).addListener(listener);
} else {
ListenableFuture<Void> wrapper = new ListenableFuture<Void>();
wrapper.addListener(pendingListener);
wrapper.addListener(listener);
pendingListener = wrapper;
}
}

private synchronized ActionListener<Void> acquireListener() {
// Avoid concurrent connection attempts because they don't necessarily complete in order otherwise, and out-of-order completion
// might mean we end up disconnected from a node even though we triggered a call to connect() after all close() calls had
// finished.
if (connectionInProgress == false) {
ActionListener<Void> listener = pendingListener;
if (listener != null) {
pendingListener = null;
connectionInProgress = true;
return listener;
}
}
return null;
}

private synchronized void releaseListener() {
assert connectionInProgress;
connectionInProgress = false;
}

private void doConnect() {
ActionListener<Void> listener = acquireListener();
if (listener == null) {
return;
}

final boolean alreadyConnected = transportService.nodeConnected(discoveryNode);

if (alreadyConnected) {
logger.trace("refreshing connection to {}", discoveryNode);
} else {
logger.debug("connecting to {}", discoveryNode);
}

// It's possible that connectionRef is a reference to an older connection that closed out from under us, but that something
// else has opened a fresh connection to the node. Therefore we always call connectToNode() and update connectionRef.
transportService.connectToNode(discoveryNode, new ActionListener<Releasable>() {
@Override
public void onResponse(Releasable connectionReleasable) {
if (alreadyConnected) {
logger.trace("refreshed connection to {}", discoveryNode);
} else {
logger.debug("connected to {}", discoveryNode);
}
consecutiveFailureCount.set(0);
setConnectionRef(connectionReleasable);

final boolean isActive;
synchronized (mutex) {
isActive = targetsByNode.get(discoveryNode) == ConnectionTarget.this;
}
if (isActive == false) {
logger.debug("connected to stale {} - releasing stale connection", discoveryNode);
setConnectionRef(null);
}
if (listener != null) {
listener.onResponse(null);
}
// It's possible that connectionRef is a reference to an older connection that closed out from under us, but that something else
// has opened a fresh connection to the node. Therefore we always call connectToNode() and update connectionRef.
transportService.connectToNode(discoveryNode, ActionListener.runAfter(new ActionListener<Releasable>() {
@Override
public void onResponse(Releasable connectionReleasable) {
if (alreadyConnected) {
logger.trace("refreshed connection to {}", discoveryNode);
} else {
logger.debug("connected to {}", discoveryNode);
}
consecutiveFailureCount.set(0);
setConnectionRef(connectionReleasable);

@Override
public void onFailure(Exception e) {
final int currentFailureCount = consecutiveFailureCount.incrementAndGet();
// only warn every 6th failure
final Level level = currentFailureCount % 6 == 1 ? Level.WARN : Level.DEBUG;
logger.log(
level,
new ParameterizedMessage("failed to connect to {} (tried [{}] times)", discoveryNode, currentFailureCount),
e
);
final boolean isActive;
synchronized (mutex) {
isActive = targetsByNode.get(discoveryNode) == ConnectionTarget.this;
}
if (isActive == false) {
logger.debug("connected to stale {} - releasing stale connection", discoveryNode);
setConnectionRef(null);
if (listener != null) {
listener.onFailure(e);
}
}
});
};
listener.onResponse(null);
}

@Override
public void onFailure(Exception e) {
final int currentFailureCount = consecutiveFailureCount.incrementAndGet();
// only warn every 6th failure
final Level level = currentFailureCount % 6 == 1 ? Level.WARN : Level.DEBUG;
logger.log(
level,
new ParameterizedMessage("failed to connect to {} (tried [{}] times)", discoveryNode, currentFailureCount),
e
);
setConnectionRef(null);
listener.onFailure(e);
}
}, () -> {
releaseListener();
transportService.getThreadPool().generic().execute(this::doConnect);
}));
}

void disconnect() {
Expand Down
Loading

0 comments on commit 5ab8599

Please sign in to comment.