Catch and handle disconnect exceptions in search (#115836)

Getting a connection can throw an exception when the target node has disconnected.
The call sites adjusted in this commit failed to handle such exceptions, causing a
whole-phase failure (and possible memory leaks for outstanding operations) instead
of correctly recording a per-shard failure.
This commit is contained in:
Armin Braun 2024-10-29 14:11:14 +01:00 committed by GitHub
parent 6742147d6a
commit 78a531bf4e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 111 additions and 65 deletions

View File

@ -0,0 +1,5 @@
pr: 115836
summary: Catch and handle disconnect exceptions in search
area: Search
type: bug
issues: []

View File

@ -84,15 +84,20 @@ final class DfsQueryPhase extends SearchPhase {
for (final DfsSearchResult dfsResult : searchResults) {
final SearchShardTarget shardTarget = dfsResult.getSearchShardTarget();
Transport.Connection connection = context.getConnection(shardTarget.getClusterAlias(), shardTarget.getNodeId());
ShardSearchRequest shardRequest = rewriteShardSearchRequest(dfsResult.getShardSearchRequest());
final int shardIndex = dfsResult.getShardIndex();
QuerySearchRequest querySearchRequest = new QuerySearchRequest(
context.getOriginalIndices(dfsResult.getShardIndex()),
context.getOriginalIndices(shardIndex),
dfsResult.getContextId(),
shardRequest,
rewriteShardSearchRequest(dfsResult.getShardSearchRequest()),
dfs
);
final int shardIndex = dfsResult.getShardIndex();
final Transport.Connection connection;
try {
connection = context.getConnection(shardTarget.getClusterAlias(), shardTarget.getNodeId());
} catch (Exception e) {
shardFailure(e, querySearchRequest, shardIndex, shardTarget, counter);
return;
}
searchTransportService.sendExecuteQuery(
connection,
querySearchRequest,
@ -112,10 +117,7 @@ final class DfsQueryPhase extends SearchPhase {
@Override
public void onFailure(Exception exception) {
try {
context.getLogger()
.debug(() -> "[" + querySearchRequest.contextId() + "] Failed to execute query phase", exception);
progressListener.notifyQueryFailure(shardIndex, shardTarget, exception);
counter.onFailure(shardIndex, shardTarget, exception);
shardFailure(exception, querySearchRequest, shardIndex, shardTarget, counter);
} finally {
if (context.isPartOfPointInTime(querySearchRequest.contextId()) == false) {
// the query might not have been executed at all (for example because thread pool rejected
@ -134,6 +136,18 @@ final class DfsQueryPhase extends SearchPhase {
}
}
/**
 * Records a per-shard query-phase failure: logs it at debug level, notifies the
 * progress listener, and counts the shard down on the collector so the phase can
 * still complete once every shard has reported.
 *
 * @param exception          the failure to record
 * @param querySearchRequest the request whose context id is used in the log message
 * @param shardIndex         index of the failed shard within the phase
 * @param shardTarget        shard the request was routed to
 * @param counter            collector tracking outstanding per-shard operations
 */
private void shardFailure(
    Exception exception,
    QuerySearchRequest querySearchRequest,
    int shardIndex,
    SearchShardTarget shardTarget,
    CountedCollector<SearchPhaseResult> counter
) {
    final var logger = context.getLogger();
    // message supplier keeps contextId() evaluation lazy when debug logging is disabled
    logger.debug(() -> "[" + querySearchRequest.contextId() + "] Failed to execute query phase", exception);
    progressListener.notifyQueryFailure(shardIndex, shardTarget, exception);
    counter.onFailure(shardIndex, shardTarget, exception);
}
// package private for testing
ShardSearchRequest rewriteShardSearchRequest(ShardSearchRequest request) {
SearchSourceBuilder source = request.source();

View File

@ -21,6 +21,7 @@ import org.elasticsearch.search.fetch.ShardFetchSearchRequest;
import org.elasticsearch.search.internal.ShardSearchContextId;
import org.elasticsearch.search.rank.RankDoc;
import org.elasticsearch.search.rank.RankDocShardInfo;
import org.elasticsearch.transport.Transport;
import java.util.ArrayList;
import java.util.HashMap;
@ -214,9 +215,41 @@ final class FetchSearchPhase extends SearchPhase {
final ShardSearchContextId contextId = shardPhaseResult.queryResult() != null
? shardPhaseResult.queryResult().getContextId()
: shardPhaseResult.rankFeatureResult().getContextId();
var listener = new SearchActionListener<FetchSearchResult>(shardTarget, shardIndex) {
@Override
public void innerOnResponse(FetchSearchResult result) {
try {
progressListener.notifyFetchResult(shardIndex);
counter.onResult(result);
} catch (Exception e) {
context.onPhaseFailure(FetchSearchPhase.this, "", e);
}
}
@Override
public void onFailure(Exception e) {
try {
logger.debug(() -> "[" + contextId + "] Failed to execute fetch phase", e);
progressListener.notifyFetchFailure(shardIndex, shardTarget, e);
counter.onFailure(shardIndex, shardTarget, e);
} finally {
// the search context might not be cleared on the node where the fetch was executed for example
// because the action was rejected by the thread pool. in this case we need to send a dedicated
// request to clear the search context.
releaseIrrelevantSearchContext(shardPhaseResult, context);
}
}
};
final Transport.Connection connection;
try {
connection = context.getConnection(shardTarget.getClusterAlias(), shardTarget.getNodeId());
} catch (Exception e) {
listener.onFailure(e);
return;
}
context.getSearchTransport()
.sendExecuteFetch(
context.getConnection(shardTarget.getClusterAlias(), shardTarget.getNodeId()),
connection,
new ShardFetchSearchRequest(
context.getOriginalIndices(shardPhaseResult.getShardIndex()),
contextId,
@ -228,31 +261,7 @@ final class FetchSearchPhase extends SearchPhase {
aggregatedDfs
),
context.getTask(),
new SearchActionListener<>(shardTarget, shardIndex) {
@Override
public void innerOnResponse(FetchSearchResult result) {
try {
progressListener.notifyFetchResult(shardIndex);
counter.onResult(result);
} catch (Exception e) {
context.onPhaseFailure(FetchSearchPhase.this, "", e);
}
}
@Override
public void onFailure(Exception e) {
try {
logger.debug(() -> "[" + contextId + "] Failed to execute fetch phase", e);
progressListener.notifyFetchFailure(shardIndex, shardTarget, e);
counter.onFailure(shardIndex, shardTarget, e);
} finally {
// the search context might not be cleared on the node where the fetch was executed for example
// because the action was rejected by the thread pool. in this case we need to send a dedicated
// request to clear the search context.
releaseIrrelevantSearchContext(shardPhaseResult, context);
}
}
}
listener
);
}

View File

@ -24,6 +24,7 @@ import org.elasticsearch.search.rank.context.RankFeaturePhaseRankCoordinatorCont
import org.elasticsearch.search.rank.feature.RankFeatureDoc;
import org.elasticsearch.search.rank.feature.RankFeatureResult;
import org.elasticsearch.search.rank.feature.RankFeatureShardRequest;
import org.elasticsearch.transport.Transport;
import java.util.List;
@ -131,9 +132,38 @@ public class RankFeaturePhase extends SearchPhase {
final SearchShardTarget shardTarget = queryResult.queryResult().getSearchShardTarget();
final ShardSearchContextId contextId = queryResult.queryResult().getContextId();
final int shardIndex = queryResult.getShardIndex();
var listener = new SearchActionListener<RankFeatureResult>(shardTarget, shardIndex) {
@Override
protected void innerOnResponse(RankFeatureResult response) {
try {
progressListener.notifyRankFeatureResult(shardIndex);
rankRequestCounter.onResult(response);
} catch (Exception e) {
context.onPhaseFailure(RankFeaturePhase.this, "", e);
}
}
@Override
public void onFailure(Exception e) {
try {
logger.debug(() -> "[" + contextId + "] Failed to execute rank phase", e);
progressListener.notifyRankFeatureFailure(shardIndex, shardTarget, e);
rankRequestCounter.onFailure(shardIndex, shardTarget, e);
} finally {
releaseIrrelevantSearchContext(queryResult, context);
}
}
};
final Transport.Connection connection;
try {
connection = context.getConnection(shardTarget.getClusterAlias(), shardTarget.getNodeId());
} catch (Exception e) {
listener.onFailure(e);
return;
}
context.getSearchTransport()
.sendExecuteRankFeature(
context.getConnection(shardTarget.getClusterAlias(), shardTarget.getNodeId()),
connection,
new RankFeatureShardRequest(
context.getOriginalIndices(queryResult.getShardIndex()),
queryResult.getContextId(),
@ -141,28 +171,7 @@ public class RankFeaturePhase extends SearchPhase {
entry
),
context.getTask(),
new SearchActionListener<>(shardTarget, shardIndex) {
@Override
protected void innerOnResponse(RankFeatureResult response) {
try {
progressListener.notifyRankFeatureResult(shardIndex);
rankRequestCounter.onResult(response);
} catch (Exception e) {
context.onPhaseFailure(RankFeaturePhase.this, "", e);
}
}
@Override
public void onFailure(Exception e) {
try {
logger.debug(() -> "[" + contextId + "] Failed to execute rank phase", e);
progressListener.notifyRankFeatureFailure(shardIndex, shardTarget, e);
rankRequestCounter.onFailure(shardIndex, shardTarget, e);
} finally {
releaseIrrelevantSearchContext(queryResult, context);
}
}
}
listener
);
}

View File

@ -87,12 +87,14 @@ final class SearchDfsQueryThenFetchAsyncAction extends AbstractSearchAsyncAction
final SearchShardTarget shard,
final SearchActionListener<DfsSearchResult> listener
) {
getSearchTransport().sendExecuteDfs(
getConnection(shard.getClusterAlias(), shard.getNodeId()),
buildShardSearchRequest(shardIt, listener.requestIndex),
getTask(),
listener
);
final Transport.Connection connection;
try {
connection = getConnection(shard.getClusterAlias(), shard.getNodeId());
} catch (Exception e) {
listener.onFailure(e);
return;
}
getSearchTransport().sendExecuteDfs(connection, buildShardSearchRequest(shardIt, listener.requestIndex), getTask(), listener);
}
@Override

View File

@ -94,8 +94,15 @@ class SearchQueryThenFetchAsyncAction extends AbstractSearchAsyncAction<SearchPh
final SearchShardTarget shard,
final SearchActionListener<SearchPhaseResult> listener
) {
final Transport.Connection connection;
try {
connection = getConnection(shard.getClusterAlias(), shard.getNodeId());
} catch (Exception e) {
listener.onFailure(e);
return;
}
ShardSearchRequest request = rewriteShardSearchRequest(super.buildShardSearchRequest(shardIt, listener.requestIndex));
getSearchTransport().sendExecuteQuery(getConnection(shard.getClusterAlias(), shard.getNodeId()), request, getTask(), listener);
getSearchTransport().sendExecuteQuery(connection, request, getTask(), listener);
}
@Override