/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.xpack.esql.plugin;

import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionListenerResponseHandler;
import org.elasticsearch.action.ActionRunnable;
import org.elasticsearch.action.OriginalIndices;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchShardsGroup;
import org.elasticsearch.action.search.SearchShardsRequest;
import org.elasticsearch.action.search.SearchShardsResponse;
import org.elasticsearch.action.search.TransportSearchShardsAction;
import org.elasticsearch.action.support.ChannelActionListener;
import org.elasticsearch.action.support.ContextPreservingActionListener;
import org.elasticsearch.action.support.RefCountingListener;
import org.elasticsearch.action.support.RefCountingRunnable;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.compute.data.BlockFactory;
import org.elasticsearch.compute.data.Page;
import org.elasticsearch.compute.operator.Driver;
import org.elasticsearch.compute.operator.DriverProfile;
import org.elasticsearch.compute.operator.DriverTaskRunner;
import org.elasticsearch.compute.operator.exchange.ExchangeService;
import org.elasticsearch.compute.operator.exchange.ExchangeSink;
import org.elasticsearch.compute.operator.exchange.ExchangeSinkHandler;
import org.elasticsearch.compute.operator.exchange.ExchangeSourceHandler;
import org.elasticsearch.core.IOUtils;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.shard.ShardNotFoundException;
import org.elasticsearch.logging.LogManager;
import org.elasticsearch.logging.Logger;
import org.elasticsearch.search.SearchService;
import org.elasticsearch.search.internal.AliasFilter;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.internal.ShardSearchRequest;
import org.elasticsearch.tasks.CancellableTask;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.tasks.TaskCancelledException;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.RemoteClusterAware;
import org.elasticsearch.transport.RemoteClusterService;
import org.elasticsearch.transport.Transport;
import org.elasticsearch.transport.TransportChannel;
import org.elasticsearch.transport.TransportRequestHandler;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportService;
import org.elasticsearch.xpack.esql.action.EsqlQueryAction;
import org.elasticsearch.xpack.esql.enrich.EnrichLookupService;
import org.elasticsearch.xpack.esql.plan.physical.ExchangeSinkExec;
import org.elasticsearch.xpack.esql.plan.physical.ExchangeSourceExec;
import org.elasticsearch.xpack.esql.plan.physical.FragmentExec;
import org.elasticsearch.xpack.esql.plan.physical.OutputExec;
import org.elasticsearch.xpack.esql.plan.physical.PhysicalPlan;
import org.elasticsearch.xpack.esql.planner.EsPhysicalOperationProviders;
import org.elasticsearch.xpack.esql.planner.LocalExecutionPlanner;
import org.elasticsearch.xpack.esql.planner.PlannerUtils;
import org.elasticsearch.xpack.esql.session.EsqlConfiguration;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicBoolean;

import static org.elasticsearch.xpack.esql.plugin.EsqlPlugin.ESQL_WORKER_THREAD_POOL_NAME;

/**
 * Computes the result of a {@link PhysicalPlan}.
 */
public class ComputeService {
    public record Result(List<Page> pages, List<DriverProfile> profiles) {}
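    // Rough usage sketch (assumed caller, not part of this class): the ES|QL query transport action is expected to
    // call something like
    //   computeService.execute(sessionId, rootTask, optimizedPhysicalPlan, configuration, listener);
    // and receive a Result carrying the collected pages, plus driver profiles when profiling is enabled.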

    private static final Logger LOGGER = LogManager.getLogger(ComputeService.class);
    private final SearchService searchService;
    private final BigArrays bigArrays;
    private final BlockFactory blockFactory;

    private final TransportService transportService;
    private final Executor esqlExecutor;
    private final DriverTaskRunner driverRunner;
    private final ExchangeService exchangeService;
    private final EnrichLookupService enrichLookupService;
    private final ClusterService clusterService;

    public ComputeService(
        SearchService searchService,
        TransportService transportService,
        ExchangeService exchangeService,
        EnrichLookupService enrichLookupService,
        ClusterService clusterService,
        ThreadPool threadPool,
        BigArrays bigArrays,
        BlockFactory blockFactory
    ) {
        this.searchService = searchService;
        this.transportService = transportService;
        this.bigArrays = bigArrays.withCircuitBreaking();
        this.blockFactory = blockFactory;
        this.esqlExecutor = threadPool.executor(ThreadPool.Names.SEARCH);
        transportService.registerRequestHandler(DATA_ACTION_NAME, this.esqlExecutor, DataNodeRequest::new, new DataNodeRequestHandler());
        transportService.registerRequestHandler(
            CLUSTER_ACTION_NAME,
            this.esqlExecutor,
            ClusterComputeRequest::new,
            new ClusterRequestHandler()
        );
        this.driverRunner = new DriverTaskRunner(transportService, this.esqlExecutor);
        this.exchangeService = exchangeService;
        this.enrichLookupService = enrichLookupService;
        this.clusterService = clusterService;
    }

    public void execute(
        String sessionId,
        CancellableTask rootTask,
        PhysicalPlan physicalPlan,
        EsqlConfiguration configuration,
        ActionListener<Result> listener
    ) {
        Tuple<PhysicalPlan, PhysicalPlan> coordinatorAndDataNodePlan = PlannerUtils.breakPlanBetweenCoordinatorAndDataNode(
            physicalPlan,
            configuration
        );
        final List<Page> collectedPages = Collections.synchronizedList(new ArrayList<>());
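        // If the request fails, release any pages collected so far so their blocks are not leaked.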
        listener = listener.delegateResponse((l, e) -> {
            collectedPages.forEach(p -> Releasables.closeExpectNoException(p::releaseBlocks));
            l.onFailure(e);
        });
        PhysicalPlan coordinatorPlan = new OutputExec(coordinatorAndDataNodePlan.v1(), collectedPages::add);
        PhysicalPlan dataNodePlan = coordinatorAndDataNodePlan.v2();
        if (dataNodePlan != null && dataNodePlan instanceof ExchangeSinkExec == false) {
            assert false : "expected data node plan starts with an ExchangeSink; got " + dataNodePlan;
            listener.onFailure(new IllegalStateException("expected data node plan starts with an ExchangeSink; got " + dataNodePlan));
            return;
        }
        Map<String, OriginalIndices> clusterToConcreteIndices = transportService.getRemoteClusterService()
            .groupIndices(SearchRequest.DEFAULT_INDICES_OPTIONS, PlannerUtils.planConcreteIndices(physicalPlan).toArray(String[]::new));
        QueryPragmas queryPragmas = configuration.pragmas();
        if (dataNodePlan == null) {
            if (clusterToConcreteIndices.values().stream().allMatch(v -> v.indices().length == 0) == false) {
                String error = "expected no concrete indices without data node plan; got " + clusterToConcreteIndices;
                assert false : error;
                listener.onFailure(new IllegalStateException(error));
                return;
            }
            var computeContext = new ComputeContext(
                sessionId,
                RemoteClusterAware.LOCAL_CLUSTER_GROUP_KEY,
                List.of(),
                configuration,
                null,
                null
            );
            try (
                var computeListener = new ComputeListener(
                    transportService,
                    rootTask,
                    listener.map(r -> new Result(collectedPages, r.getProfiles()))
                )
            ) {
                runCompute(rootTask, computeContext, coordinatorPlan, computeListener.acquireCompute());
                return;
            }
        } else {
            if (clusterToConcreteIndices.values().stream().allMatch(v -> v.indices().length == 0)) {
                var error = "expected concrete indices with data node plan but got empty; data node plan " + dataNodePlan;
                assert false : error;
                listener.onFailure(new IllegalStateException(error));
                return;
            }
        }
        Map<String, OriginalIndices> clusterToOriginalIndices = transportService.getRemoteClusterService()
            .groupIndices(SearchRequest.DEFAULT_INDICES_OPTIONS, PlannerUtils.planOriginalIndices(physicalPlan));
        var localOriginalIndices = clusterToOriginalIndices.remove(RemoteClusterAware.LOCAL_CLUSTER_GROUP_KEY);
        var localConcreteIndices = clusterToConcreteIndices.remove(RemoteClusterAware.LOCAL_CLUSTER_GROUP_KEY);
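        // The exchange source on the coordinator receives pages from data nodes of the local cluster and from remote clusters.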
        final var exchangeSource = new ExchangeSourceHandler(
            queryPragmas.exchangeBufferSize(),
            transportService.getThreadPool().executor(ThreadPool.Names.SEARCH)
        );
        try (
            Releasable ignored = exchangeSource.addEmptySink();
            var computeListener = new ComputeListener(
                transportService,
                rootTask,
                listener.map(r -> new Result(collectedPages, r.getProfiles()))
            )
        ) {
            // run compute on the coordinator
            exchangeSource.addCompletionListener(computeListener.acquireAvoid());
            runCompute(
                rootTask,
                new ComputeContext(sessionId, RemoteClusterAware.LOCAL_CLUSTER_GROUP_KEY, List.of(), configuration, exchangeSource, null),
                coordinatorPlan,
                computeListener.acquireCompute()
            );
            // starts computes on data nodes on the main cluster
            if (localConcreteIndices != null && localConcreteIndices.indices().length > 0) {
                startComputeOnDataNodes(
                    sessionId,
                    RemoteClusterAware.LOCAL_CLUSTER_GROUP_KEY,
                    rootTask,
                    configuration,
                    dataNodePlan,
                    Set.of(localConcreteIndices.indices()),
                    localOriginalIndices.indices(),
                    exchangeSource,
                    computeListener
                );
            }
            // starts computes on remote clusters
            startComputeOnRemoteClusters(
                sessionId,
                rootTask,
                configuration,
                dataNodePlan,
                exchangeSource,
                getRemoteClusters(clusterToConcreteIndices, clusterToOriginalIndices),
                computeListener
            );
        }
    }

    private List<RemoteCluster> getRemoteClusters(
        Map<String, OriginalIndices> clusterToConcreteIndices,
        Map<String, OriginalIndices> clusterToOriginalIndices
    ) {
        List<RemoteCluster> remoteClusters = new ArrayList<>(clusterToConcreteIndices.size());
        RemoteClusterService remoteClusterService = transportService.getRemoteClusterService();
        for (Map.Entry<String, OriginalIndices> e : clusterToConcreteIndices.entrySet()) {
            String clusterAlias = e.getKey();
            OriginalIndices concreteIndices = clusterToConcreteIndices.get(clusterAlias);
            OriginalIndices originalIndices = clusterToOriginalIndices.get(clusterAlias);
            if (originalIndices == null) {
                assert false : "can't find original indices for cluster " + clusterAlias;
                throw new IllegalStateException("can't find original indices for cluster " + clusterAlias);
            }
            if (concreteIndices.indices().length > 0) {
                Transport.Connection connection = remoteClusterService.getConnection(clusterAlias);
                remoteClusters.add(new RemoteCluster(clusterAlias, connection, concreteIndices.indices(), originalIndices.indices()));
            }
        }
        return remoteClusters;
    }

    private void startComputeOnDataNodes(
        String sessionId,
        String clusterAlias,
        CancellableTask parentTask,
        EsqlConfiguration configuration,
        PhysicalPlan dataNodePlan,
        Set<String> concreteIndices,
        String[] originalIndices,
        ExchangeSourceHandler exchangeSource,
        ComputeListener computeListener
    ) {
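        // When node-level reduction is enabled via pragma, wrap the fragment with a reducer so each data node can
        // partially reduce its shards' output (e.g. limit, topN, partial aggregation) before sending pages back.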
        var planWithReducer = configuration.pragmas().nodeLevelReduction() == false
            ? dataNodePlan
            : dataNodePlan.transformUp(FragmentExec.class, f -> {
                PhysicalPlan reductionNode = PlannerUtils.dataNodeReductionPlan(f.fragment(), dataNodePlan);
                return reductionNode == null ? f : f.withReducer(reductionNode);
            });

        // The lambda is to say if a TEXT field has an identical exact subfield
        // We cannot use SearchContext because we don't have it yet.
        // Since it's used only for @timestamp, it is relatively safe to assume it's not needed
        // but it would be better to have a proper impl.
        QueryBuilder requestFilter = PlannerUtils.requestFilter(planWithReducer, x -> true);
        var lookupListener = ActionListener.releaseAfter(computeListener.acquireAvoid(), exchangeSource.addEmptySink());
        lookupDataNodes(parentTask, clusterAlias, requestFilter, concreteIndices, originalIndices, ActionListener.wrap(dataNodes -> {
            try (RefCountingListener refs = new RefCountingListener(lookupListener)) {
                // For each target node, first open a remote exchange on the remote node, then link the exchange source to
                // the new remote exchange sink, and initialize the computation on the target node via data-node-request.
                for (DataNode node : dataNodes) {
                    var queryPragmas = configuration.pragmas();
                    ExchangeService.openExchange(
                        transportService,
                        node.connection,
                        sessionId,
                        queryPragmas.exchangeBufferSize(),
                        esqlExecutor,
                        refs.acquire().delegateFailureAndWrap((l, unused) -> {
                            var remoteSink = exchangeService.newRemoteSink(parentTask, sessionId, transportService, node.connection);
                            exchangeSource.addRemoteSink(remoteSink, queryPragmas.concurrentExchangeClients());
                            var dataNodeListener = ActionListener.runBefore(computeListener.acquireCompute(), () -> l.onResponse(null));
                            transportService.sendChildRequest(
                                node.connection,
                                DATA_ACTION_NAME,
                                new DataNodeRequest(
                                    sessionId,
                                    configuration,
                                    clusterAlias,
                                    node.shardIds,
                                    node.aliasFilters,
                                    planWithReducer
                                ),
                                parentTask,
                                TransportRequestOptions.EMPTY,
                                new ActionListenerResponseHandler<>(dataNodeListener, ComputeResponse::new, esqlExecutor)
                            );
                        })
                    );
                }
            }
        }, lookupListener::onFailure));
    }

    private void startComputeOnRemoteClusters(
        String sessionId,
        CancellableTask rootTask,
        EsqlConfiguration configuration,
        PhysicalPlan plan,
        ExchangeSourceHandler exchangeSource,
        List<RemoteCluster> clusters,
        ComputeListener computeListener
    ) {
        var queryPragmas = configuration.pragmas();
        var linkExchangeListeners = ActionListener.releaseAfter(computeListener.acquireAvoid(), exchangeSource.addEmptySink());
        try (RefCountingListener refs = new RefCountingListener(linkExchangeListeners)) {
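            // For each remote cluster: open an exchange there, link it as a remote sink of this coordinator's
            // exchange source, then send the cluster-level compute request.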
            for (RemoteCluster cluster : clusters) {
                ExchangeService.openExchange(
                    transportService,
                    cluster.connection,
                    sessionId,
                    queryPragmas.exchangeBufferSize(),
                    esqlExecutor,
                    refs.acquire().delegateFailureAndWrap((l, unused) -> {
                        var remoteSink = exchangeService.newRemoteSink(rootTask, sessionId, transportService, cluster.connection);
                        exchangeSource.addRemoteSink(remoteSink, queryPragmas.concurrentExchangeClients());
                        var clusterRequest = new ClusterComputeRequest(
                            cluster.clusterAlias,
                            sessionId,
                            configuration,
                            plan,
                            cluster.concreteIndices,
                            cluster.originalIndices
                        );
                        var clusterListener = ActionListener.runBefore(computeListener.acquireCompute(), () -> l.onResponse(null));
                        transportService.sendChildRequest(
                            cluster.connection,
                            CLUSTER_ACTION_NAME,
                            clusterRequest,
                            rootTask,
                            TransportRequestOptions.EMPTY,
                            new ActionListenerResponseHandler<>(clusterListener, ComputeResponse::new, esqlExecutor)
                        );
                    })
                );
            }
        }
    }

    void runCompute(CancellableTask task, ComputeContext context, PhysicalPlan plan, ActionListener<ComputeResponse> listener) {
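        // Release the acquired search contexts before completing the listener, whether the compute succeeds or fails.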
        listener = ActionListener.runBefore(listener, () -> Releasables.close(context.searchContexts));
        List<EsPhysicalOperationProviders.ShardContext> contexts = new ArrayList<>(context.searchContexts.size());
        for (int i = 0; i < context.searchContexts.size(); i++) {
            SearchContext searchContext = context.searchContexts.get(i);
            contexts.add(
                new EsPhysicalOperationProviders.DefaultShardContext(
                    i,
                    searchContext.getSearchExecutionContext(),
                    searchContext.request().getAliasFilter()
                )
            );
        }
        final List<Driver> drivers;
        try {
            LocalExecutionPlanner planner = new LocalExecutionPlanner(
                context.sessionId,
                context.clusterAlias,
                task,
                bigArrays,
                blockFactory,
                clusterService.getSettings(),
                context.configuration,
                context.exchangeSource(),
                context.exchangeSink(),
                enrichLookupService,
                new EsPhysicalOperationProviders(contexts)
            );

            LOGGER.debug("Received physical plan:\n{}", plan);
            plan = PlannerUtils.localPlan(context.searchExecutionContexts(), context.configuration, plan);
            // the planner will also set the driver parallelism in LocalExecutionPlanner.LocalExecutionPlan (used down below)
            // it's doing this in the planning of EsQueryExec (the source of the data)
            // see also EsPhysicalOperationProviders.sourcePhysicalOperation
            LocalExecutionPlanner.LocalExecutionPlan localExecutionPlan = planner.plan(plan);

            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("Local execution plan:\n{}", localExecutionPlan.describe());
            }
            drivers = localExecutionPlan.createDrivers(context.sessionId);
            if (drivers.isEmpty()) {
                throw new IllegalStateException("no drivers created");
            }
            LOGGER.debug("using {} drivers", drivers.size());
        } catch (Exception e) {
            listener.onFailure(e);
            return;
        }
        ActionListener<ComputeResponse> listenerCollectingStatus = listener.map(ignored -> {
            if (context.configuration.profile()) {
                return new ComputeResponse(drivers.stream().map(Driver::profile).toList());
            } else {
                return new ComputeResponse(List.of());
            }
        });
        listenerCollectingStatus = ActionListener.releaseAfter(listenerCollectingStatus, () -> Releasables.close(drivers));
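        // Run the drivers on the dedicated ESQL worker thread pool; driver profiles are only collected when profiling is enabled.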
        driverRunner.executeDrivers(
            task,
            drivers,
            transportService.getThreadPool().executor(ESQL_WORKER_THREAD_POOL_NAME),
            listenerCollectingStatus
        );
    }

    private void acquireSearchContexts(
        String clusterAlias,
        List<ShardId> shardIds,
        EsqlConfiguration configuration,
        Map<Index, AliasFilter> aliasFilters,
        ActionListener<List<SearchContext>> listener
    ) {
        final List<IndexShard> targetShards = new ArrayList<>();
        try {
            for (ShardId shardId : shardIds) {
                var indexShard = searchService.getIndicesService().indexServiceSafe(shardId.getIndex()).getShard(shardId.id());
                targetShards.add(indexShard);
            }
        } catch (Exception e) {
            listener.onFailure(e);
            return;
        }
        final var doAcquire = ActionRunnable.supply(listener, () -> {
            final List<SearchContext> searchContexts = new ArrayList<>(targetShards.size());
            boolean success = false;
            try {
                for (IndexShard shard : targetShards) {
                    var aliasFilter = aliasFilters.getOrDefault(shard.shardId().getIndex(), AliasFilter.EMPTY);
                    var shardRequest = new ShardSearchRequest(
                        shard.shardId(),
                        configuration.absoluteStartedTimeInMillis(),
                        aliasFilter,
                        clusterAlias
                    );
                    // TODO: `searchService.createSearchContext` allows opening search contexts without limits,
                    // we need to limit the number of active search contexts here or in SearchService
                    SearchContext context = searchService.createSearchContext(shardRequest, SearchService.NO_TIMEOUT);
                    searchContexts.add(context);
                }
                for (SearchContext searchContext : searchContexts) {
                    searchContext.preProcess();
                }
                success = true;
                return searchContexts;
            } finally {
                if (success == false) {
                    IOUtils.close(searchContexts);
                }
            }
        });
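        // If any shard had to wait for a refresh (search-idle shards), the final callback completes on a different
        // thread, so fork the context acquisition onto the ESQL executor; otherwise run it inline on the calling thread.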
        final AtomicBoolean waitedForRefreshes = new AtomicBoolean();
        try (RefCountingRunnable refs = new RefCountingRunnable(() -> {
            if (waitedForRefreshes.get()) {
                esqlExecutor.execute(doAcquire);
            } else {
                doAcquire.run();
            }
        })) {
            for (IndexShard targetShard : targetShards) {
                final Releasable ref = refs.acquire();
                targetShard.ensureShardSearchActive(await -> {
                    try (ref) {
                        if (await) {
                            waitedForRefreshes.set(true);
                        }
                    }
                });
            }
        }
    }

    record DataNode(Transport.Connection connection, List<ShardId> shardIds, Map<Index, AliasFilter> aliasFilters) {

    }

    record RemoteCluster(String clusterAlias, Transport.Connection connection, String[] concreteIndices, String[] originalIndices) {

    }

    /**
     * Performs can_match and find the target nodes for the given target indices and filter.
     * <p>
     * Ideally, the search_shards API should be called before the field-caps API; however, this can lead
     * to a situation where the column structure (i.e., matched data types) differs depending on the query.
     */
    private void lookupDataNodes(
        Task parentTask,
        String clusterAlias,
        QueryBuilder filter,
        Set<String> concreteIndices,
        String[] originalIndices,
        ActionListener<List<DataNode>> listener
    ) {
        ThreadContext threadContext = transportService.getThreadPool().getThreadContext();
        ActionListener<SearchShardsResponse> preservingContextListener = ContextPreservingActionListener.wrapPreservingContext(
            listener.map(resp -> {
                Map<String, DiscoveryNode> nodes = new HashMap<>();
                for (DiscoveryNode node : resp.getNodes()) {
                    nodes.put(node.getId(), node);
                }
                Map<String, List<ShardId>> nodeToShards = new HashMap<>();
                Map<String, Map<Index, AliasFilter>> nodeToAliasFilters = new HashMap<>();
                for (SearchShardsGroup group : resp.getGroups()) {
                    var shardId = group.shardId();
                    if (group.skipped()) {
                        continue;
                    }
                    if (group.allocatedNodes().isEmpty()) {
                        throw new ShardNotFoundException(group.shardId(), "no shard copies found {}", group.shardId());
                    }
                    if (concreteIndices.contains(shardId.getIndexName()) == false) {
                        continue;
                    }
                    String targetNode = group.allocatedNodes().get(0);
                    nodeToShards.computeIfAbsent(targetNode, k -> new ArrayList<>()).add(shardId);
                    AliasFilter aliasFilter = resp.getAliasFilters().get(shardId.getIndex().getUUID());
                    if (aliasFilter != null) {
                        nodeToAliasFilters.computeIfAbsent(targetNode, k -> new HashMap<>()).put(shardId.getIndex(), aliasFilter);
                    }
                }
                List<DataNode> dataNodes = new ArrayList<>(nodeToShards.size());
                for (Map.Entry<String, List<ShardId>> e : nodeToShards.entrySet()) {
                    DiscoveryNode node = nodes.get(e.getKey());
                    Map<Index, AliasFilter> aliasFilters = nodeToAliasFilters.getOrDefault(e.getKey(), Map.of());
                    dataNodes.add(new DataNode(transportService.getConnection(node), e.getValue(), aliasFilters));
                }
                return dataNodes;
            }),
            threadContext
        );
        try (ThreadContext.StoredContext ignored = threadContext.newStoredContextPreservingResponseHeaders()) {
            threadContext.markAsSystemContext();
            SearchShardsRequest searchShardsRequest = new SearchShardsRequest(
                originalIndices,
                SearchRequest.DEFAULT_INDICES_OPTIONS,
                filter,
                null,
                null,
                false,
                clusterAlias
            );
            transportService.sendChildRequest(
                transportService.getLocalNode(),
                TransportSearchShardsAction.TYPE.name(),
                searchShardsRequest,
                parentTask,
                TransportRequestOptions.EMPTY,
                new ActionListenerResponseHandler<>(preservingContextListener, SearchShardsResponse::new, esqlExecutor)
            );
        }
    }

    // TODO: Use an internal action here
    public static final String DATA_ACTION_NAME = EsqlQueryAction.NAME + "/data";

    private class DataNodeRequestExecutor {
        private final DataNodeRequest request;
        private final CancellableTask parentTask;
        private final ExchangeSinkHandler exchangeSink;
        private final ComputeListener computeListener;
        private final int maxConcurrentShards;
        private final ExchangeSink blockingSink; // block until we have completed on all shards or the coordinator has enough data

        DataNodeRequestExecutor(
            DataNodeRequest request,
            CancellableTask parentTask,
            ExchangeSinkHandler exchangeSink,
            int maxConcurrentShards,
            ComputeListener computeListener
        ) {
            this.request = request;
            this.parentTask = parentTask;
            this.exchangeSink = exchangeSink;
            this.computeListener = computeListener;
            this.maxConcurrentShards = maxConcurrentShards;
            this.blockingSink = exchangeSink.createExchangeSink();
        }

        void start() {
            parentTask.addListener(
                () -> exchangeService.finishSinkHandler(request.sessionId(), new TaskCancelledException(parentTask.getReasonCancelled()))
            );
            runBatch(0);
        }

        private void runBatch(int startBatchIndex) {
            final EsqlConfiguration configuration = request.configuration();
            final String clusterAlias = request.clusterAlias();
            final var sessionId = request.sessionId();
            final int endBatchIndex = Math.min(startBatchIndex + maxConcurrentShards, request.shardIds().size());
            List<ShardId> shardIds = request.shardIds().subList(startBatchIndex, endBatchIndex);
            ActionListener<ComputeResponse> batchListener = new ActionListener<>() {
                final ActionListener<ComputeResponse> ref = computeListener.acquireCompute();

                @Override
                public void onResponse(ComputeResponse result) {
                    try {
                        onBatchCompleted(endBatchIndex);
                    } finally {
                        ref.onResponse(result);
                    }
                }

                @Override
                public void onFailure(Exception e) {
                    try {
                        exchangeService.finishSinkHandler(request.sessionId(), e);
                    } finally {
                        ref.onFailure(e);
                    }
                }
            };
            acquireSearchContexts(clusterAlias, shardIds, configuration, request.aliasFilters(), ActionListener.wrap(searchContexts -> {
                assert ThreadPool.assertCurrentThreadPool(ThreadPool.Names.SEARCH, ESQL_WORKER_THREAD_POOL_NAME);
                var computeContext = new ComputeContext(sessionId, clusterAlias, searchContexts, configuration, null, exchangeSink);
                runCompute(parentTask, computeContext, request.plan(), batchListener);
            }, batchListener::onFailure));
        }

        private void onBatchCompleted(int lastBatchIndex) {
            if (lastBatchIndex < request.shardIds().size() && exchangeSink.isFinished() == false) {
                runBatch(lastBatchIndex);
            } else {
                // don't return until all pages are fetched
                var completionListener = computeListener.acquireAvoid();
                exchangeSink.addCompletionListener(
                    ActionListener.runAfter(completionListener, () -> exchangeService.finishSinkHandler(request.sessionId(), null))
                );
                blockingSink.finish();
            }
        }
    }

    private void runComputeOnDataNode(
        CancellableTask task,
        String externalId,
        PhysicalPlan reducePlan,
        DataNodeRequest request,
        ComputeListener computeListener
    ) {
        var parentListener = computeListener.acquireAvoid();
        try {
            // run compute with target shards
            var internalSink = exchangeService.createSinkHandler(request.sessionId(), request.pragmas().exchangeBufferSize());
            DataNodeRequestExecutor dataNodeRequestExecutor = new DataNodeRequestExecutor(
                request,
                task,
                internalSink,
                request.configuration().pragmas().maxConcurrentShardsPerNode(),
                computeListener
            );
            dataNodeRequestExecutor.start();
            // run the node-level reduction
            var externalSink = exchangeService.getSinkHandler(externalId);
            task.addListener(() -> exchangeService.finishSinkHandler(externalId, new TaskCancelledException(task.getReasonCancelled())));
            var exchangeSource = new ExchangeSourceHandler(1, esqlExecutor);
            exchangeSource.addCompletionListener(computeListener.acquireAvoid());
            exchangeSource.addRemoteSink(internalSink::fetchPageAsync, 1);
            ActionListener<ComputeResponse> reductionListener = computeListener.acquireCompute();
            runCompute(
                task,
                new ComputeContext(
                    request.sessionId(),
                    request.clusterAlias(),
                    List.of(),
                    request.configuration(),
                    exchangeSource,
                    externalSink
                ),
                reducePlan,
                ActionListener.wrap(resp -> {
                    // don't return until all pages are fetched
                    externalSink.addCompletionListener(ActionListener.running(() -> {
                        exchangeService.finishSinkHandler(externalId, null);
                        reductionListener.onResponse(resp);
                    }));
                }, e -> {
                    exchangeService.finishSinkHandler(externalId, e);
                    reductionListener.onFailure(e);
                })
            );
            parentListener.onResponse(null);
        } catch (Exception e) {
            exchangeService.finishSinkHandler(externalId, e);
            exchangeService.finishSinkHandler(request.sessionId(), e);
            parentListener.onFailure(e);
        }
    }

    private class DataNodeRequestHandler implements TransportRequestHandler<DataNodeRequest> {
        @Override
        public void messageReceived(DataNodeRequest request, TransportChannel channel, Task task) {
            final ActionListener<ComputeResponse> listener = new ChannelActionListener<>(channel);
            final ExchangeSinkExec reducePlan;
            if (request.plan() instanceof ExchangeSinkExec plan) {
                var fragments = plan.collectFirstChildren(FragmentExec.class::isInstance);
                if (fragments.isEmpty()) {
                    listener.onFailure(new IllegalStateException("expected a fragment plan for a remote compute; got " + request.plan()));
                    return;
                }
                var localExchangeSource = new ExchangeSourceExec(plan.source(), plan.output(), plan.isIntermediateAgg());
                FragmentExec fragment = (FragmentExec) fragments.get(0);
                reducePlan = new ExchangeSinkExec(
                    plan.source(),
                    plan.output(),
                    plan.isIntermediateAgg(),
                    fragment.reducer() != null ? fragment.reducer().replaceChildren(List.of(localExchangeSource)) : localExchangeSource
                );
            } else {
                listener.onFailure(new IllegalStateException("expected exchange sink for a remote compute; got " + request.plan()));
                return;
            }
            final String sessionId = request.sessionId();
            request = new DataNodeRequest(
                sessionId + "[n]", // internal session
                request.configuration(),
                request.clusterAlias(),
                request.shardIds(),
                request.aliasFilters(),
                request.plan()
            );
            try (var computeListener = new ComputeListener(transportService, (CancellableTask) task, listener)) {
                runComputeOnDataNode((CancellableTask) task, sessionId, reducePlan, request, computeListener);
            }
        }
    }

    public static final String CLUSTER_ACTION_NAME = EsqlQueryAction.NAME + "/cluster";

    private class ClusterRequestHandler implements TransportRequestHandler<ClusterComputeRequest> {
        @Override
        public void messageReceived(ClusterComputeRequest request, TransportChannel channel, Task task) {
            ChannelActionListener<ComputeResponse> listener = new ChannelActionListener<>(channel);
            if (request.plan() instanceof ExchangeSinkExec == false) {
                listener.onFailure(new IllegalStateException("expected exchange sink for a remote compute; got " + request.plan()));
                return;
            }
            try (var computeListener = new ComputeListener(transportService, (CancellableTask) task, listener)) {
                runComputeOnRemoteCluster(
                    request.clusterAlias(),
                    request.sessionId(),
                    (CancellableTask) task,
                    request.configuration(),
                    (ExchangeSinkExec) request.plan(),
                    Set.of(request.indices()),
                    request.originalIndices(),
                    computeListener
                );
            }
        }
    }

    /**
     * Performs a compute on a remote cluster. The output pages are placed in an exchange sink specified by
     * {@code globalSessionId}. The coordinator on the main cluster will poll pages from there.
     * <p>
     * Currently, the coordinator on the remote cluster simply collects pages from data nodes in the remote cluster
     * and places them in the exchange sink. We can achieve this by using a single exchange buffer to minimize overhead.
     * However, here we use two exchange buffers so that we can run an actual plan on this coordinator to perform partial
     * reduce operations, such as limit, topN, and partial-to-partial aggregation in the future.
     */
    void runComputeOnRemoteCluster(
        String clusterAlias,
        String globalSessionId,
        CancellableTask parentTask,
        EsqlConfiguration configuration,
        ExchangeSinkExec plan,
        Set<String> concreteIndices,
        String[] originalIndices,
        ComputeListener computeListener
    ) {
        final var exchangeSink = exchangeService.getSinkHandler(globalSessionId);
        parentTask.addListener(
            () -> exchangeService.finishSinkHandler(globalSessionId, new TaskCancelledException(parentTask.getReasonCancelled()))
        );
        final String localSessionId = clusterAlias + ":" + globalSessionId;
        var exchangeSource = new ExchangeSourceHandler(
            configuration.pragmas().exchangeBufferSize(),
            transportService.getThreadPool().executor(ThreadPool.Names.SEARCH)
        );
        try (Releasable ignored = exchangeSource.addEmptySink()) {
            exchangeSink.addCompletionListener(computeListener.acquireAvoid());
            exchangeSource.addCompletionListener(computeListener.acquireAvoid());
            PhysicalPlan coordinatorPlan = new ExchangeSinkExec(
                plan.source(),
                plan.output(),
                plan.isIntermediateAgg(),
                new ExchangeSourceExec(plan.source(), plan.output(), plan.isIntermediateAgg())
            );
            runCompute(
                parentTask,
                new ComputeContext(localSessionId, clusterAlias, List.of(), configuration, exchangeSource, exchangeSink),
                coordinatorPlan,
                computeListener.acquireCompute()
            );
            startComputeOnDataNodes(
                localSessionId,
                clusterAlias,
                parentTask,
                configuration,
                plan,
                concreteIndices,
                originalIndices,
                exchangeSource,
                computeListener
            );
        }
    }

    record ComputeContext(
        String sessionId,
        String clusterAlias,
        List<SearchContext> searchContexts,
        EsqlConfiguration configuration,
        ExchangeSourceHandler exchangeSource,
        ExchangeSinkHandler exchangeSink
    ) {
        public List<SearchExecutionContext> searchExecutionContexts() {
            return searchContexts.stream().map(ctx -> ctx.getSearchExecutionContext()).toList();
        }
    }
}




