/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.execution.scheduler;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Suppliers;
import com.google.common.base.Ticker;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.SetMultimap;
import com.google.common.util.concurrent.ListenableFuture;
import io.airlift.log.Logger;
import io.trino.execution.NodeTaskMap;
import io.trino.execution.RemoteTask;
import io.trino.execution.resourcegroups.IndexedPriorityQueue;
import io.trino.execution.scheduler.NodeSchedulerConfig.SplitsBalancingPolicy;
import io.trino.metadata.InternalNode;
import io.trino.metadata.InternalNodeManager;
import io.trino.metadata.Split;
import io.trino.spi.HostAddress;
import io.trino.spi.SplitWeight;
import io.trino.spi.TrinoException;
import jakarta.annotation.Nullable;

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.execution.scheduler.NodeScheduler.calculateLowWatermark;
import static io.trino.execution.scheduler.NodeScheduler.filterNodes;
import static io.trino.execution.scheduler.NodeScheduler.getAllNodes;
import static io.trino.execution.scheduler.NodeScheduler.randomizedNodes;
import static io.trino.execution.scheduler.NodeScheduler.selectDistributionNodes;
import static io.trino.execution.scheduler.NodeScheduler.selectExactNodes;
import static io.trino.execution.scheduler.NodeScheduler.selectNodes;
import static io.trino.execution.scheduler.NodeScheduler.toWhenHasSplitQueueSpaceFuture;
import static io.trino.spi.StandardErrorCode.NO_NODES_AVAILABLE;
import static java.util.Comparator.comparingLong;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.SECONDS;

public class UniformNodeSelector
        implements NodeSelector
{
    private static final Logger log = Logger.get(UniformNodeSelector.class);

    private final InternalNodeManager nodeManager;
    private final NodeTaskMap nodeTaskMap;
    private final boolean includeCoordinator;
    private final AtomicReference<Supplier<NodeMap>> nodeMap;
    private final int minCandidates;
    private final long maxSplitsWeightPerNode;
    private final long minPendingSplitsWeightPerTask;
    private final int maxUnacknowledgedSplitsPerTask;
    private final SplitsBalancingPolicy splitsBalancingPolicy;
    private final boolean optimizedLocalScheduling;
    private final QueueSizeAdjuster queueSizeAdjuster;

    public UniformNodeSelector(
            InternalNodeManager nodeManager,
            NodeTaskMap nodeTaskMap,
            boolean includeCoordinator,
            Supplier<NodeMap> nodeMap,
            int minCandidates,
            long maxSplitsWeightPerNode,
            long minPendingSplitsWeightPerTask,
            long maxAdjustedPendingSplitsWeightPerTask,
            int maxUnacknowledgedSplitsPerTask,
            SplitsBalancingPolicy splitsBalancingPolicy,
            boolean optimizedLocalScheduling)
    {
        this(nodeManager,
                nodeTaskMap,
                includeCoordinator,
                nodeMap,
                minCandidates,
                maxSplitsWeightPerNode,
                minPendingSplitsWeightPerTask,
                maxUnacknowledgedSplitsPerTask,
                splitsBalancingPolicy,
                optimizedLocalScheduling,
                new QueueSizeAdjuster(minPendingSplitsWeightPerTask, maxAdjustedPendingSplitsWeightPerTask));
    }

    @VisibleForTesting
    UniformNodeSelector(
            InternalNodeManager nodeManager,
            NodeTaskMap nodeTaskMap,
            boolean includeCoordinator,
            Supplier<NodeMap> nodeMap,
            int minCandidates,
            long maxSplitsWeightPerNode,
            long minPendingSplitsWeightPerTask,
            int maxUnacknowledgedSplitsPerTask,
            SplitsBalancingPolicy splitsBalancingPolicy,
            boolean optimizedLocalScheduling,
            QueueSizeAdjuster queueSizeAdjuster)
    {
        this.nodeManager = requireNonNull(nodeManager, "nodeManager is null");
        this.nodeTaskMap = requireNonNull(nodeTaskMap, "nodeTaskMap is null");
        this.includeCoordinator = includeCoordinator;
        this.nodeMap = new AtomicReference<>(nodeMap);
        this.minCandidates = minCandidates;
        this.maxSplitsWeightPerNode = maxSplitsWeightPerNode;
        this.minPendingSplitsWeightPerTask = minPendingSplitsWeightPerTask;
        this.maxUnacknowledgedSplitsPerTask = maxUnacknowledgedSplitsPerTask;
        checkArgument(maxUnacknowledgedSplitsPerTask > 0, "maxUnacknowledgedSplitsPerTask must be > 0, found: %s", maxUnacknowledgedSplitsPerTask);
        this.splitsBalancingPolicy = requireNonNull(splitsBalancingPolicy, "splitsBalancingPolicy is null");
        this.optimizedLocalScheduling = optimizedLocalScheduling;
        this.queueSizeAdjuster = queueSizeAdjuster;
    }

    @Override
    public void lockDownNodes()
    {
        nodeMap.set(Suppliers.ofInstance(nodeMap.get().get()));
    }

    @Override
    public List<InternalNode> allNodes()
    {
        return getAllNodes(nodeMap.get().get(), includeCoordinator);
    }

    @Override
    public InternalNode selectCurrentNode()
    {
        // TODO: this is a hack to force scheduling on the coordinator
        return nodeManager.getCurrentNode();
    }

    @Override
    public List<InternalNode> selectRandomNodes(int limit, Set<InternalNode> excludedNodes)
    {
        return selectNodes(limit, randomizedNodes(nodeMap.get().get(), includeCoordinator, excludedNodes));
    }

    @Override
    public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks)
    {
        Multimap<InternalNode, Split> assignment = HashMultimap.create();
        NodeMap nodeMap = this.nodeMap.get().get();
        NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMap, existingTasks);
        queueSizeAdjuster.update(existingTasks, assignmentStats);
        Set<InternalNode> blockedExactNodes = new HashSet<>();
        boolean splitWaitingForAnyNode = false;
        // splitsToBeRedistributed becomes true only when splits go through locality-based assignment
        boolean splitsToBeRedistributed = false;
        Set<Split> remainingSplits = new HashSet<>(splits.size());

        List<InternalNode> filteredNodes = filterNodes(nodeMap, includeCoordinator, ImmutableSet.of());
        ResettableRandomizedIterator<InternalNode> randomCandidates = new ResettableRandomizedIterator<>(filteredNodes);
        Set<InternalNode> schedulableNodes = new HashSet<>(filteredNodes);

        // optimizedLocalScheduling enables prioritized assignment of splits to local nodes when splits contain locality information
        if (optimizedLocalScheduling) {
            for (Split split : splits) {
                if (split.isRemotelyAccessible() && !split.getAddresses().isEmpty()) {
                    List<InternalNode> candidateNodes = selectExactNodes(nodeMap, split.getAddresses(), includeCoordinator);

                    Optional<InternalNode> chosenNode = candidateNodes.stream()
                            .filter(ownerNode -> assignmentStats.getTotalSplitsWeight(ownerNode) < maxSplitsWeightPerNode && assignmentStats.getUnacknowledgedSplitCountForStage(ownerNode) < maxUnacknowledgedSplitsPerTask)
                            .min(comparingLong(assignmentStats::getTotalSplitsWeight));

                    if (chosenNode.isPresent()) {
                        assignment.put(chosenNode.get(), split);
                        assignmentStats.addAssignedSplit(chosenNode.get(), split.getSplitWeight());
                        splitsToBeRedistributed = true;
                        continue;
                    }
                }
                remainingSplits.add(split);
            }
        }
        else {
            remainingSplits = splits;
        }

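        // Second pass: for each remaining split, pick a candidate node via the configured balancing policy.
        // If every candidate is at capacity, fall back to the candidate with the smallest queued weight that
        // is still below its (possibly adjusted) pending-splits limit. Splits that are not remotely
        // accessible may only go to the nodes hosting their data.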
        for (Split split : remainingSplits) {
            randomCandidates.reset();

            List<InternalNode> candidateNodes;
            if (!split.isRemotelyAccessible()) {
                candidateNodes = selectExactNodes(nodeMap, split.getAddresses(), includeCoordinator);
            }
            else {
                candidateNodes = selectNodes(minCandidates, randomCandidates);
            }
            if (candidateNodes.isEmpty()) {
                log.debug("No nodes available to schedule %s. Available nodes %s", split, nodeMap.getNodesByHost().keys());
                throw new TrinoException(NO_NODES_AVAILABLE, "No nodes available to run query");
            }

            InternalNode chosenNode = chooseNodeForSplit(assignmentStats, candidateNodes);
            if (chosenNode == null) {
                long minWeight = Long.MAX_VALUE;
                for (InternalNode node : candidateNodes) {
                    long queuedWeight = assignmentStats.getQueuedSplitsWeightForStage(node);
                    long adjustedMaxPendingSplitsWeightPerTask = queueSizeAdjuster.getAdjustedMaxPendingSplitsWeightPerTask(node.getNodeIdentifier());

                    if (queuedWeight <= minWeight && queuedWeight < adjustedMaxPendingSplitsWeightPerTask && assignmentStats.getUnacknowledgedSplitCountForStage(node) < maxUnacknowledgedSplitsPerTask) {
                        chosenNode = node;
                        minWeight = queuedWeight;
                    }
                    if (queuedWeight >= adjustedMaxPendingSplitsWeightPerTask) {
                        // Mark node for adjustment, since its queue is full and we still have splits to assign.
                        queueSizeAdjuster.scheduleAdjustmentForNode(node.getNodeIdentifier());
                    }
                }
            }
            if (chosenNode != null) {
                assignment.put(chosenNode, split);
                assignmentStats.addAssignedSplit(chosenNode, split.getSplitWeight());
            }
            else {
                candidateNodes.forEach(schedulableNodes::remove);
                if (split.isRemotelyAccessible()) {
                    splitWaitingForAnyNode = true;
                }
                // The exact node set won't matter if a split is already waiting for any node
                else if (!splitWaitingForAnyNode) {
                    blockedExactNodes.addAll(candidateNodes);
                }

                if (splitWaitingForAnyNode && schedulableNodes.isEmpty()) {
                    // All schedulable nodes are full, no need to check whether more splits can be assigned
                    break;
                }
            }
        }

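        // Build the future used to resume scheduling: if some split was waiting for any node, wait for queue
        // space on any existing task; otherwise wait only for the exact nodes that were blocked.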
        ListenableFuture<Void> blocked;
        if (splitWaitingForAnyNode) {
            blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(minPendingSplitsWeightPerTask));
        }
        else {
            blocked = toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, calculateLowWatermark(minPendingSplitsWeightPerTask));
        }

        if (splitsToBeRedistributed) {
            equateDistribution(assignment, assignmentStats, nodeMap, includeCoordinator);
        }
        return new SplitPlacementResult(blocked, assignment);
    }

    @Override
    public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks, BucketNodeMap bucketNodeMap)
    {
        // TODO: Implement split assignment adjustment based on how quickly a node is able to process splits. More information: https://github.com/trinodb/trino/pull/15168
        return selectDistributionNodes(nodeMap.get().get(), nodeTaskMap, maxSplitsWeightPerNode, minPendingSplitsWeightPerTask, maxUnacknowledgedSplitsPerTask, splits, existingTasks, bucketNodeMap);
    }

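    /**
     * Picks a node from the candidates that are below both the per-node weight limit and the per-task
     * unacknowledged-split limit. Under the STAGE policy the node with the smallest queued weight for this
     * stage wins; under the NODE policy the node with the smallest total assigned weight wins.
     * Returns null when no candidate has capacity.
     */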
    @Nullable
    private InternalNode chooseNodeForSplit(NodeAssignmentStats assignmentStats, List<InternalNode> candidateNodes)
    {
        InternalNode chosenNode = null;
        long minWeight = Long.MAX_VALUE;

        List<InternalNode> freeNodes = getFreeNodesForStage(assignmentStats, candidateNodes);
        switch (splitsBalancingPolicy) {
            case STAGE:
                for (InternalNode node : freeNodes) {
                    long queuedWeight = assignmentStats.getQueuedSplitsWeightForStage(node);
                    if (queuedWeight <= minWeight) {
                        chosenNode = node;
                        minWeight = queuedWeight;
                    }
                }
                break;
            case NODE:
                for (InternalNode node : freeNodes) {
                    long totalSplitsWeight = assignmentStats.getTotalSplitsWeight(node);
                    if (totalSplitsWeight <= minWeight) {
                        chosenNode = node;
                        minWeight = totalSplitsWeight;
                    }
                }
                break;
            default:
                throw new UnsupportedOperationException("Unsupported split balancing policy " + splitsBalancingPolicy);
        }

        return chosenNode;
    }

    private List<InternalNode> getFreeNodesForStage(NodeAssignmentStats assignmentStats, List<InternalNode> nodes)
    {
        ImmutableList.Builder<InternalNode> freeNodes = ImmutableList.builder();
        for (InternalNode node : nodes) {
            if (assignmentStats.getTotalSplitsWeight(node) < maxSplitsWeightPerNode && assignmentStats.getUnacknowledgedSplitCountForStage(node) < maxUnacknowledgedSplitsPerTask) {
                freeNodes.add(node);
            }
        }
        return freeNodes.build();
    }

    /**
     * The method tries to make the distribution of splits more uniform. All nodes are arranged into a maxHeap and a minHeap
     * based on the total weight of the splits that are assigned to them. Splits are redistributed, one at a time, from a maxNode to a
     * minNode until we have as uniform a distribution as possible.
     *
     * @param assignment the node-splits multimap after the first and the second stage
     * @param assignmentStats required to obtain info regarding splits assigned to a node outside the current batch of assignment
     * @param nodeMap to get a list of all nodes to which splits can be assigned
     */
    private void equateDistribution(Multimap<InternalNode, Split> assignment, NodeAssignmentStats assignmentStats, NodeMap nodeMap, boolean includeCoordinator)
    {
        if (assignment.isEmpty()) {
            return;
        }

        Collection<InternalNode> allNodes = nodeMap.getNodesByHostAndPort().values().stream()
                .filter(node -> includeCoordinator || !nodeMap.getCoordinatorNodeIds().contains(node.getNodeIdentifier()))
                .collect(toImmutableList());

        if (allNodes.size() < 2) {
            return;
        }

        IndexedPriorityQueue<InternalNode> maxNodes = new IndexedPriorityQueue<>();
        for (InternalNode node : assignment.keySet()) {
            maxNodes.addOrUpdate(node, assignmentStats.getTotalSplitsWeight(node));
        }

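        // Store the complement of the weight (Long.MAX_VALUE - weight) so that polling this
        // max-oriented priority queue yields the least-loaded node first.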
        IndexedPriorityQueue<InternalNode> minNodes = new IndexedPriorityQueue<>();
        for (InternalNode node : allNodes) {
            minNodes.addOrUpdate(node, Long.MAX_VALUE - assignmentStats.getTotalSplitsWeight(node));
        }

        while (true) {
            if (maxNodes.isEmpty()) {
                return;
            }

            // fetch min and max node
            InternalNode maxNode = maxNodes.poll();
            InternalNode minNode = minNodes.poll();

            // Allow some degree of non-uniformity when assigning splits to nodes. Usually data distribution
            // among nodes in a cluster won't be fully uniform (e.g. because a hash function with a non-uniform
            // distribution, such as consistent hashing, is used). In such cases it makes sense to assign splits
            // to nodes with data because of potential savings in network throughput and CPU time.
            // The allowed difference of 5 between the nodes with the maximum and minimum splits is a tradeoff
            // between the ratio of misassigned splits and assignment uniformity. Using larger numbers doesn't
            // reduce the number of misassigned splits greatly (in absolute values).
            if (assignmentStats.getTotalSplitsWeight(maxNode) - assignmentStats.getTotalSplitsWeight(minNode) <= SplitWeight.rawValueForStandardSplitCount(5)) {
                return;
            }

            // move split from max to min
            Split redistributed = redistributeSplit(assignment, maxNode, minNode, nodeMap.getNodesByHost());
            assignmentStats.removeAssignedSplit(maxNode, redistributed.getSplitWeight());
            assignmentStats.addAssignedSplit(minNode, redistributed.getSplitWeight());

            // add max back into maxNodes only if it still has assignments
            if (assignment.containsKey(maxNode)) {
                maxNodes.addOrUpdate(maxNode, assignmentStats.getTotalSplitsWeight(maxNode));
            }

            // Add or update both the Priority Queues with the updated node priorities
            maxNodes.addOrUpdate(minNode, assignmentStats.getTotalSplitsWeight(minNode));
            minNodes.addOrUpdate(minNode, Long.MAX_VALUE - assignmentStats.getTotalSplitsWeight(minNode));
            minNodes.addOrUpdate(maxNode, Long.MAX_VALUE - assignmentStats.getTotalSplitsWeight(maxNode));
        }
    }

    /**
     * The method selects and removes a split from the fromNode and assigns it to the toNode. There is an attempt to
     * redistribute a Non-local split if possible. This case is possible when there are multiple queries running
     * simultaneously. If a Non-local split cannot be found in the maxNode, any split is selected randomly and reassigned.
     */
    @VisibleForTesting
    public static Split redistributeSplit(Multimap<InternalNode, Split> assignment, InternalNode fromNode, InternalNode toNode, SetMultimap<InetAddress, InternalNode> nodesByHost)
    {
        Iterator<Split> splitIterator = assignment.get(fromNode).iterator();
        Split splitToBeRedistributed = null;
        while (splitIterator.hasNext()) {
            Split split = splitIterator.next();
            // Try to select non-local split for redistribution
            if (!split.getAddresses().isEmpty() && !isSplitLocal(split.getAddresses(), fromNode.getHostAndPort(), nodesByHost)) {
                splitToBeRedistributed = split;
                break;
            }
        }
        // Select any split if maxNode has no non-local splits in the current batch of assignment
        if (splitToBeRedistributed == null) {
            splitIterator = assignment.get(fromNode).iterator();
            splitToBeRedistributed = splitIterator.next();
        }
        splitIterator.remove();
        assignment.put(toNode, splitToBeRedistributed);
        return splitToBeRedistributed;
    }

    /**
     * Helper method to determine if a split is local to a node irrespective of whether splitAddresses contain port information or not
     */
    private static boolean isSplitLocal(List<HostAddress> splitAddresses, HostAddress nodeAddress, SetMultimap<InetAddress, InternalNode> nodesByHost)
    {
        for (HostAddress address : splitAddresses) {
            if (nodeAddress.equals(address)) {
                return true;
            }
            InetAddress inetAddress;
            try {
                inetAddress = address.toInetAddress();
            }
            catch (UnknownHostException e) {
                continue;
            }
            if (!address.hasPort()) {
                Set<InternalNode> localNodes = nodesByHost.get(inetAddress);
                return localNodes.stream()
                        .anyMatch(node -> node.getHostAndPort().equals(nodeAddress));
            }
        }
        return false;
    }

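    /**
     * Tracks a per-node adjusted limit on the pending splits weight of a task. When a node whose queue was
     * full during the previous scheduling pass has since drained its queue, its limit is doubled (capped at
     * maxAdjustedPendingSplitsWeightPerTask); otherwise, once SCALE_DOWN_INTERVAL has elapsed since the last
     * adjustment, the limit is scaled back down toward minPendingSplitsWeightPerTask.
     */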
    static class QueueSizeAdjuster
    {
        private static final long SCALE_DOWN_INTERVAL = SECONDS.toNanos(1);
        private final Ticker ticker;
        private final Map<String, TaskAdjustmentInfo> taskAdjustmentInfos = new HashMap<>();
        private final Set<String> previousScheduleFullTasks = new HashSet<>();
        private final long minPendingSplitsWeightPerTask;
        private final long maxAdjustedPendingSplitsWeightPerTask;

        private QueueSizeAdjuster(long minPendingSplitsWeightPerTask, long maxAdjustedPendingSplitsWeightPerTask)
        {
            this(minPendingSplitsWeightPerTask, maxAdjustedPendingSplitsWeightPerTask, Ticker.systemTicker());
        }

        @VisibleForTesting
        QueueSizeAdjuster(long minPendingSplitsWeightPerTask, long maxAdjustedPendingSplitsWeightPerTask, Ticker ticker)
        {
            this.ticker = requireNonNull(ticker, "ticker is null");
            this.maxAdjustedPendingSplitsWeightPerTask = maxAdjustedPendingSplitsWeightPerTask;
            this.minPendingSplitsWeightPerTask = minPendingSplitsWeightPerTask;
        }

        public void update(List<RemoteTask> existingTasks, NodeAssignmentStats nodeAssignmentStats)
        {
            if (!isEnabled()) {
                return;
            }
            for (RemoteTask task : existingTasks) {
                String nodeId = task.getNodeId();
                TaskAdjustmentInfo nodeTaskAdjustmentInfo = taskAdjustmentInfos.computeIfAbsent(nodeId, key -> new TaskAdjustmentInfo(minPendingSplitsWeightPerTask));
                Optional<Long> lastAdjustmentTime = nodeTaskAdjustmentInfo.getLastAdjustmentNanos();

                if (previousScheduleFullTasks.contains(nodeId) && nodeAssignmentStats.getQueuedSplitsWeightForStage(nodeId) == 0) {
                    // even if the adjustment is already maxed out, we still want to advance lastAdjustmentTime
                    nodeTaskAdjustmentInfo.setAdjustedMaxSplitsWeightPerTask(Math.min(maxAdjustedPendingSplitsWeightPerTask, nodeTaskAdjustmentInfo.getAdjustedMaxSplitsWeightPerTask() * 2));
                }
                else if (lastAdjustmentTime.isPresent() && (ticker.read() - lastAdjustmentTime.get()) >= SCALE_DOWN_INTERVAL) {
                    nodeTaskAdjustmentInfo.setAdjustedMaxSplitsWeightPerTask((long) Math.max(minPendingSplitsWeightPerTask, nodeTaskAdjustmentInfo.getAdjustedMaxSplitsWeightPerTask() / 1.5));
                }
            }
            previousScheduleFullTasks.clear();
        }

        public long getAdjustedMaxPendingSplitsWeightPerTask(String nodeId)
        {
            TaskAdjustmentInfo nodeTaskAdjustmentInfo = taskAdjustmentInfos.get(nodeId);

            return nodeTaskAdjustmentInfo != null ? nodeTaskAdjustmentInfo.getAdjustedMaxSplitsWeightPerTask() : minPendingSplitsWeightPerTask;
        }

        public void scheduleAdjustmentForNode(String nodeIdentifier)
        {
            if (!isEnabled()) {
                return;
            }

            previousScheduleFullTasks.add(nodeIdentifier);
        }

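        // Adjustment is disabled when there is no headroom, i.e. the configured
        // maximum adjusted limit equals the minimum pending splits weight per task.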
        private boolean isEnabled()
        {
            return maxAdjustedPendingSplitsWeightPerTask != minPendingSplitsWeightPerTask;
        }

        private class TaskAdjustmentInfo
        {
            private long adjustedMaxSplitsWeightPerTask;
            private Optional<Long> lastAdjustmentNanos;

            public TaskAdjustmentInfo(long adjustedMaxSplitsWeightPerTask)
            {
                this.adjustedMaxSplitsWeightPerTask = adjustedMaxSplitsWeightPerTask;
                this.lastAdjustmentNanos = Optional.empty();
            }

            public long getAdjustedMaxSplitsWeightPerTask()
            {
                return adjustedMaxSplitsWeightPerTask;
            }

            public void setAdjustedMaxSplitsWeightPerTask(long adjustedMaxSplitsWeightPerTask)
            {
                this.adjustedMaxSplitsWeightPerTask = adjustedMaxSplitsWeightPerTask;
                this.lastAdjustmentNanos = Optional.of(ticker.read());
            }

            public Optional<Long> getLastAdjustmentNanos()
            {
                return lastAdjustmentNanos;
            }
        }
    }
}