All downloads are free. The search and download functionality uses the official Maven repository.

io.trino.execution.scheduler.faulttolerant.ArbitraryDistributionSplitAssigner Maven / Gradle / Ivy

There is a newer version: 468
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.execution.scheduler.faulttolerant;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ListMultimap;
import io.trino.exchange.SpoolingExchangeInput;
import io.trino.metadata.Split;
import io.trino.spi.HostAddress;
import io.trino.spi.SplitWeight;
import io.trino.spi.connector.CatalogHandle;
import io.trino.spi.exchange.ExchangeSourceHandle;
import io.trino.split.RemoteSplit;
import io.trino.sql.planner.plan.PlanNodeId;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Verify.verify;
import static io.trino.operator.ExchangeOperator.REMOTE_CATALOG_HANDLE;
import static java.lang.Math.ceil;
import static java.lang.Math.min;
import static java.lang.Math.round;
import static java.util.Objects.requireNonNull;

class ArbitraryDistributionSplitAssigner
        implements SplitAssigner
{
    private final Optional catalogRequirement;
    private final Set partitionedSources;
    private final Set replicatedSources;
    private final Set allSources;
    private final int adaptiveGrowthPeriod;
    private final double adaptiveGrowthFactor;
    private final long minTargetPartitionSizeInBytes;
    private final long maxTargetPartitionSizeInBytes;
    private final long standardSplitSizeInBytes;
    private final int maxTaskSplitCount;

    private int nextPartitionId;
    private int adaptiveCounter;
    private long targetPartitionSizeInBytes;
    private long roundedTargetPartitionSizeInBytes;
    private final List allAssignments = new ArrayList<>();
    private final Map, PartitionAssignment> openAssignments = new HashMap<>();

    private final Set completedSources = new HashSet<>();

    private final ListMultimap replicatedSplits = ArrayListMultimap.create();
    private boolean noMoreReplicatedSplits;

    ArbitraryDistributionSplitAssigner(
            Optional catalogRequirement,
            Set partitionedSources,
            Set replicatedSources,
            int adaptiveGrowthPeriod,
            double adaptiveGrowthFactor,
            long minTargetPartitionSizeInBytes,
            long maxTargetPartitionSizeInBytes,
            long standardSplitSizeInBytes,
            int maxTaskSplitCount)
    {
        this.catalogRequirement = requireNonNull(catalogRequirement, "catalogRequirement is null");
        this.partitionedSources = ImmutableSet.copyOf(requireNonNull(partitionedSources, "partitionedSources is null"));
        this.replicatedSources = ImmutableSet.copyOf(requireNonNull(replicatedSources, "replicatedSources is null"));
        allSources = ImmutableSet.builder()
                .addAll(partitionedSources)
                .addAll(replicatedSources)
                .build();
        this.adaptiveGrowthPeriod = adaptiveGrowthPeriod;
        this.adaptiveGrowthFactor = adaptiveGrowthFactor;
        this.minTargetPartitionSizeInBytes = minTargetPartitionSizeInBytes;
        this.maxTargetPartitionSizeInBytes = maxTargetPartitionSizeInBytes;
        this.standardSplitSizeInBytes = standardSplitSizeInBytes;
        this.maxTaskSplitCount = maxTaskSplitCount;

        this.targetPartitionSizeInBytes = minTargetPartitionSizeInBytes;
        this.roundedTargetPartitionSizeInBytes = minTargetPartitionSizeInBytes;
    }

    @Override
    public AssignmentResult assign(PlanNodeId planNodeId, ListMultimap splits, boolean noMoreSplits)
    {
        for (Split split : splits.values()) {
            Optional splitCatalogRequirement = Optional.of(split.getCatalogHandle())
                    .filter(catalog -> !catalog.getType().isInternal() && !catalog.equals(REMOTE_CATALOG_HANDLE));
            checkArgument(
                    catalogRequirement.isEmpty() || catalogRequirement.equals(splitCatalogRequirement),
                    "unexpected split catalog requirement: %s",
                    splitCatalogRequirement);
        }
        if (replicatedSources.contains(planNodeId)) {
            return assignReplicatedSplits(planNodeId, ImmutableList.copyOf(splits.values()), noMoreSplits);
        }
        return assignPartitionedSplits(planNodeId, ImmutableList.copyOf(splits.values()), noMoreSplits);
    }

    @Override
    public AssignmentResult finish()
    {
        checkState(!allAssignments.isEmpty(), "allAssignments is not expected to be empty");
        return AssignmentResult.builder().build();
    }

    private AssignmentResult assignReplicatedSplits(PlanNodeId planNodeId, List splits, boolean noMoreSplits)
    {
        AssignmentResult.Builder assignment = AssignmentResult.builder();
        replicatedSplits.putAll(planNodeId, splits);
        for (PartitionAssignment partitionAssignment : allAssignments) {
            assignment.updatePartition(new PartitionUpdate(
                    partitionAssignment.getPartitionId(),
                    planNodeId,
                    false,
                    singleSourcePartition(SINGLE_SOURCE_PARTITION_ID, splits),
                    noMoreSplits));
        }
        if (noMoreSplits) {
            completedSources.add(planNodeId);
            if (completedSources.containsAll(replicatedSources)) {
                noMoreReplicatedSplits = true;
            }
        }
        if (noMoreReplicatedSplits) {
            for (PartitionAssignment partitionAssignment : allAssignments) {
                if (partitionAssignment.isFull()) {
                    assignment.sealPartition(partitionAssignment.getPartitionId());
                }
            }
        }
        if (completedSources.containsAll(allSources)) {
            if (allAssignments.isEmpty()) {
                // at least a single partition is expected to be created
                allAssignments.add(new PartitionAssignment(0));
                assignment.addPartition(new Partition(0, new NodeRequirements(catalogRequirement, ImmutableSet.of())));
                for (PlanNodeId replicatedSourceId : replicatedSources) {
                    assignment.updatePartition(new PartitionUpdate(
                            0,
                            replicatedSourceId,
                            false,
                            singleSourcePartition(SINGLE_SOURCE_PARTITION_ID, replicatedSplits.get(replicatedSourceId)),
                            true));
                }
                for (PlanNodeId partitionedSourceId : partitionedSources) {
                    assignment.updatePartition(new PartitionUpdate(
                            0,
                            partitionedSourceId,
                            false,
                            ImmutableListMultimap.of(),
                            true));
                }
                assignment.sealPartition(0);
            }
            else {
                for (PartitionAssignment partitionAssignment : allAssignments) {
                    // set noMoreSplits for partitioned sources
                    if (!partitionAssignment.isFull()) {
                        for (PlanNodeId partitionedSourceNodeId : partitionedSources) {
                            assignment.updatePartition(new PartitionUpdate(
                                    partitionAssignment.getPartitionId(),
                                    partitionedSourceNodeId,
                                    false,
                                    singleSourcePartition(0, ImmutableList.of()),
                                    true));
                        }
                        // seal partition
                        assignment.sealPartition(partitionAssignment.getPartitionId());
                    }
                }
            }
            replicatedSplits.clear();
            // no more partitions will be created
            assignment.setNoMorePartitions();
        }
        return assignment.build();
    }

    private ListMultimap singleSourcePartition(int sourcePartitionId, List splits)
    {
        ImmutableListMultimap.Builder builder = ImmutableListMultimap.builder();
        builder.putAll(0, splits);
        return builder.build();
    }

    private AssignmentResult assignPartitionedSplits(PlanNodeId planNodeId, List splits, boolean noMoreSplits)
    {
        AssignmentResult.Builder assignment = AssignmentResult.builder();

        for (Split split : splits) {
            Optional hostRequirement = getHostRequirement(split);
            PartitionAssignment partitionAssignment = openAssignments.get(hostRequirement);
            long splitSizeInBytes = getSplitSizeInBytes(split);
            if (partitionAssignment != null && ((partitionAssignment.getAssignedDataSizeInBytes() + splitSizeInBytes > roundedTargetPartitionSizeInBytes)
                    || (partitionAssignment.getAssignedSplitCount() + 1 > maxTaskSplitCount))) {
                partitionAssignment.setFull(true);
                for (PlanNodeId partitionedSourceNodeId : partitionedSources) {
                    assignment.updatePartition(new PartitionUpdate(
                            partitionAssignment.getPartitionId(),
                            partitionedSourceNodeId,
                            false,
                            ImmutableListMultimap.of(),
                            true));
                }
                if (completedSources.containsAll(replicatedSources)) {
                    assignment.sealPartition(partitionAssignment.getPartitionId());
                }
                partitionAssignment = null;
                openAssignments.remove(hostRequirement);

                adaptiveCounter++;
                if (adaptiveCounter >= adaptiveGrowthPeriod) {
                    targetPartitionSizeInBytes = (long) min(maxTargetPartitionSizeInBytes, ceil(targetPartitionSizeInBytes * adaptiveGrowthFactor));
                    // round to a multiple of minTargetPartitionSizeInBytes so work will be evenly distributed among drivers of a task
                    roundedTargetPartitionSizeInBytes = round(targetPartitionSizeInBytes * 1.0 / minTargetPartitionSizeInBytes) * minTargetPartitionSizeInBytes;
                    verify(roundedTargetPartitionSizeInBytes > 0, "roundedTargetPartitionSizeInBytes %s not positive", roundedTargetPartitionSizeInBytes);
                    adaptiveCounter = 0;
                }
            }
            if (partitionAssignment == null) {
                partitionAssignment = new PartitionAssignment(nextPartitionId++);
                allAssignments.add(partitionAssignment);
                openAssignments.put(hostRequirement, partitionAssignment);
                assignment.addPartition(new Partition(
                        partitionAssignment.getPartitionId(),
                        new NodeRequirements(catalogRequirement, hostRequirement.map(ImmutableSet::of).orElseGet(ImmutableSet::of))));

                for (PlanNodeId replicatedSourceId : replicatedSources) {
                    assignment.updatePartition(new PartitionUpdate(
                            partitionAssignment.getPartitionId(),
                            replicatedSourceId,
                            false,
                            singleSourcePartition(SINGLE_SOURCE_PARTITION_ID, replicatedSplits.get(replicatedSourceId)),
                            completedSources.contains(replicatedSourceId)));
                }
            }
            assignment.updatePartition(new PartitionUpdate(
                    partitionAssignment.getPartitionId(),
                    planNodeId,
                    true,
                    singleSourcePartition(SINGLE_SOURCE_PARTITION_ID, ImmutableList.of(split)),
                    false));
            partitionAssignment.assignSplit(splitSizeInBytes);
        }

        if (noMoreSplits) {
            completedSources.add(planNodeId);
        }

        if (completedSources.containsAll(allSources)) {
            if (allAssignments.isEmpty()) {
                // at least a single partition is expected to be created
                allAssignments.add(new PartitionAssignment(0));
                assignment.addPartition(new Partition(0, new NodeRequirements(catalogRequirement, ImmutableSet.of())));
                for (PlanNodeId replicatedSourceId : replicatedSources) {
                    assignment.updatePartition(new PartitionUpdate(
                            0,
                            replicatedSourceId,
                            false,
                            singleSourcePartition(SINGLE_SOURCE_PARTITION_ID, replicatedSplits.get(replicatedSourceId)),
                            true));
                }
                for (PlanNodeId partitionedSourceId : partitionedSources) {
                    assignment.updatePartition(new PartitionUpdate(
                            0,
                            partitionedSourceId,
                            false,
                            ImmutableListMultimap.of(),
                            true));
                }

                assignment.sealPartition(0);
            }
            else {
                for (PartitionAssignment partitionAssignment : openAssignments.values()) {
                    // set noMoreSplits for partitioned sources
                    for (PlanNodeId partitionedSourceNodeId : partitionedSources) {
                        assignment.updatePartition(new PartitionUpdate(
                                partitionAssignment.getPartitionId(),
                                partitionedSourceNodeId,
                                false,
                                ImmutableListMultimap.of(),
                                true));
                    }
                    // seal partition
                    assignment.sealPartition(partitionAssignment.getPartitionId());
                }
                openAssignments.clear();
            }
            replicatedSplits.clear();
            // no more partitions will be created
            assignment.setNoMorePartitions();
        }

        return assignment.build();
    }

    private Optional getHostRequirement(Split split)
    {
        if (split.getConnectorSplit().isRemotelyAccessible()) {
            return Optional.empty();
        }
        List addresses = split.getAddresses();
        checkArgument(!addresses.isEmpty(), "split is not remotely accessible but the list of hosts is empty: %s", split);
        HostAddress selectedAddress = null;
        long selectedAssignmentDataSize = Long.MAX_VALUE;
        for (HostAddress address : addresses) {
            PartitionAssignment assignment = openAssignments.get(Optional.of(address));
            if (assignment == null) {
                // prioritize unused addresses
                selectedAddress = address;
                break;
            }
            if (assignment.getAssignedDataSizeInBytes() < selectedAssignmentDataSize) {
                // otherwise prioritize the smallest assignment
                selectedAddress = address;
                selectedAssignmentDataSize = assignment.getAssignedDataSizeInBytes();
            }
        }
        verify(selectedAddress != null, "selectedAddress is null");
        return Optional.of(selectedAddress);
    }

    private long getSplitSizeInBytes(Split split)
    {
        if (split.getCatalogHandle().equals(REMOTE_CATALOG_HANDLE)) {
            RemoteSplit remoteSplit = (RemoteSplit) split.getConnectorSplit();
            SpoolingExchangeInput exchangeInput = (SpoolingExchangeInput) remoteSplit.getExchangeInput();
            long size = 0;
            for (ExchangeSourceHandle handle : exchangeInput.getExchangeSourceHandles()) {
                size += handle.getDataSizeInBytes();
            }
            return size;
        }
        return round(((split.getSplitWeight().getRawValue() * 1.0) / SplitWeight.standard().getRawValue()) * standardSplitSizeInBytes);
    }

    private static class PartitionAssignment
    {
        private final int partitionId;
        private long assignedDataSizeInBytes;
        private int assignedSplitCount;
        private boolean full;

        private PartitionAssignment(int partitionId)
        {
            this.partitionId = partitionId;
        }

        public int getPartitionId()
        {
            return partitionId;
        }

        public void assignSplit(long sizeInBytes)
        {
            assignedDataSizeInBytes += sizeInBytes;
            assignedSplitCount++;
        }

        public long getAssignedDataSizeInBytes()
        {
            return assignedDataSizeInBytes;
        }

        public int getAssignedSplitCount()
        {
            return assignedSplitCount;
        }

        public boolean isFull()
        {
            return full;
        }

        public void setFull(boolean full)
        {
            this.full = full;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy