io.trino.execution.scheduler.ScaledWriterScheduler Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of trino-main Show documentation
Trino - Engine
There is a newer version: 465
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.execution.scheduler;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.util.concurrent.SettableFuture;
import io.airlift.units.DataSize;
import io.trino.execution.RemoteTask;
import io.trino.execution.TaskStatus;
import io.trino.execution.buffer.OutputBufferStatus;
import io.trino.metadata.InternalNode;

import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Supplier;

import static io.trino.execution.scheduler.ScheduleResult.BlockedReason.WRITER_SCALING;
import static io.trino.spi.StandardErrorCode.NO_NODES_AVAILABLE;
import static io.trino.util.Failures.checkCondition;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.MILLISECONDS;

public class ScaledWriterScheduler
        implements StageScheduler
{
    private static final double BUFFER_FULL_THRESHOLD = 0.5;

    private final StageExecution stage;
    private final Supplier> sourceTasksProvider;
    private final Supplier> writerTasksProvider;
    private final NodeSelector nodeSelector;
    private final ScheduledExecutorService executor;
    private final long writerScalingMinDataProcessed;
    private final Set scheduledNodes = new HashSet<>();
    private final AtomicBoolean done = new AtomicBoolean();
    private final int maxWriterNodeCount;
    private volatile SettableFuture future = SettableFuture.create();

    public ScaledWriterScheduler(
            StageExecution stage,
            Supplier> sourceTasksProvider,
            Supplier> writerTasksProvider,
            NodeSelector nodeSelector,
            ScheduledExecutorService executor,
            DataSize writerScalingMinDataProcessed,
            int maxWriterNodeCount)
    {
        this.stage = requireNonNull(stage, "stage is null");
        this.sourceTasksProvider = requireNonNull(sourceTasksProvider, "sourceTasksProvider is null");
        this.writerTasksProvider = requireNonNull(writerTasksProvider, "writerTasksProvider is null");
        this.nodeSelector = requireNonNull(nodeSelector, "nodeSelector is null");
        this.executor = requireNonNull(executor, "executor is null");
        this.writerScalingMinDataProcessed = writerScalingMinDataProcessed.toBytes();
        this.maxWriterNodeCount = maxWriterNodeCount;
    }

    public void finish()
    {
        done.set(true);
        future.set(null);
    }

    @Override
    public ScheduleResult schedule()
    {
        List writers = scheduleTasks(getNewTaskCount());

        future.set(null);
        future = SettableFuture.create();
        executor.schedule(() -> future.set(null), 200, MILLISECONDS);

        return new ScheduleResult(done.get(), writers, future, WRITER_SCALING, 0);
    }

    private int getNewTaskCount()
    {
        if (scheduledNodes.isEmpty()) {
            return 1;
        }

        Collection writerTasks = writerTasksProvider.get();
        // Do not scale tasks until all existing writer tasks are initialized with maxWriterCount
        if (writerTasks.size() != scheduledNodes.size()
                || writerTasks.stream().map(TaskStatus::getMaxWriterCount).anyMatch(Optional::isEmpty)) {
            return 0;
        }

        // When there is a big data skewness, there could be a bottleneck due to the skewed workers even if most of the workers are not over-utilized.
        // Check both, weighted output buffer over-utilization rate and average output buffer over-utilization rate, in case when there are many over-utilized small tasks
        // due to fewer not-over-utilized big skewed tasks.
        if (isSourceTasksBufferFull() && isWriteThroughputSufficient() && scheduledNodes.size() < maxWriterNodeCount) {
            return 1;
        }

        return 0;
    }

    private boolean isSourceTasksBufferFull()
    {
        return isAverageBufferFull() || isWeightedBufferFull();
    }

    private boolean isWriteThroughputSufficient()
    {
        Collection writerTasks = writerTasksProvider.get();
        long writerInputBytes = writerTasks.stream()
                .map(TaskStatus::getWriterInputDataSize)
                .mapToLong(DataSize::toBytes)
                .sum();

        long minWriterInputBytesToScaleUp = writerTasks.stream()
                .map(TaskStatus::getMaxWriterCount)
                .map(Optional::get)
                .mapToLong(writerCount -> writerScalingMinDataProcessed * writerCount)
                .sum();
        return writerInputBytes >= minWriterInputBytesToScaleUp;
    }

    private boolean isWeightedBufferFull()
    {
        double totalOutputSize = 0.0;
        double overutilizedOutputSize = 0.0;
        for (TaskStatus task : sourceTasksProvider.get()) {
            if (!task.getState().isTerminatingOrDone()) {
                long outputDataSize = task.getOutputDataSize().toBytes();
                totalOutputSize += outputDataSize;
                if (task.getOutputBufferStatus().isOverutilized()) {
                    overutilizedOutputSize += outputDataSize;
                }
            }
        }

        return totalOutputSize > 0 && overutilizedOutputSize / totalOutputSize >= BUFFER_FULL_THRESHOLD;
    }

    private boolean isAverageBufferFull()
    {
        return sourceTasksProvider.get().stream()
                .filter(task -> !task.getState().isTerminatingOrDone())
                .map(TaskStatus::getOutputBufferStatus)
                .map(OutputBufferStatus::isOverutilized)
                .mapToDouble(full -> full ? 1.0 : 0.0)
                .average().orElse(0.0) >= BUFFER_FULL_THRESHOLD;
    }

    private List scheduleTasks(int count)
    {
        if (count == 0) {
            return ImmutableList.of();
        }

        List nodes = nodeSelector.selectRandomNodes(count, scheduledNodes);

        checkCondition(!scheduledNodes.isEmpty() || !nodes.isEmpty(), NO_NODES_AVAILABLE, "No nodes available to run query");

        ImmutableList.Builder tasks = ImmutableList.builder();
        for (InternalNode node : nodes) {
            Optional remoteTask = stage.scheduleTask(node, scheduledNodes.size(), ImmutableMultimap.of());
            remoteTask.ifPresent(task -> {
                tasks.add(task);
                scheduledNodes.add(node);
            });
        }

        return tasks.build();
    }
}