All downloads are free. Search and download functionality uses the official Maven repository.

io.trino.execution.executor.timesharing.MultilevelSplitQueue Maven / Gradle / Ivy

There is a newer version: 465
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.execution.executor.timesharing;

import com.google.common.annotations.VisibleForTesting;
import com.google.errorprone.annotations.ThreadSafe;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import com.google.inject.Inject;
import io.airlift.stats.CounterStat;
import io.trino.execution.TaskManagerConfig;
import org.weakref.jmx.Managed;
import org.weakref.jmx.Nested;

import java.util.Collection;
import java.util.PriorityQueue;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.math.DoubleMath.roundToLong;
import static java.math.RoundingMode.HALF_UP;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;

@ThreadSafe
public class MultilevelSplitQueue
{
    static final int[] LEVEL_THRESHOLD_SECONDS = {0, 1, 10, 60, 300};
    static final long LEVEL_CONTRIBUTION_CAP = SECONDS.toNanos(30);

    @GuardedBy("lock")
    private final PriorityQueue[] levelWaitingSplits;

    private final AtomicLong[] levelScheduledTime;

    private final AtomicLong[] levelMinPriority;
    private final CounterStat[] selectedLevelCounters;

    private final ReentrantLock lock = new ReentrantLock();
    private final Condition notEmpty = lock.newCondition();

    private final double levelTimeMultiplier;

    @Inject
    public MultilevelSplitQueue(TaskManagerConfig taskManagerConfig)
    {
        this(taskManagerConfig.getLevelTimeMultiplier().doubleValue());
    }

    public MultilevelSplitQueue(double levelTimeMultiplier)
    {
        this.levelScheduledTime = new AtomicLong[LEVEL_THRESHOLD_SECONDS.length];
        this.levelMinPriority = new AtomicLong[LEVEL_THRESHOLD_SECONDS.length];
        this.levelWaitingSplits = new PriorityQueue[LEVEL_THRESHOLD_SECONDS.length];
        this.selectedLevelCounters = new CounterStat[LEVEL_THRESHOLD_SECONDS.length];

        for (int level = 0; level < LEVEL_THRESHOLD_SECONDS.length; level++) {
            levelScheduledTime[level] = new AtomicLong();
            levelMinPriority[level] = new AtomicLong(-1);
            levelWaitingSplits[level] = new PriorityQueue<>();
            selectedLevelCounters[level] = new CounterStat();
        }

        this.levelTimeMultiplier = levelTimeMultiplier;
    }

    private void addLevelTime(int level, long nanos)
    {
        levelScheduledTime[level].addAndGet(nanos);
    }

    /**
     * During periods of time when a level has no waiting splits, it will not accumulate
     * scheduled time and will fall behind relative to other levels.
     * 

* This can cause temporary starvation for other levels when splits do reach the * previously-empty level. *

* To prevent this we set the scheduled time for levels which were empty to the expected * scheduled time. */ public void offer(PrioritizedSplitRunner split) { checkArgument(split != null, "split is null"); split.setReady(); int level = split.getPriority().getLevel(); lock.lock(); try { if (levelWaitingSplits[level].isEmpty()) { // Accesses to levelScheduledTime are not synchronized, so we have a data race // here - our level time math will be off. However, the staleness is bounded by // the fact that only running splits that complete during this computation // can update the level time. Therefore, this is benign. long level0Time = getLevel0TargetTime(); long levelExpectedTime = (long) (level0Time / Math.pow(levelTimeMultiplier, level)); long delta = levelExpectedTime - levelScheduledTime[level].get(); levelScheduledTime[level].addAndGet(delta); } levelWaitingSplits[level].offer(split); notEmpty.signal(); } finally { lock.unlock(); } } public PrioritizedSplitRunner take() throws InterruptedException { while (true) { lock.lockInterruptibly(); try { PrioritizedSplitRunner result; while ((result = pollSplit()) == null) { notEmpty.await(); } if (result.updateLevelPriority()) { offer(result); continue; } int selectedLevel = result.getPriority().getLevel(); levelMinPriority[selectedLevel].set(result.getPriority().getLevelPriority()); selectedLevelCounters[selectedLevel].update(1); return result; } finally { lock.unlock(); } } } /** * Trino attempts to give each level a target amount of scheduled time, which is configurable * using levelTimeMultiplier. *

* This function selects the level that has the lowest ratio of actual to the target time * with the objective of minimizing deviation from the target scheduled time. From this level, * we pick the split with the lowest priority. */ @GuardedBy("lock") private PrioritizedSplitRunner pollSplit() { long targetScheduledTime = getLevel0TargetTime(); double worstRatio = 1; int selectedLevel = -1; for (int level = 0; level < LEVEL_THRESHOLD_SECONDS.length; level++) { if (!levelWaitingSplits[level].isEmpty()) { long levelTime = levelScheduledTime[level].get(); double ratio = levelTime == 0 ? 0 : targetScheduledTime / (1.0 * levelTime); if (selectedLevel == -1 || ratio > worstRatio) { worstRatio = ratio; selectedLevel = level; } } targetScheduledTime = roundToLong(targetScheduledTime / levelTimeMultiplier, HALF_UP); } if (selectedLevel == -1) { return null; } PrioritizedSplitRunner result = levelWaitingSplits[selectedLevel].poll(); checkState(result != null, "pollSplit cannot return null"); return result; } @GuardedBy("lock") private long getLevel0TargetTime() { long level0TargetTime = levelScheduledTime[0].get(); double currentMultiplier = levelTimeMultiplier; for (int level = 0; level < LEVEL_THRESHOLD_SECONDS.length; level++) { currentMultiplier /= levelTimeMultiplier; long levelTime = levelScheduledTime[level].get(); level0TargetTime = Math.max(level0TargetTime, (long) (levelTime / currentMultiplier)); } return level0TargetTime; } /** * Trino 'charges' the quanta run time to the task and the level it belongs to in * an effort to maintain the target thread utilization ratios between levels and to * maintain fairness within a level. *

* Consider an example split where a read hung for several minutes. This is either a bug * or a failing dependency. In either case we do not want to charge the task too much, * and we especially do not want to charge the level too much - i.e. cause other queries * in this level to starve. * * @return the new priority for the task */ public Priority updatePriority(Priority oldPriority, long quantaNanos, long scheduledNanos) { int oldLevel = oldPriority.getLevel(); int newLevel = computeLevel(scheduledNanos); long levelContribution = Math.min(quantaNanos, LEVEL_CONTRIBUTION_CAP); if (oldLevel == newLevel) { addLevelTime(oldLevel, levelContribution); return new Priority(oldLevel, oldPriority.getLevelPriority() + quantaNanos); } long remainingLevelContribution = levelContribution; long remainingTaskTime = quantaNanos; // a task normally slowly accrues scheduled time in a level and then moves to the next, but // if the split had a particularly long quanta, accrue time to each level as if it had run // in that level up to the level limit. 
for (int currentLevel = oldLevel; currentLevel < newLevel; currentLevel++) { long timeAccruedToLevel = Math.min(SECONDS.toNanos(LEVEL_THRESHOLD_SECONDS[currentLevel + 1] - LEVEL_THRESHOLD_SECONDS[currentLevel]), remainingLevelContribution); addLevelTime(currentLevel, timeAccruedToLevel); remainingLevelContribution -= timeAccruedToLevel; remainingTaskTime -= timeAccruedToLevel; } addLevelTime(newLevel, remainingLevelContribution); long newLevelMinPriority = getLevelMinPriority(newLevel, scheduledNanos); return new Priority(newLevel, newLevelMinPriority + remainingTaskTime); } public void remove(PrioritizedSplitRunner split) { checkArgument(split != null, "split is null"); lock.lock(); try { for (PriorityQueue level : levelWaitingSplits) { level.remove(split); } } finally { lock.unlock(); } } public void removeAll(Collection splits) { lock.lock(); try { for (PriorityQueue level : levelWaitingSplits) { level.removeAll(splits); } } finally { lock.unlock(); } } public long getLevelMinPriority(int level, long taskThreadUsageNanos) { levelMinPriority[level].compareAndSet(-1, taskThreadUsageNanos); return levelMinPriority[level].get(); } public int size() { lock.lock(); try { int total = 0; for (PriorityQueue level : levelWaitingSplits) { total += level.size(); } return total; } finally { lock.unlock(); } } public static int computeLevel(long threadUsageNanos) { long seconds = NANOSECONDS.toSeconds(threadUsageNanos); for (int level = 0; level < (LEVEL_THRESHOLD_SECONDS.length - 1); level++) { if (seconds < LEVEL_THRESHOLD_SECONDS[level + 1]) { return level; } } return LEVEL_THRESHOLD_SECONDS.length - 1; } @VisibleForTesting long getLevelScheduledTime(int level) { return levelScheduledTime[level].longValue(); } @Managed public long getLevel0Time() { return getLevelScheduledTime(0); } @Managed public long getLevel1Time() { return getLevelScheduledTime(1); } @Managed public long getLevel2Time() { return getLevelScheduledTime(2); } @Managed public long getLevel3Time() { return 
getLevelScheduledTime(3); } @Managed public long getLevel4Time() { return getLevelScheduledTime(4); } @Managed @Nested public CounterStat getSelectedCountLevel0() { return selectedLevelCounters[0]; } @Managed @Nested public CounterStat getSelectedCountLevel1() { return selectedLevelCounters[1]; } @Managed @Nested public CounterStat getSelectedCountLevel2() { return selectedLevelCounters[2]; } @Managed @Nested public CounterStat getSelectedCountLevel3() { return selectedLevelCounters[3]; } @Managed @Nested public CounterStat getSelectedCountLevel4() { return selectedLevelCounters[4]; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy