/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.execution.scheduler.faulttolerant;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Suppliers;
import com.google.common.base.VerifyException;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Multimap;
import com.google.common.collect.Table;
import com.google.common.math.Quantiles;
import com.google.common.math.Stats;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import com.google.inject.Inject;
import io.airlift.json.JsonCodec;
import io.airlift.log.Logger;
import io.airlift.units.DataSize;
import io.trino.annotation.NotThreadSafe;
import io.trino.execution.QueryManagerConfig;
import io.trino.execution.StageId;
import io.trino.metadata.Split;
import io.trino.spi.QueryId;
import io.trino.spi.TrinoException;
import io.trino.sql.planner.plan.PlanNodeId;
import org.weakref.jmx.Managed;
import org.weakref.jmx.Nested;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Supplier;
import java.util.stream.Stream;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap;
import static com.google.common.math.Quantiles.percentiles;
import static io.airlift.units.DataSize.succinctBytes;
import static io.trino.spi.StandardErrorCode.EXCEEDED_TASK_DESCRIPTOR_STORAGE_CAPACITY;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
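/**
* Stores {@link TaskDescriptor}s for queries executed with fault-tolerant execution, under a global memory limit.
* When the limit is exceeded, the query holding the most task descriptor memory is failed to reclaim space.
* <p>
* A minimal lifecycle sketch (illustrative only; {@code queryId}, {@code stageId}, {@code partitionId} and
* {@code descriptor} are assumed to be supplied by the scheduler):
* <pre>{@code
* storage.initialize(queryId);                // before scheduling begins
* storage.put(stageId, descriptor);           // store one descriptor per task
* Optional<TaskDescriptor> task = storage.get(stageId, partitionId); // empty once the query is terminated
* storage.remove(stageId, partitionId);       // after the task completes
* storage.destroy(queryId);                   // when the query finishes
* }</pre>
*/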
public class TaskDescriptorStorage
{
private static final Logger log = Logger.get(TaskDescriptorStorage.class);
private final long maxMemoryInBytes;
private final JsonCodec<Split> splitJsonCodec;
private final StorageStats storageStats;
@GuardedBy("this")
private final Map<QueryId, TaskDescriptors> storages = new HashMap<>();
@GuardedBy("this")
private long reservedBytes;
@Inject
public TaskDescriptorStorage(
QueryManagerConfig config,
JsonCodec<Split> splitJsonCodec)
{
this(config.getFaultTolerantExecutionTaskDescriptorStorageMaxMemory(), splitJsonCodec);
}
public TaskDescriptorStorage(DataSize maxMemory, JsonCodec<Split> splitJsonCodec)
{
this.maxMemoryInBytes = maxMemory.toBytes();
this.splitJsonCodec = requireNonNull(splitJsonCodec, "splitJsonCodec is null");
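// computeStats() is memoized for one second so that frequent JMX polling does not recompute percentiles on every read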
this.storageStats = new StorageStats(Suppliers.memoizeWithExpiration(this::computeStats, 1, TimeUnit.SECONDS));
}
/**
* Initializes task descriptor storage for a given {@code queryId}.
* It is expected to be called before query scheduling begins.
*/
public synchronized void initialize(QueryId queryId)
{
TaskDescriptors storage = new TaskDescriptors();
verify(storages.putIfAbsent(queryId, storage) == null, "storage is already initialized for query: %s", queryId);
updateMemoryReservation(storage.getReservedBytes());
}
/**
* Stores a {@link TaskDescriptor} for the task identified by the {@code stageId} and {@code partitionId}.
* The {@code partitionId} is obtained from the {@link TaskDescriptor} by calling {@link TaskDescriptor#getPartitionId()}.
* If the query has been terminated, the call is ignored.
*
* @throws IllegalStateException if the storage already has a task descriptor for a given task
*/
public synchronized void put(StageId stageId, TaskDescriptor descriptor)
{
TaskDescriptors storage = storages.get(stageId.getQueryId());
if (storage == null) {
// query has been terminated
return;
}
long previousReservedBytes = storage.getReservedBytes();
storage.put(stageId, descriptor.getPartitionId(), descriptor);
long currentReservedBytes = storage.getReservedBytes();
long delta = currentReservedBytes - previousReservedBytes;
updateMemoryReservation(delta);
}
/**
* Gets the task descriptor for the task identified by the {@code stageId} and {@code partitionId}.
*
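* A sketch of how a caller might handle the terminated-query case (the {@code scheduleTask} hook is hypothetical):
* <pre>{@code
* Optional<TaskDescriptor> descriptor = storage.get(stageId, partitionId);
* if (descriptor.isEmpty()) {
*     return; // query was terminated; nothing left to schedule
* }
* scheduleTask(descriptor.get());
* }</pre>
*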
* @return a non-empty {@link TaskDescriptor} for the task identified by the {@code stageId} and {@code partitionId}.
* Returns {@link Optional#empty()} if the query of the given {@code stageId} has finished
* (e.g. cancelled by the user or finished early).
* @throws java.util.NoSuchElementException if {@link TaskDescriptor} for a given task does not exist
*/
public synchronized Optional<TaskDescriptor> get(StageId stageId, int partitionId)
{
TaskDescriptors storage = storages.get(stageId.getQueryId());
if (storage == null) {
// query has been terminated
return Optional.empty();
}
return Optional.of(storage.get(stageId, partitionId));
}
/**
* Removes the {@link TaskDescriptor} for the task identified by the {@code stageId} and {@code partitionId}.
* If the query has been terminated, the call is ignored.
*
* @throws java.util.NoSuchElementException if {@link TaskDescriptor} for a given task does not exist
*/
public synchronized void remove(StageId stageId, int partitionId)
{
TaskDescriptors storage = storages.get(stageId.getQueryId());
if (storage == null) {
// query has been terminated
return;
}
long previousReservedBytes = storage.getReservedBytes();
storage.remove(stageId, partitionId);
long currentReservedBytes = storage.getReservedBytes();
long delta = currentReservedBytes - previousReservedBytes;
updateMemoryReservation(delta);
}
/**
* Notifies the storage that the query with a given {@code queryId} has finished and its task descriptors
* can be safely discarded.
* <p>
* The engine may decide to destroy the storage while scheduling is still in progress (for example, if the query
* was cancelled). Under such circumstances the implementation will ignore future calls to
* {@link #put(StageId, TaskDescriptor)} and return {@link Optional#empty()} from {@link #get(StageId, int)}.
* The scheduler is expected to handle this condition appropriately.
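* <p>
* The resulting contract, as an illustrative sketch:
* <pre>{@code
* storage.destroy(queryId);
* storage.put(stageId, descriptor);  // silently ignored
* storage.get(stageId, partitionId); // returns Optional.empty()
* }</pre>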
*/
public synchronized void destroy(QueryId queryId)
{
TaskDescriptors storage = storages.remove(queryId);
if (storage != null) {
updateMemoryReservation(-storage.getReservedBytes());
}
}
private synchronized void updateMemoryReservation(long delta)
{
reservedBytes += delta;
if (delta <= 0) {
return;
}
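// failing the single largest query may not reclaim enough memory, so keep evicting until back under the limit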
while (reservedBytes > maxMemoryInBytes) {
// drop a query that uses the most storage
QueryId killCandidate = storages.entrySet().stream()
.max(Comparator.comparingLong(entry -> entry.getValue().getReservedBytes()))
.map(Map.Entry::getKey)
.orElseThrow(() -> new VerifyException(format("storage is empty but reservedBytes (%s) is still greater than maxMemoryInBytes (%s)", reservedBytes, maxMemoryInBytes)));
TaskDescriptors storage = storages.get(killCandidate);
long previousReservedBytes = storage.getReservedBytes();
if (log.isInfoEnabled()) {
log.info("Failing query %s; reclaiming %s of %s task descriptor memory from %s queries; extraStorageInfo=%s", killCandidate, storage.getReservedBytes(), succinctBytes(reservedBytes), storages.size(), storage.getDebugInfo());
}
storage.fail(new TrinoException(
EXCEEDED_TASK_DESCRIPTOR_STORAGE_CAPACITY,
format("Task descriptor storage capacity has been exceeded: %s > %s", succinctBytes(maxMemoryInBytes), succinctBytes(reservedBytes))));
long currentReservedBytes = storage.getReservedBytes();
reservedBytes += (currentReservedBytes - previousReservedBytes);
}
}
@VisibleForTesting
synchronized long getReservedBytes()
{
return reservedBytes;
}
@Managed
@Nested
public StorageStats getStats()
{
// The returned object must not contain materialized values. GuiceMBeanExporter calls this method only once during
// application startup, and afterwards only the @Managed methods are called on that instance.
return storageStats;
}
private synchronized StorageStatsValue computeStats()
{
int queriesCount = storages.size();
long stagesCount = storages.values().stream().mapToLong(TaskDescriptors::getStagesCount).sum();
Quantiles.ScaleAndIndexes percentiles = percentiles().indexes(50, 90, 95);
long queryReservedBytesP50 = 0;
long queryReservedBytesP90 = 0;
long queryReservedBytesP95 = 0;
long queryReservedBytesAvg = 0;
long stageReservedBytesP50 = 0;
long stageReservedBytesP90 = 0;
long stageReservedBytesP95 = 0;
long stageReservedBytesAvg = 0;
if (queriesCount > 0) { // we cannot compute percentiles for an empty set
Map<Integer, Double> queryReservedBytesPercentiles = percentiles.compute(
storages.values().stream()
.map(TaskDescriptors::getReservedBytes)
.collect(toImmutableList()));
queryReservedBytesP50 = queryReservedBytesPercentiles.get(50).longValue();
queryReservedBytesP90 = queryReservedBytesPercentiles.get(90).longValue();
queryReservedBytesP95 = queryReservedBytesPercentiles.get(95).longValue();
queryReservedBytesAvg = reservedBytes / queriesCount;
List<Long> storagesReservedBytes = storages.values().stream()
.flatMap(TaskDescriptors::getStagesReservedBytes)
.collect(toImmutableList());
if (!storagesReservedBytes.isEmpty()) {
Map<Integer, Double> stagesReservedBytesPercentiles = percentiles.compute(
storagesReservedBytes);
stageReservedBytesP50 = stagesReservedBytesPercentiles.get(50).longValue();
stageReservedBytesP90 = stagesReservedBytesPercentiles.get(90).longValue();
stageReservedBytesP95 = stagesReservedBytesPercentiles.get(95).longValue();
stageReservedBytesAvg = reservedBytes / stagesCount;
}
}
return new StorageStatsValue(
queriesCount,
stagesCount,
reservedBytes,
queryReservedBytesAvg,
queryReservedBytesP50,
queryReservedBytesP90,
queryReservedBytesP95,
stageReservedBytesAvg,
stageReservedBytesP50,
stageReservedBytesP90,
stageReservedBytesP95);
}
@NotThreadSafe
private class TaskDescriptors
{
private final Table<StageId, Integer, TaskDescriptor> descriptors = HashBasedTable.create();
private long reservedBytes;
private final Map<StageId, AtomicLong> stagesReservedBytes = new HashMap<>();
private RuntimeException failure;
public void put(StageId stageId, int partitionId, TaskDescriptor descriptor)
{
throwIfFailed();
checkState(!descriptors.contains(stageId, partitionId), "task descriptor is already present for key %s/%s", stageId, partitionId);
descriptors.put(stageId, partitionId, descriptor);
long descriptorRetainedBytes = descriptor.getRetainedSizeInBytes();
reservedBytes += descriptorRetainedBytes;
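// also track per-stage totals, which computeStats() turns into stage-level percentiles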
stagesReservedBytes.computeIfAbsent(stageId, _ -> new AtomicLong()).addAndGet(descriptorRetainedBytes);
}
public TaskDescriptor get(StageId stageId, int partitionId)
{
throwIfFailed();
TaskDescriptor descriptor = descriptors.get(stageId, partitionId);
if (descriptor == null) {
throw new NoSuchElementException(format("descriptor not found for key %s/%s", stageId, partitionId));
}
return descriptor;
}
public void remove(StageId stageId, int partitionId)
{
throwIfFailed();
TaskDescriptor descriptor = descriptors.remove(stageId, partitionId);
if (descriptor == null) {
throw new NoSuchElementException(format("descriptor not found for key %s/%s", stageId, partitionId));
}
long descriptorRetainedBytes = descriptor.getRetainedSizeInBytes();
reservedBytes -= descriptorRetainedBytes;
requireNonNull(stagesReservedBytes.get(stageId), () -> format("no entry for stage %s", stageId)).addAndGet(-descriptorRetainedBytes);
}
public long getReservedBytes()
{
return reservedBytes;
}
private String getDebugInfo()
{
Multimap<StageId, TaskDescriptor> descriptorsByStageId = descriptors.cellSet().stream()
.collect(toImmutableSetMultimap(
Table.Cell::getRowKey,
Table.Cell::getValue));
Map<StageId, String> debugInfoByStageId = descriptorsByStageId.asMap().entrySet().stream()
.collect(toImmutableMap(
Map.Entry::getKey,
entry -> getDebugInfo(entry.getValue())));
List<String> biggestSplits = descriptorsByStageId.entries().stream()
.flatMap(entry -> entry.getValue().getSplits().getSplitsFlat().entries().stream().map(splitEntry -> Map.entry("%s/%s".formatted(entry.getKey(), splitEntry.getKey()), splitEntry.getValue())))
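// keep only the three largest splits so the debug output stays bounded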
.sorted(Comparator.<Map.Entry<String, Split>>comparingLong(entry -> entry.getValue().getRetainedSizeInBytes()).reversed())
.limit(3)
.map(entry -> "{nodeId=%s, size=%s, split=%s}".formatted(entry.getKey(), entry.getValue().getRetainedSizeInBytes(), splitJsonCodec.toJson(entry.getValue())))
.toList();
return "stagesInfo=%s; biggestSplits=%s".formatted(debugInfoByStageId, biggestSplits);
}
private String getDebugInfo(Collection<TaskDescriptor> taskDescriptors)
{
int taskDescriptorsCount = taskDescriptors.size();
Stats taskDescriptorsRetainedSizeStats = Stats.of(taskDescriptors.stream().mapToLong(TaskDescriptor::getRetainedSizeInBytes));
Set<PlanNodeId> planNodeIds = taskDescriptors.stream().flatMap(taskDescriptor -> taskDescriptor.getSplits().getSplitsFlat().keySet().stream()).collect(toImmutableSet());
Map<PlanNodeId, String> splitsDebugInfo = new HashMap<>();
for (PlanNodeId planNodeId : planNodeIds) {
Stats splitCountStats = Stats.of(taskDescriptors.stream().mapToLong(taskDescriptor -> taskDescriptor.getSplits().getSplitsFlat().asMap().get(planNodeId).size()));
Stats splitSizeStats = Stats.of(taskDescriptors.stream().flatMap(taskDescriptor -> taskDescriptor.getSplits().getSplitsFlat().get(planNodeId).stream()).mapToLong(Split::getRetainedSizeInBytes));
splitsDebugInfo.put(
planNodeId,
"{splitCountMean=%s, splitCountStdDev=%s, splitSizeMean=%s, splitSizeStdDev=%s}".formatted(
splitCountStats.mean(),
splitCountStats.populationStandardDeviation(),
splitSizeStats.mean(),
splitSizeStats.populationStandardDeviation()));
}
return "[taskDescriptorsCount=%s, taskDescriptorsRetainedSizeMean=%s, taskDescriptorsRetainedSizeStdDev=%s, splits=%s]".formatted(
taskDescriptorsCount,
taskDescriptorsRetainedSizeStats.mean(),
taskDescriptorsRetainedSizeStats.populationStandardDeviation(),
splitsDebugInfo);
}
private void fail(RuntimeException failure)
{
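// only the first failure is recorded; subsequent put/get/remove calls rethrow it via throwIfFailed()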
if (this.failure == null) {
descriptors.clear();
reservedBytes = 0;
this.failure = failure;
}
}
private void throwIfFailed()
{
if (failure != null) {
throw failure;
}
}
public int getStagesCount()
{
return descriptors.rowMap().size();
}
public Stream<Long> getStagesReservedBytes()
{
return stagesReservedBytes.values().stream()
.map(AtomicLong::get);
}
}
private record StorageStatsValue(
long queriesCount,
long stagesCount,
long reservedBytes,
long queryReservedBytesAvg,
long queryReservedBytesP50,
long queryReservedBytesP90,
long queryReservedBytesP95,
long stageReservedBytesAvg,
long stageReservedBytesP50,
long stageReservedBytesP90,
long stageReservedBytesP95) {}
public static class StorageStats
{
private final Supplier<StorageStatsValue> statsSupplier;
StorageStats(Supplier<StorageStatsValue> statsSupplier)
{
this.statsSupplier = requireNonNull(statsSupplier, "statsSupplier is null");
}
@Managed
public long getQueriesCount()
{
return statsSupplier.get().queriesCount();
}
@Managed
public long getStagesCount()
{
return statsSupplier.get().stagesCount();
}
@Managed
public long getReservedBytes()
{
return statsSupplier.get().reservedBytes();
}
@Managed
public long getQueryReservedBytesAvg()
{
return statsSupplier.get().queryReservedBytesAvg();
}
@Managed
public long getQueryReservedBytesP50()
{
return statsSupplier.get().queryReservedBytesP50();
}
@Managed
public long getQueryReservedBytesP90()
{
return statsSupplier.get().queryReservedBytesP90();
}
@Managed
public long getQueryReservedBytesP95()
{
return statsSupplier.get().queryReservedBytesP95();
}
@Managed
public long getStageReservedBytesAvg()
{
return statsSupplier.get().stageReservedBytesAvg();
}
@Managed
public long getStageReservedBytesP50()
{
return statsSupplier.get().stageReservedBytesP50();
}
@Managed
public long getStageReservedBytesP90()
{
return statsSupplier.get().stageReservedBytesP90();
}
@Managed
public long getStageReservedBytesP95()
{
return statsSupplier.get().stageReservedBytesP95();
}
}
}