// io.trino.operator.exchange.ScaleWriterExchanger
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.operator.exchange;
import com.google.common.util.concurrent.ListenableFuture;
import io.airlift.log.Logger;
import io.airlift.units.DataSize;
import io.trino.spi.Page;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import java.util.function.Supplier;
import static java.util.Objects.requireNonNull;
/**
* Scale up local writers based on throughput and data processed by writers.
* Input pages are distributed across different writers in a round-robin fashion.
*/
public class ScaleWriterExchanger
implements LocalExchanger
{
private static final Logger log = Logger.get(ScaleWriterExchanger.class);
private final List> buffers;
private final LocalExchangeMemoryManager memoryManager;
private final long maxBufferedBytes;
private final AtomicLong dataProcessed;
private final long writerScalingMinDataProcessed;
private final Supplier totalMemoryUsed;
private final long maxMemoryPerNode;
// Start with single writer and increase the writer count based on
// data processed by writers and buffer utilization.
private int writerCount = 1;
private int nextWriterIndex = -1;
public ScaleWriterExchanger(
List> buffers,
LocalExchangeMemoryManager memoryManager,
long maxBufferedBytes,
AtomicLong dataProcessed,
DataSize writerScalingMinDataProcessed,
Supplier totalMemoryUsed,
long maxMemoryPerNode)
{
this.buffers = requireNonNull(buffers, "buffers is null");
this.memoryManager = requireNonNull(memoryManager, "memoryManager is null");
this.maxBufferedBytes = maxBufferedBytes;
this.dataProcessed = requireNonNull(dataProcessed, "dataProcessed is null");
this.writerScalingMinDataProcessed = writerScalingMinDataProcessed.toBytes();
this.totalMemoryUsed = requireNonNull(totalMemoryUsed, "totalMemoryUsed is null");
this.maxMemoryPerNode = maxMemoryPerNode;
}
@Override
public void accept(Page page)
{
dataProcessed.addAndGet(page.getSizeInBytes());
Consumer buffer = buffers.get(getNextWriterIndex());
memoryManager.updateMemoryUsage(page.getRetainedSizeInBytes());
buffer.accept(page);
}
private int getNextWriterIndex()
{
// Scale up writers when current buffer memory utilization is more than 50% of the
// maximum and data processed is greater than current writer count * writerScalingMinOutputSize.
// This also mean that we won't scale local writers if the writing speed can cope up
// with incoming data. In another word, buffer utilization is below 50%.
if (writerCount < buffers.size() && memoryManager.getBufferedBytes() >= maxBufferedBytes / 2) {
if (dataProcessed.get() >= writerCount * writerScalingMinDataProcessed
// Do not scale up if total memory used is greater than 50% of max memory per node.
// We have to be conservative here otherwise scaling of writers will happen first
// before we hit this limit, and then we won't be able to do anything to stop OOM error.
&& totalMemoryUsed.get() < maxMemoryPerNode * 0.5) {
writerCount++;
log.debug("Increased task writer count: %d", writerCount);
}
}
nextWriterIndex = (nextWriterIndex + 1) % writerCount;
return nextWriterIndex;
}
@Override
public ListenableFuture waitForWriting()
{
return memoryManager.getNotFullFuture();
}
}