All downloads are free. Search and download functionality uses the official Maven repository.

io.trino.plugin.exchange.filesystem.FileSystemExchangeSink Maven / Gradle / Ivy

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.exchange.filesystem;

import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.errorprone.annotations.ThreadSafe;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import io.airlift.slice.SizeOf;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;
import io.airlift.slice.Slices;
import io.trino.spi.TrinoException;
import io.trino.spi.exchange.ExchangeSink;
import io.trino.spi.exchange.ExchangeSinkInstanceHandle;

import java.net.URI;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicReference;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Throwables.throwIfUnchecked;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.util.concurrent.Futures.immediateFailedFuture;
import static com.google.common.util.concurrent.Futures.immediateVoidFuture;
import static com.google.common.util.concurrent.MoreExecutors.directExecutor;
import static io.airlift.concurrent.MoreFutures.addExceptionCallback;
import static io.airlift.concurrent.MoreFutures.addSuccessCallback;
import static io.airlift.concurrent.MoreFutures.asVoid;
import static io.airlift.concurrent.MoreFutures.toCompletableFuture;
import static io.airlift.slice.SizeOf.estimatedSizeOf;
import static io.airlift.slice.SizeOf.instanceSize;
import static io.airlift.units.DataSize.succinctBytes;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.CompletableFuture.completedFuture;
import static java.util.concurrent.CompletableFuture.failedFuture;

@ThreadSafe
public class FileSystemExchangeSink
        implements ExchangeSink
{
    /** Name of the empty marker file that {@link #finish()} creates in the output directory after all writers have finished. */
    public static final String COMMITTED_MARKER_FILE_NAME = "committed";
    /** Suffix appended to the name of every data file created by a partition writer. */
    public static final String DATA_FILE_SUFFIX = ".data";

    private static final int INSTANCE_SIZE = instanceSize(FileSystemExchangeSink.class);

    private final FileSystemExchangeStorage exchangeStorage;
    private final FileSystemExchangeStats stats;
    private final URI outputDirectory;
    private final int outputPartitionCount;
    private final boolean preserveOrderWithinPartition;
    private final int maxPageStorageSizeInBytes;
    private final long maxFileSizeInBytes;
    private final BufferPool bufferPool;

    // Partition id -> writer; a writer is created lazily by the first add() for its partition
    private final Map<Integer, BufferedStorageWriter> writersMap = new ConcurrentHashMap<>();
    // First failure reported by an asynchronous storage write; rethrown by throwIfFailed()
    private final AtomicReference<Throwable> failure = new AtomicReference<>();
    // Set by abort() and by a successfully completed finish(); read without the lock by callbacks
    private volatile boolean closed;

    public FileSystemExchangeSink(
            FileSystemExchangeStorage exchangeStorage,
            FileSystemExchangeStats stats,
            URI outputDirectory,
            int outputPartitionCount,
            boolean preserveOrderWithinPartition,
            int maxPageStorageSizeInBytes,
            int exchangeSinkBufferPoolMinSize,
            int exchangeSinkBuffersPerPartition,
            long maxFileSizeInBytes)
    {
        checkArgument(maxPageStorageSizeInBytes <= maxFileSizeInBytes,
                format("maxPageStorageSizeInBytes %s exceeded maxFileSizeInBytes %s", succinctBytes(maxPageStorageSizeInBytes), succinctBytes(maxFileSizeInBytes)));

        this.exchangeStorage = requireNonNull(exchangeStorage, "exchangeStorage is null");
        this.stats = requireNonNull(stats, "stats is null");
        this.outputDirectory = requireNonNull(outputDirectory, "outputDirectory is null");
        this.outputPartitionCount = outputPartitionCount;
        this.preserveOrderWithinPartition = preserveOrderWithinPartition;
        this.maxPageStorageSizeInBytes = maxPageStorageSizeInBytes;
        this.maxFileSizeInBytes = maxFileSizeInBytes;
        // buffer pooling to overlap computation and I/O
        this.bufferPool = new BufferPool(stats, max(outputPartitionCount * exchangeSinkBuffersPerPartition, exchangeSinkBufferPoolMinSize), exchangeStorage.getWriteBufferSize());
    }

    /**
     * Reports whether this sink needs a new instance handle.
     *
     * @return always {@code false}
     */
    @Override
    public boolean isHandleUpdateRequired()
    {
        return false;
    }

    /**
     * Not supported by this sink.
     *
     * @param handle ignored
     * @throws UnsupportedOperationException always
     */
    @Override
    public void updateHandle(ExchangeSinkInstanceHandle handle)
    {
        // this implementation never requests an update
        throw new UnsupportedOperationException();
    }

    /**
     * Returns the blocked state of the underlying buffer pool.
     *
     * <p>The returned future should only be considered as a best-effort hint.
     *
     * @return the future provided by the buffer pool
     */
    @Override
    public CompletableFuture<Void> isBlocked()
    {
        return bufferPool.isBlocked();
    }

    /**
     * Appends {@code data} to the writer of the given partition, creating the writer on first use.
     *
     * <p>If the sink is already closed the call returns without writing anything.
     *
     * @param partitionId target partition, in the range {@code [0, outputPartitionCount)}
     * @param data bytes to append
     * @throws IllegalArgumentException if {@code partitionId} is outside the valid range
     * @throws RuntimeException if an earlier asynchronous write has failed; the recorded failure is rethrown
     */
    @Override
    public void add(int partitionId, Slice data)
    {
        throwIfFailed();

        // A negative id would otherwise create a writer and a data file for a partition that cannot exist
        checkArgument(partitionId >= 0, "partition id is expected to be non-negative: %s", partitionId);
        checkArgument(partitionId < outputPartitionCount, "partition id is expected to be less than %s: %s", outputPartitionCount, partitionId);

        // Ensure no new writers can be created after `closed` is set to true
        BufferedStorageWriter writer;
        synchronized (this) {
            if (closed) {
                return;
            }
            writer = writersMap.computeIfAbsent(partitionId, this::createWriter);
        }
        // The write itself happens outside the sink lock; the writer synchronizes on itself
        writer.write(data);
    }

    /**
     * Builds the writer for one partition, handing it this sink's storage, stats, output directory,
     * buffer pool, shared failure holder and size limits.
     *
     * @param partitionId partition the new writer is responsible for
     * @return a new writer for {@code partitionId}
     */
    private BufferedStorageWriter createWriter(int partitionId)
    {
        return new BufferedStorageWriter(
                exchangeStorage,
                stats,
                outputDirectory,
                preserveOrderWithinPartition,
                partitionId,
                bufferPool,
                failure,
                maxPageStorageSizeInBytes,
                maxFileSizeInBytes);
    }

    /**
     * Estimates the memory retained by this sink.
     *
     * @return the sum of the instance size, the buffer pool's retained size and the estimated size
     *         of the writers map including each writer's retained size
     */
    @Override
    public long getMemoryUsage()
    {
        long bufferPoolSize = bufferPool.getRetainedSize();
        long writersSize = estimatedSizeOf(writersMap, SizeOf::sizeOf, BufferedStorageWriter::getRetainedSize);
        return INSTANCE_SIZE + bufferPoolSize + writersSize;
    }

    /**
     * Finishes all partition writers and then creates the committed marker file in the output directory.
     *
     * <p>Once every writer has finished successfully the writers map is cleared and the buffer pool
     * is closed. The sink is marked closed only after the marker file has been created. If any step
     * fails, {@link #abort()} is invoked.
     *
     * @return a future, recorded in the sink-finish stats, that completes when the marker file has
     *         been created; a failed future with {@link IllegalStateException} if the sink is already closed
     */
    @Override
    public synchronized CompletableFuture<Void> finish()
    {
        if (closed) {
            return failedFuture(new IllegalStateException("Exchange sink has already closed"));
        }

        ListenableFuture<Void> finishFuture = asVoid(Futures.allAsList(
                writersMap.values().stream().map(BufferedStorageWriter::finish).collect(toImmutableList())));
        // Release writers and buffers as soon as the data files are complete
        addSuccessCallback(finishFuture, this::destroy);
        finishFuture = Futures.transformAsync(
                finishFuture,
                _ -> exchangeStorage.createEmptyFile(outputDirectory.resolve(COMMITTED_MARKER_FILE_NAME)),
                directExecutor());
        Futures.addCallback(finishFuture, new FutureCallback<>()
        {
            @Override
            public void onSuccess(Void result)
            {
                closed = true;
            }

            @Override
            public void onFailure(Throwable ignored)
            {
                // The failure itself is delivered through the returned future; here we only clean up
                abort();
            }
        }, directExecutor());

        return stats.getExchangeSinkFinish().record(toCompletableFuture(finishFuture));
    }

    /**
     * Aborts all partition writers and then recursively deletes the output directory.
     *
     * <p>The sink is marked closed immediately. Once every writer has aborted successfully the
     * writers map is cleared and the buffer pool is closed.
     *
     * @return a future, recorded in the sink-abort stats, that completes when the output directory
     *         has been deleted; an already completed future if the sink was closed before this call
     */
    @Override
    public synchronized CompletableFuture<Void> abort()
    {
        if (closed) {
            return completedFuture(null);
        }
        closed = true;

        ListenableFuture<Void> abortFuture = asVoid(Futures.allAsList(
                writersMap.values().stream().map(BufferedStorageWriter::abort).collect(toImmutableList())));
        addSuccessCallback(abortFuture, this::destroy);

        return stats.getExchangeSinkAbort().record(toCompletableFuture(Futures.transformAsync(
                abortFuture,
                _ -> exchangeStorage.deleteRecursively(ImmutableList.of(outputDirectory)),
                directExecutor())));
    }

    /**
     * Rethrows the recorded asynchronous failure, if there is one.
     *
     * <p>Unchecked throwables are rethrown as-is; anything else is wrapped in a {@link RuntimeException}.
     */
    private void throwIfFailed()
    {
        Throwable recorded = failure.get();
        if (recorded == null) {
            return;
        }
        throwIfUnchecked(recorded);
        throw new RuntimeException(recorded);
    }

    /**
     * Releases the sink's resources: drops every partition writer from the map and closes the buffer pool.
     */
    private void destroy()
    {
        writersMap.clear();
        bufferPool.close();
    }

    @ThreadSafe
    private static class BufferedStorageWriter
    {
        private static final int INSTANCE_SIZE = instanceSize(BufferedStorageWriter.class);

        private final FileSystemExchangeStorage exchangeStorage;
        private final FileSystemExchangeStats stats;
        private final URI outputDirectory;
        private final boolean preserveOrderWithinPartition;
        private final int partitionId;
        private final BufferPool bufferPool;
        // Receives the first failure of an asynchronous storage write (set via compareAndSet from null)
        private final AtomicReference<Throwable> failure;
        private final int maxPageStorageSizeInBytes;
        private final long maxFileSizeInBytes;

        // Writer of the file currently being appended to; also the last element of writers
        @GuardedBy("this")
        private ExchangeStorageWriter currentWriter;
        // Bytes accepted for the current file so far, length prefixes included
        @GuardedBy("this")
        private long currentFileSize;
        // Buffer currently being filled; null when the next write must take one from the pool
        @GuardedBy("this")
        private SliceOutput currentBuffer;
        // Every storage writer created for this partition, in creation order
        @GuardedBy("this")
        private final List<ExchangeStorageWriter> writers = new ArrayList<>();
        @GuardedBy("this")
        private boolean closed;

        /**
         * Creates the writer for one partition and opens its first data file.
         *
         * @param exchangeStorage storage used to create the underlying file writers
         * @param stats statistics collector that receives file sizes
         * @param outputDirectory directory in which the data files are created
         * @param preserveOrderWithinPartition when {@code true}, the partition is never rolled over to a new file
         * @param partitionId partition this writer belongs to; used as the file name prefix
         * @param bufferPool pool the write buffers are taken from and returned to
         * @param failure holder that receives the first asynchronous write failure
         * @param maxPageStorageSizeInBytes largest allowed size of a single entry (length prefix plus data)
         * @param maxFileSizeInBytes size threshold after which a new data file is started
         * @throws NullPointerException if any of the reference arguments is null
         */
        public BufferedStorageWriter(
                FileSystemExchangeStorage exchangeStorage,
                FileSystemExchangeStats stats,
                URI outputDirectory,
                boolean preserveOrderWithinPartition,
                int partitionId,
                BufferPool bufferPool,
                AtomicReference<Throwable> failure,
                int maxPageStorageSizeInBytes,
                long maxFileSizeInBytes)
        {
            this.exchangeStorage = requireNonNull(exchangeStorage, "exchangeStorage is null");
            this.stats = requireNonNull(stats, "stats is null");
            this.outputDirectory = requireNonNull(outputDirectory, "outputDirectory is null");
            this.preserveOrderWithinPartition = preserveOrderWithinPartition;
            this.partitionId = partitionId;
            this.bufferPool = requireNonNull(bufferPool, "bufferPool is null");
            this.failure = requireNonNull(failure, "failure is null");
            this.maxPageStorageSizeInBytes = maxPageStorageSizeInBytes;
            this.maxFileSizeInBytes = maxFileSizeInBytes;

            // Open the first data file eagerly so currentWriter is never null
            setupWriterForNextPart();
        }

        /**
         * Appends one entry, a 4-byte length prefix followed by {@code data}, to the current data file.
         *
         * <p>When order does not have to be preserved and the entry would push the current file past
         * {@code maxFileSizeInBytes}, the current buffer is flushed and a new data file is started first.
         * Calls on a closed writer are ignored.
         *
         * @param data bytes of the entry
         * @throws TrinoException with {@code NOT_SUPPORTED} if the entry, including its length prefix,
         *         is larger than {@code maxPageStorageSizeInBytes}
         */
        public synchronized void write(Slice data)
        {
            if (closed) {
                return;
            }

            // Computed as long so a slice close to Integer.MAX_VALUE bytes cannot overflow to a
            // negative size and slip past the limit check below
            long requiredPageStorageSize = (long) Integer.BYTES + data.length();
            if (requiredPageStorageSize > maxPageStorageSizeInBytes) {
                throw new TrinoException(NOT_SUPPORTED, format("Max row size of %s exceeded: %s", succinctBytes(maxPageStorageSizeInBytes), succinctBytes(requiredPageStorageSize)));
            }

            if (currentFileSize + requiredPageStorageSize > maxFileSizeInBytes && !preserveOrderWithinPartition) {
                stats.getFileSizeInBytes().add(currentFileSize);
                flushIfNeeded(true);
                setupWriterForNextPart();
                currentFileSize = 0;
                // The flushed buffer goes back to the pool once its write completes; do not reuse it here
                currentBuffer = null;
            }

            Slice sizeSlice = Slices.allocate(Integer.BYTES);
            sizeSlice.setInt(0, data.length());
            writeInternal(sizeSlice);
            writeInternal(data);

            currentFileSize += requiredPageStorageSize;
        }

        /**
         * Flushes the buffer that is currently being filled and finishes every storage writer of this partition.
         *
         * <p>This method does not mark the writer closed, so {@link #abort()} still takes effect afterwards.
         *
         * @return a future that completes when all storage writers have finished; a failed future with
         *         {@link IllegalStateException} if the writer is already closed
         */
        public synchronized ListenableFuture<Void> finish()
        {
            if (closed) {
                return immediateFailedFuture(new IllegalStateException("BufferedStorageWriter has already closed"));
            }

            stats.getFileSizeInBytes().add(currentFileSize);
            flushIfNeeded(true);
            // With a single file there is nothing to combine
            if (writers.size() == 1) {
                return currentWriter.finish();
            }
            return asVoid(Futures.allAsList(writers.stream().map(ExchangeStorageWriter::finish).collect(toImmutableList())));
        }

        /**
         * Marks this writer closed and aborts every storage writer of this partition.
         *
         * @return a future that completes when all storage writers have aborted; an already completed
         *         future if the writer was closed before this call
         */
        public synchronized ListenableFuture<Void> abort()
        {
            if (closed) {
                return immediateVoidFuture();
            }
            closed = true;

            // With a single file there is nothing to combine
            if (writers.size() == 1) {
                return currentWriter.abort();
            }
            return asVoid(Futures.allAsList(writers.stream().map(ExchangeStorageWriter::abort).collect(toImmutableList())));
        }

        /**
         * Estimates the memory retained by this writer.
         *
         * @return the instance size plus the estimated size of the list of storage writers,
         *         each measured by its own retained size
         */
        public synchronized long getRetainedSize()
        {
            long storageWritersSize = estimatedSizeOf(writers, ExchangeStorageWriter::getRetainedSize);
            return INSTANCE_SIZE + storageWritersSize;
        }

        /**
         * Opens the next data file for this partition and makes it the current writer.
         *
         * <p>The file name is the partition id, an underscore, the number of files created so far,
         * and the data file suffix.
         */
        @GuardedBy("this")
        private void setupWriterForNextPart()
        {
            String fileName = partitionId + "_" + writers.size() + DATA_FILE_SUFFIX;
            ExchangeStorageWriter nextWriter = exchangeStorage.createExchangeStorageWriter(outputDirectory.resolve(fileName));
            currentWriter = nextWriter;
            writers.add(nextWriter);
        }

        /**
         * Copies {@code slice} into pooled buffers, flushing each buffer to the current storage writer
         * as soon as it is full.
         *
         * <p>Returns early, dropping the remaining bytes, if the buffer pool has been closed.
         *
         * @param slice bytes to append
         */
        @GuardedBy("this")
        private void writeInternal(Slice slice)
        {
            int offset = 0;
            while (offset < slice.length()) {
                if (currentBuffer == null) {
                    currentBuffer = bufferPool.take();
                    if (currentBuffer == null) {
                        // buffer pool is closed
                        return;
                    }
                }
                int chunkLength = min(currentBuffer.writableBytes(), slice.length() - offset);
                // Copy straight from the source slice instead of materializing a temporary byte[]
                currentBuffer.writeBytes(slice, offset, chunkLength);
                offset += chunkLength;

                flushIfNeeded(false);
            }
        }

        /**
         * Hands the current buffer to the current storage writer when it is full or when
         * {@code finished} is set.
         *
         * <p>A full buffer is detached from this writer. The buffer is offered back to the pool when
         * the asynchronous write completes, whether it succeeds or fails, and a write failure is
         * recorded in the failure holder if none has been recorded yet.
         *
         * @param finished {@code true} to flush the buffer even if it still has room
         */
        @GuardedBy("this")
        private void flushIfNeeded(boolean finished)
        {
            SliceOutput buffer = currentBuffer;
            if (buffer != null && (!buffer.isWritable() || finished)) {
                if (!buffer.isWritable()) {
                    currentBuffer = null;
                }
                ListenableFuture<Void> writeFuture = currentWriter.write(buffer.slice());
                writeFuture.addListener(() -> bufferPool.offer(buffer), directExecutor());
                addExceptionCallback(writeFuture, throwable -> failure.compareAndSet(null, throwable));
            }
        }
    }

    @ThreadSafe
    private static class BufferPool
    {
        private static final int INSTANCE_SIZE = instanceSize(BufferPool.class);

        private final FileSystemExchangeStats stats;
        // Upper bound for numBuffersCreated
        private final int maxNumBuffers;
        // Size in bytes of each allocated buffer
        private final int writeBufferSize;
        // Retained size of the first buffer, used as the per-buffer estimate
        private final long bufferRetainedSize;
        // Buffers that are currently available to take()
        @GuardedBy("this")
        private final Queue<SliceOutput> freeBuffersQueue;
        // Completed whenever a buffer is offered back or the pool is closed
        @GuardedBy("this")
        private CompletableFuture<Void> blockedFuture = new CompletableFuture<>();
        @GuardedBy("this")
        private boolean closed;
        // Number of buffers allocated so far; buffers are allocated lazily up to maxNumBuffers
        @GuardedBy("this")
        private int numBuffersCreated;

        public BufferPool(FileSystemExchangeStats stats, int maxNumBuffers, int writeBufferSize)
        {
            this.stats = requireNonNull(stats, "stats is null");
            checkArgument(maxNumBuffers >= 1, "maxNumBuffers must be at least one");

            this.maxNumBuffers = maxNumBuffers;
            this.writeBufferSize = writeBufferSize;
            this.numBuffersCreated = 1;
            this.freeBuffersQueue = new ArrayDeque<>(maxNumBuffers);
            freeBuffersQueue.add(Slices.allocate(writeBufferSize).getOutput());
            this.bufferRetainedSize = freeBuffersQueue.peek().getRetainedSize();
        }

        /**
         * Reports whether a buffer is currently available.
         *
         * <p>The availability check may allocate a new buffer if the pool has not yet reached its
         * maximum. When no buffer is available, a new future is created and recorded in the stats
         * only if the previous one has already completed.
         *
         * @return {@code NOT_BLOCKED} if a buffer is available, otherwise the current blocked future
         */
        public synchronized CompletableFuture<Void> isBlocked()
        {
            if (!hasFreeBuffers()) {
                if (blockedFuture.isDone()) {
                    blockedFuture = new CompletableFuture<>();
                    stats.getExchangeSinkBlocked().record(blockedFuture);
                }
                return blockedFuture;
            }
            return NOT_BLOCKED;
        }

        /**
         * Takes a buffer from the pool, waiting until one is available or the pool is closed.
         *
         * @return a free buffer, or {@code null} if the pool is closed
         * @throws RuntimeException wrapping {@link InterruptedException} if the waiting thread is
         *         interrupted; the interrupt flag is restored first
         */
        public synchronized SliceOutput take()
        {
            while (!closed) {
                if (hasFreeBuffers()) {
                    return freeBuffersQueue.poll();
                }
                try {
                    // Woken up by offer() or close()
                    wait();
                }
                catch (InterruptedException interrupted) {
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(interrupted);
                }
            }
            return null;
        }

        /**
         * Resets {@code buffer} and returns it to the pool, waking one thread waiting in {@link #take()}
         * and completing the current blocked future.
         *
         * <p>If the pool is closed the buffer is dropped.
         *
         * @param buffer buffer that is no longer in use
         */
        public void offer(SliceOutput buffer)
        {
            buffer.reset();

            CompletableFuture<Void> completableFuture;
            synchronized (this) {
                if (closed) {
                    return;
                }
                completableFuture = blockedFuture;
                freeBuffersQueue.add(buffer);
                notify();
            }

            // Completed outside the lock so that dependent callbacks do not run while holding it
            completableFuture.complete(null);
        }

        /**
         * Estimates the memory retained by the pool.
         *
         * @return the instance size only when the pool is closed; otherwise the instance size plus
         *         the per-buffer retained size multiplied by the number of buffers created
         */
        public synchronized long getRetainedSize()
        {
            return closed
                    ? INSTANCE_SIZE
                    : INSTANCE_SIZE + numBuffersCreated * bufferRetainedSize;
        }

        /**
         * Closes the pool: wakes every thread waiting in {@link #take()}, discards the free buffers
         * and completes the current blocked future. Subsequent calls do nothing.
         */
        public void close()
        {
            CompletableFuture<Void> completableFuture;
            synchronized (this) {
                if (closed) {
                    return;
                }
                closed = true;
                notifyAll();
                completableFuture = blockedFuture;
                freeBuffersQueue.clear();
            }

            // Completed outside the lock so that dependent callbacks do not run while holding it
            completableFuture.complete(null);
        }

        /**
         * Checks whether {@link #take()} could return a buffer right now.
         *
         * <p>If the free queue is empty and fewer than {@code maxNumBuffers} buffers have been created,
         * a new buffer is allocated and queued as a side effect.
         *
         * @return {@code true} if the free queue holds at least one buffer when the method returns
         */
        @GuardedBy("this")
        private boolean hasFreeBuffers()
        {
            if (freeBuffersQueue.isEmpty()) {
                if (numBuffersCreated >= maxNumBuffers) {
                    return false;
                }
                // Grow the pool lazily, one buffer at a time
                freeBuffersQueue.add(Slices.allocate(writeBufferSize).getOutput());
                numBuffersCreated++;
            }
            return true;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy