io.trino.plugin.hive.MergeFileWriter

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.hive;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.io.Closer;
import io.trino.filesystem.Location;
import io.trino.metastore.HiveType;
import io.trino.plugin.hive.HiveWriterFactory.RowIdSortingFileWriterMaker;
import io.trino.plugin.hive.acid.AcidTransaction;
import io.trino.plugin.hive.orc.OrcFileWriterFactory;
import io.trino.spi.Page;
import io.trino.spi.block.Block;
import io.trino.spi.block.LongArrayBlock;
import io.trino.spi.block.RowBlock;
import io.trino.spi.block.RunLengthEncodedBlock;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.connector.MergePage;
import io.trino.spi.type.TypeManager;

import java.io.Closeable;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.orc.OrcWriter.OrcOperation.DELETE;
import static io.trino.orc.OrcWriter.OrcOperation.INSERT;
import static io.trino.plugin.hive.HivePageSource.BUCKET_CHANNEL;
import static io.trino.plugin.hive.HivePageSource.ORIGINAL_TRANSACTION_CHANNEL;
import static io.trino.plugin.hive.HivePageSource.ROW_ID_CHANNEL;
import static io.trino.plugin.hive.HiveStorageFormat.ORC;
import static io.trino.plugin.hive.acid.AcidSchema.ACID_COLUMN_NAMES;
import static io.trino.plugin.hive.acid.AcidSchema.createAcidSchema;
import static io.trino.plugin.hive.orc.OrcFileWriter.computeBucketValue;
import static io.trino.plugin.hive.util.AcidTables.deleteDeltaSubdir;
import static io.trino.plugin.hive.util.AcidTables.deltaSubdir;
import static io.trino.plugin.hive.util.HiveTypeUtil.getTypeSignature;
import static io.trino.spi.block.RowBlock.getRowFieldsFromBlock;
import static io.trino.spi.connector.MergePage.createDeleteAndInsertPages;
import static io.trino.spi.predicate.Utils.nativeValueToBlock;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.IntegerType.INTEGER;
import static java.util.Objects.requireNonNull;

public final class MergeFileWriter
        implements FileWriter
{
    // The bucketPath looks like this: /root/dir/delta_nnnnnnn_mmmmmmm_ssss/bucket_bbbbb(_aaaa)?
    private static final Pattern BUCKET_PATH_MATCHER = Pattern.compile("(?s)(?<rootDir>.*)/(?<dirStart>delta_\\d+_\\d+)_(?<statementId>\\d+)/(?<filenameBase>bucket_(?<bucketNumber>\\d+))(?<attemptId>_\\d+)?$");

    // After compaction, the bucketPath looks like this: /root/dir/base_nnnnnnn(_vmmmmmmm)?/bucket_bbbbb(_aaaa)?
    private static final Pattern BASE_PATH_MATCHER = Pattern.compile("(?s)(?<rootDir>.*)/(?<dirStart>base_-?\\d+(_v\\d+)?)/(?<filenameBase>bucket_(?<bucketNumber>\\d+))(?<attemptId>_\\d+)?$");

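    // Single-position blocks holding the ORC ACID operation codes for DELETE and INSERT; they are
    // expanded to page length with run-length encoding when the delete and insert pages are built.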
    private static final Block DELETE_OPERATION_BLOCK = nativeValueToBlock(INTEGER, (long) DELETE.getOperationNumber());
    private static final Block INSERT_OPERATION_BLOCK = nativeValueToBlock(INTEGER, (long) INSERT.getOperationNumber());

    private final AcidTransaction transaction;
    private final OptionalInt bucketNumber;
    private final Block bucketValueBlock;
    private final ConnectorSession session;
    private final Block hiveRowTypeNullsBlock;
    private final Location deltaDirectory;
    private final Location deleteDeltaDirectory;
    private final List<HiveColumnHandle> inputColumns;
    private final RowIdSortingFileWriterMaker sortingFileWriterMaker;
    private final OrcFileWriterFactory orcFileWriterFactory;
    private final HiveCompressionCodec compressionCodec;
    private final Map<String, String> hiveAcidSchema;
    private final String bucketFilename;
    private Optional<FileWriter> deleteFileWriter = Optional.empty();
    private Optional<FileWriter> insertFileWriter = Optional.empty();

    private int deleteRowCount;
    private int insertRowCount;

    public MergeFileWriter(
            AcidTransaction transaction,
            int statementId,
            OptionalInt bucketNumber,
            RowIdSortingFileWriterMaker sortingFileWriterMaker,
            String bucketPath,
            OrcFileWriterFactory orcFileWriterFactory,
            HiveCompressionCodec compressionCodec,
            List<HiveColumnHandle> inputColumns,
            ConnectorSession session,
            TypeManager typeManager,
            HiveType hiveRowType)
    {
        this.transaction = requireNonNull(transaction, "transaction is null");
        this.bucketNumber = requireNonNull(bucketNumber, "bucketNumber is null");
        this.sortingFileWriterMaker = requireNonNull(sortingFileWriterMaker, "sortingFileWriterMaker is null");
        this.bucketValueBlock = nativeValueToBlock(INTEGER, (long) computeBucketValue(bucketNumber.orElse(0), statementId));
        this.orcFileWriterFactory = requireNonNull(orcFileWriterFactory, "orcFileWriterFactory is null");
        this.compressionCodec = requireNonNull(compressionCodec, "compressionCodec is null");
        this.session = requireNonNull(session, "session is null");
        checkArgument(transaction.isTransactional(), "Not in a transaction: %s", transaction);
        this.hiveAcidSchema = createAcidSchema(hiveRowType);
        this.hiveRowTypeNullsBlock = nativeValueToBlock(typeManager.getType(getTypeSignature(hiveRowType)), null);
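        // Parse the bucket path to recover the table root directory and the bucket file name:
        // try the post-compaction base_... layout first, then fall back to the delta_... layout.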
        Matcher matcher = BASE_PATH_MATCHER.matcher(bucketPath);
        if (!matcher.matches()) {
            matcher = BUCKET_PATH_MATCHER.matcher(bucketPath);
            checkArgument(matcher.matches(), "bucketPath doesn't have the required format: %s", bucketPath);
        }
        this.bucketFilename = matcher.group("filenameBase");
        long writeId = transaction.getWriteId();
        this.deltaDirectory = Location.of(matcher.group("rootDir")).appendPath(deltaSubdir(writeId, statementId));
        this.deleteDeltaDirectory = Location.of(matcher.group("rootDir")).appendPath(deleteDeltaSubdir(writeId, statementId));
        this.inputColumns = requireNonNull(inputColumns, "inputColumns is null");
    }

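    // Splits the incoming merge page into deletions and insertions: deletions are appended to the
    // delete delta writer and insertions to the delta writer, each in the Hive ACID row layout.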
    @Override
    public void appendRows(Page page)
    {
        if (page.getPositionCount() == 0) {
            return;
        }

        MergePage mergePage = createDeleteAndInsertPages(page, inputColumns.size());
        mergePage.getDeletionsPage().ifPresent(deletePage -> {
            Block acidBlock = deletePage.getBlock(deletePage.getChannelCount() - 1);
            Page orcDeletePage = buildDeletePage(acidBlock, transaction.getWriteId());
            getOrCreateDeleteFileWriter().appendRows(orcDeletePage);
            deleteRowCount += deletePage.getPositionCount();
        });
        mergePage.getInsertionsPage().ifPresent(insertPage -> {
            Page orcInsertPage = buildInsertPage(insertPage, transaction.getWriteId(), inputColumns, bucketValueBlock, insertRowCount);
            getOrCreateInsertFileWriter().appendRows(orcInsertPage);
            insertRowCount += insertPage.getPositionCount();
        });
    }

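    // Builds an ACID insert page: operation, originalTransaction, bucket, rowId, currentTransaction,
    // and a row struct of the data columns (partition keys and hidden columns are excluded).
    // For inserts, the original and current transaction are both the current write id.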
    @VisibleForTesting
    public static Page buildInsertPage(Page insertPage, long writeId, List<HiveColumnHandle> columns, Block bucketValueBlock, int insertRowCount)
    {
        int positionCount = insertPage.getPositionCount();
        List<Block> dataColumns = columns.stream()
                .filter(column -> !column.isPartitionKey() && !column.isHidden())
                .map(column -> insertPage.getBlock(column.getBaseHiveColumnIndex()))
                .collect(toImmutableList());
        Block mergedColumnsBlock = RowBlock.fromFieldBlocks(positionCount, dataColumns.toArray(new Block[] {}));
        Block currentTransactionBlock = RunLengthEncodedBlock.create(BIGINT, writeId, positionCount);
        Block[] blockArray = {
                RunLengthEncodedBlock.create(INSERT_OPERATION_BLOCK, positionCount),
                currentTransactionBlock,
                RunLengthEncodedBlock.create(bucketValueBlock, positionCount),
                createRowIdBlock(positionCount, insertRowCount),
                currentTransactionBlock,
                mergedColumnsBlock
        };

        return new Page(blockArray);
    }

    @Override
    public long getWrittenBytes()
    {
        return deleteFileWriter.map(FileWriter::getWrittenBytes).orElse(0L) +
                insertFileWriter.map(FileWriter::getWrittenBytes).orElse(0L);
    }

    @Override
    public long getMemoryUsage()
    {
        return deleteFileWriter.map(FileWriter::getMemoryUsage).orElse(0L) +
                insertFileWriter.map(FileWriter::getMemoryUsage).orElse(0L);
    }

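    // Commits whichever writers were created and returns a single rollback action that, when
    // closed, runs the rollback actions of both underlying writers.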
    @Override
    public Closeable commit()
    {
        Optional<Closeable> deleteRollbackAction = deleteFileWriter.map(FileWriter::commit);
        Optional<Closeable> insertRollbackAction = insertFileWriter.map(FileWriter::commit);
        return () -> {
            try (Closer closer = Closer.create()) {
                insertRollbackAction.ifPresent(closer::register);
                deleteRollbackAction.ifPresent(closer::register);
            }
        };
    }

    @Override
    public void rollback()
    {
        // Make sure both writers get rolled back
        try (Closer closer = Closer.create()) {
            closer.register(() -> insertFileWriter.ifPresent(FileWriter::rollback));
            closer.register(() -> deleteFileWriter.ifPresent(FileWriter::rollback));
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public long getValidationCpuNanos()
    {
        return deleteFileWriter.map(FileWriter::getValidationCpuNanos).orElse(0L) +
                insertFileWriter.map(FileWriter::getValidationCpuNanos).orElse(0L);
    }

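    // Reports the insert and delete row counts, together with the delta and delete-delta
    // directories, each present only if the corresponding writer was actually created.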
    public PartitionUpdateAndMergeResults getPartitionUpdateAndMergeResults(PartitionUpdate partitionUpdate)
    {
        return new PartitionUpdateAndMergeResults(
                partitionUpdate.withRowCount(insertRowCount - deleteRowCount),
                insertRowCount,
                insertFileWriter.isPresent() ? Optional.of(deltaDirectory.toString()) : Optional.empty(),
                deleteRowCount,
                deleteFileWriter.isPresent() ? Optional.of(deleteDeltaDirectory.toString()) : Optional.empty());
    }

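    // Builds an ACID delete page: the original transaction, bucket, and row id are taken from the
    // incoming ACID row-id block, the current transaction is this write id, and the row payload
    // is all nulls.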
    private Page buildDeletePage(Block rowIds, long writeId)
    {
        int positionCount = rowIds.getPositionCount();
        if (rowIds.mayHaveNull()) {
            for (int position = 0; position < positionCount; position++) {
                checkArgument(!rowIds.isNull(position), "The rowIdsRowBlock may not have null rows");
            }
        }
        List<Block> fields = getRowFieldsFromBlock(rowIds);
        Block[] blockArray = {
                RunLengthEncodedBlock.create(DELETE_OPERATION_BLOCK, positionCount),
                fields.get(ORIGINAL_TRANSACTION_CHANNEL),
                fields.get(BUCKET_CHANNEL),
                fields.get(ROW_ID_CHANNEL),
                RunLengthEncodedBlock.create(BIGINT, writeId, positionCount),
                RunLengthEncodedBlock.create(hiveRowTypeNullsBlock, positionCount),
        };
        return new Page(blockArray);
    }

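    // Lazily creates the ORC writer for the insert delta file on the first insert.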
    private FileWriter getOrCreateInsertFileWriter()
    {
        if (insertFileWriter.isEmpty()) {
            insertFileWriter = orcFileWriterFactory.createFileWriter(
                    deltaDirectory.appendPath(bucketFilename),
                    ACID_COLUMN_NAMES,
                    ORC.toStorageFormat(),
                    compressionCodec,
                    hiveAcidSchema,
                    session,
                    bucketNumber,
                    transaction,
                    true,
                    WriterKind.INSERT);
        }
        return getWriter(insertFileWriter);
    }

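    // Lazily creates the ORC writer for the delete delta file, wrapped in a sorting file writer so
    // that delete rows end up sorted by row id, as Hive ACID delete deltas expect.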
    private FileWriter getOrCreateDeleteFileWriter()
    {
        if (deleteFileWriter.isEmpty()) {
            Location deletePath = deleteDeltaDirectory.appendPath(bucketFilename);
            FileWriter writer = getWriter(orcFileWriterFactory.createFileWriter(
                    deletePath,
                    ACID_COLUMN_NAMES,
                    ORC.toStorageFormat(),
                    compressionCodec,
                    hiveAcidSchema,
                    session,
                    bucketNumber,
                    transaction,
                    true,
                    WriterKind.DELETE));
            deleteFileWriter = Optional.of(sortingFileWriterMaker.makeFileWriter(writer, deletePath));
        }
        return getWriter(deleteFileWriter);
    }

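    // Assigns consecutive row ids starting at the number of rows this writer has already inserted.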
    private static Block createRowIdBlock(int positionCount, int rowCounter)
    {
        long[] rowIds = new long[positionCount];
        for (int index = 0; index < positionCount; index++) {
            rowIds[index] = rowCounter;
            rowCounter++;
        }
        return new LongArrayBlock(positionCount, Optional.empty(), rowIds);
    }

    private static FileWriter getWriter(Optional<FileWriter> writer)
    {
        return writer.orElseThrow(() -> new IllegalArgumentException("writer is not present"));
    }
}