/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.deltalake;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.Multimap;
import io.trino.filesystem.Location;
import io.trino.parquet.ParquetDataSourceId;
import io.trino.parquet.reader.MetadataReader;
import io.trino.plugin.deltalake.DataFileInfo.DataFileType;
import io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeJsonFileStatistics;
import io.trino.plugin.hive.FileWriter;
import io.trino.plugin.hive.parquet.ParquetFileWriter;
import io.trino.spi.Page;
import io.trino.spi.block.ArrayBlock;
import io.trino.spi.block.Block;
import io.trino.spi.block.ColumnarArray;
import io.trino.spi.block.ColumnarMap;
import io.trino.spi.block.DictionaryBlock;
import io.trino.spi.block.LazyBlock;
import io.trino.spi.block.LazyBlockLoader;
import io.trino.spi.block.LongArrayBlock;
import io.trino.spi.block.RowBlock;
import io.trino.spi.block.RunLengthEncodedBlock;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.MapType;
import io.trino.spi.type.RowType;
import io.trino.spi.type.TimestampWithTimeZoneType;
import io.trino.spi.type.Type;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

import java.io.Closeable;
import java.io.IOException;
import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;

import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.Iterables.getOnlyElement;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeParquetStatisticsUtils.hasInvalidStatistics;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeParquetStatisticsUtils.jsonEncodeMax;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeParquetStatisticsUtils.jsonEncodeMin;
import static io.trino.spi.block.ColumnarArray.toColumnarArray;
import static io.trino.spi.block.ColumnarMap.toColumnarMap;
import static io.trino.spi.type.DateTimeEncoding.unpackMillisUtc;
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS;
import static java.util.Locale.ENGLISH;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.function.UnaryOperator.identity;

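/**
 * A {@link FileWriter} for Delta Lake tables: delegates the Parquet writing to a {@link ParquetFileWriter},
 * coerces incoming blocks where the engine and file representations differ (timestamps with time zone,
 * and container types nesting them), and gathers the per-file metadata and column statistics that are
 * later recorded in the table's transaction log.
 */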
public final class DeltaLakeWriter
        implements FileWriter
{
    private final ParquetFileWriter fileWriter;
    private final Location rootTableLocation;
    private final String relativeFilePath;
    private final List<String> partitionValues;
    private final DeltaLakeWriterStats stats;
    private final long creationTime;
    private final Map<Integer, Function<Block, Block>> coercers;
    private final List<DeltaLakeColumnHandle> columnHandles;
    private final DataFileType dataFileType;

    private long rowCount;
    private long inputSizeInBytes;

    public DeltaLakeWriter(
            ParquetFileWriter fileWriter,
            Location rootTableLocation,
            String relativeFilePath,
            List<String> partitionValues,
            DeltaLakeWriterStats stats,
            List<DeltaLakeColumnHandle> columnHandles,
            DataFileType dataFileType)
    {
        this.fileWriter = requireNonNull(fileWriter, "fileWriter is null");
        this.rootTableLocation = requireNonNull(rootTableLocation, "rootTableLocation is null");
        this.relativeFilePath = requireNonNull(relativeFilePath, "relativeFilePath is null");
        this.partitionValues = partitionValues;
        this.stats = stats;
        this.creationTime = Instant.now().toEpochMilli();
        this.columnHandles = requireNonNull(columnHandles, "columnHandles is null");

        ImmutableMap.Builder<Integer, Function<Block, Block>> coercers = ImmutableMap.builder();
        for (int i = 0; i < columnHandles.size(); i++) {
            Optional<Function<Block, Block>> coercer = createCoercer(columnHandles.get(i).getBaseType());
            if (coercer.isPresent()) {
                coercers.put(i, coercer.get());
            }
        }
        this.coercers = coercers.buildOrThrow();
        this.dataFileType = requireNonNull(dataFileType, "dataFileType is null");
    }

    @Override
    public long getWrittenBytes()
    {
        return fileWriter.getWrittenBytes();
    }

    @Override
    public long getMemoryUsage()
    {
        return fileWriter.getMemoryUsage();
    }

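    // Columns with a registered coercer are wrapped in LazyBlocks so the conversion only runs
    // when the underlying ParquetFileWriter actually loads the block.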
    @Override
    public void appendRows(Page originalPage)
    {
        Page page = originalPage;
        if (!coercers.isEmpty()) {
            Block[] translatedBlocks = new Block[originalPage.getChannelCount()];
            for (int index = 0; index < translatedBlocks.length; index++) {
                Block originalBlock = originalPage.getBlock(index);
                Function<Block, Block> coercer = coercers.get(index);
                if (coercer != null) {
                    translatedBlocks[index] = new LazyBlock(
                            originalBlock.getPositionCount(),
                            new CoercionLazyBlockLoader(originalBlock, coercer));
                }
                else {
                    translatedBlocks[index] = originalBlock;
                }
            }
            page = new Page(originalPage.getPositionCount(), translatedBlocks);
        }

        stats.addInputPageSizesInBytes(page.getRetainedSizeInBytes());
        fileWriter.appendRows(page);
        rowCount += page.getPositionCount();
        inputSizeInBytes += page.getSizeInBytes();
    }

    @Override
    public Closeable commit()
    {
        return fileWriter.commit();
    }

    @Override
    public void rollback()
    {
        fileWriter.rollback();
    }

    @Override
    public long getValidationCpuNanos()
    {
        return 0;
    }

    public long getRowCount()
    {
        return rowCount;
    }

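    // Describes the written file (relative path, size, creation time, partition values, statistics)
    // so it can be recorded as an "add" entry in the Delta Lake transaction log.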
    public DataFileInfo getDataFileInfo()
            throws IOException
    {
        Map<String, Type> dataColumnTypes = columnHandles.stream()
                // Lowercase because the subsequent logic expects lowercase
                .collect(toImmutableMap(column -> column.getBasePhysicalColumnName().toLowerCase(ENGLISH), DeltaLakeColumnHandle::getBasePhysicalType));
        return new DataFileInfo(
                relativeFilePath,
                getWrittenBytes(),
                creationTime,
                dataFileType,
                partitionValues,
                readStatistics(fileWriter.getFileMetadata(), rootTableLocation.appendPath(relativeFilePath), dataColumnTypes, rowCount));
    }

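    // Converts the writer's in-memory Parquet footer into ParquetMetadata and groups column chunk
    // metadata by column name; only top-level (non-nested) columns contribute statistics.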
    private static DeltaLakeJsonFileStatistics readStatistics(FileMetaData fileMetaData, Location path, Map<String, Type> typeForColumn, long rowCount)
            throws IOException
    {
        ParquetMetadata parquetMetadata = MetadataReader.createParquetMetadata(fileMetaData, new ParquetDataSourceId(path.toString()));

        ImmutableMultimap.Builder<String, ColumnChunkMetaData> metadataForColumn = ImmutableMultimap.builder();
        for (BlockMetaData blockMetaData : parquetMetadata.getBlocks()) {
            for (ColumnChunkMetaData columnChunkMetaData : blockMetaData.getColumns()) {
                if (columnChunkMetaData.getPath().size() != 1) {
                    continue; // Only base column stats are supported
                }
                String columnName = getOnlyElement(columnChunkMetaData.getPath());
                metadataForColumn.put(columnName, columnChunkMetaData);
            }
        }

        return mergeStats(metadataForColumn.build(), typeForColumn, rowCount);
    }

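    // Merges per-row-group column chunk statistics into a single DeltaLakeJsonFileStatistics:
    // min/max values are JSON-encoded per column and null counts come from the merged statistics;
    // columns with invalid statistics are excluded.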
    @VisibleForTesting
    static DeltaLakeJsonFileStatistics mergeStats(Multimap<String, ColumnChunkMetaData> metadataForColumn, Map<String, Type> typeForColumn, long rowCount)
    {
        Map<String, Optional<Statistics<?>>> statsForColumn = metadataForColumn.keySet().stream()
                .collect(toImmutableMap(identity(), key -> mergeMetadataList(metadataForColumn.get(key))));

        Map<String, Object> nullCount = statsForColumn.entrySet().stream()
                .filter(entry -> entry.getValue().isPresent())
                .collect(toImmutableMap(Map.Entry::getKey, entry -> entry.getValue().get().getNumNulls()));

        return new DeltaLakeJsonFileStatistics(
                Optional.of(rowCount),
                Optional.of(jsonEncodeMin(statsForColumn, typeForColumn)),
                Optional.of(jsonEncodeMax(statsForColumn, typeForColumn)),
                Optional.of(nullCount));
    }

    private static Optional<Statistics<?>> mergeMetadataList(Collection<ColumnChunkMetaData> metadataList)
    {
        if (hasInvalidStatistics(metadataList)) {
            return Optional.empty();
        }

        return metadataList.stream()
                .<Statistics<?>>map(ColumnChunkMetaData::getStatistics)
                .reduce((statsA, statsB) -> {
                    statsA.mergeStatistics(statsB);
                    return statsA;
                });
    }

    @Override
    public String toString()
    {
        return toStringHelper(this)
                .add("fileWriter", fileWriter)
                .add("relativeFilePath", relativeFilePath)
                .add("partitionValues", partitionValues)
                .add("creationTime", creationTime)
                .add("rowCount", rowCount)
                .add("inputSizeInBytes", inputSizeInBytes)
                .toString();
    }

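    // Timestamps with time zone are rewritten to epoch microseconds; array, map and row types are
    // handled recursively so that nested timestamp fields are coerced as well.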
    private static Optional<Function<Block, Block>> createCoercer(Type type)
    {
        if (type instanceof ArrayType arrayType) {
            return createCoercer(arrayType.getElementType()).map(ArrayCoercer::new);
        }
        if (type instanceof MapType mapType) {
            return Optional.of(new MapCoercer(mapType));
        }
        if (type instanceof RowType rowType) {
            return Optional.of(new RowCoercer(rowType));
        }
        if (type instanceof TimestampWithTimeZoneType) {
            return Optional.of(new TimestampCoercer());
        }
        return Optional.empty();
    }

    private static class ArrayCoercer
            implements Function<Block, Block>
    {
        private final Function<Block, Block> elementCoercer;

        public ArrayCoercer(Function<Block, Block> elementCoercer)
        {
            this.elementCoercer = requireNonNull(elementCoercer, "elementCoercer is null");
        }

        @Override
        public Block apply(Block block)
        {
            ColumnarArray arrayBlock = toColumnarArray(block);
            Block elementsBlock = elementCoercer.apply(arrayBlock.getElementsBlock());
            boolean[] valueIsNull = new boolean[arrayBlock.getPositionCount()];
            int[] offsets = new int[arrayBlock.getPositionCount() + 1];
            for (int i = 0; i < arrayBlock.getPositionCount(); i++) {
                valueIsNull[i] = arrayBlock.isNull(i);
                offsets[i + 1] = offsets[i] + arrayBlock.getLength(i);
            }
            return ArrayBlock.fromElementBlock(arrayBlock.getPositionCount(), Optional.of(valueIsNull), offsets, elementsBlock);
        }
    }

    private static class MapCoercer
            implements Function<Block, Block>
    {
        private final MapType mapType;
        private final Optional<Function<Block, Block>> keyCoercer;
        private final Optional<Function<Block, Block>> valueCoercer;

        public MapCoercer(MapType mapType)
        {
            this.mapType = requireNonNull(mapType, "mapType is null");
            keyCoercer = createCoercer(mapType.getKeyType());
            valueCoercer = createCoercer(mapType.getValueType());
        }

        @Override
        public Block apply(Block block)
        {
            ColumnarMap mapBlock = toColumnarMap(block);
            Block keysBlock = keyCoercer.isEmpty() ? mapBlock.getKeysBlock() : keyCoercer.get().apply(mapBlock.getKeysBlock());
            Block valuesBlock = valueCoercer.isEmpty() ? mapBlock.getValuesBlock() : valueCoercer.get().apply(mapBlock.getValuesBlock());
            boolean[] valueIsNull = new boolean[mapBlock.getPositionCount()];
            int[] offsets = new int[mapBlock.getPositionCount() + 1];
            for (int i = 0; i < mapBlock.getPositionCount(); i++) {
                valueIsNull[i] = mapBlock.isNull(i);
                offsets[i + 1] = offsets[i] + mapBlock.getEntryCount(i);
            }
            return mapType.createBlockFromKeyValue(Optional.of(valueIsNull), offsets, keysBlock, valuesBlock);
        }
    }

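    // Coerces the fields of a row block. Run-length encoded row blocks keep their encoding;
    // dictionary encoded row blocks are flattened by projecting each field through the dictionary
    // before coercing the fields.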
    private static class RowCoercer
            implements Function<Block, Block>
    {
        private final List<Optional<Function<Block, Block>>> fieldCoercers;

        public RowCoercer(RowType rowType)
        {
            fieldCoercers = rowType.getTypeParameters().stream()
                    .map(DeltaLakeWriter::createCoercer)
                    .collect(toImmutableList());
        }

        @Override
        public Block apply(Block block)
        {
            block = block.getLoadedBlock();

            if (block instanceof RunLengthEncodedBlock runLengthEncodedBlock) {
                RowBlock rowBlock = (RowBlock) runLengthEncodedBlock.getValue();
                RowBlock newRowBlock = RowBlock.fromNotNullSuppressedFieldBlocks(
                        1,
                        rowBlock.isNull(0) ? Optional.of(new boolean[]{true}) : Optional.empty(),
                        coerceFields(rowBlock.getFieldBlocks()));
                return RunLengthEncodedBlock.create(newRowBlock, runLengthEncodedBlock.getPositionCount());
            }
            if (block instanceof DictionaryBlock dictionaryBlock) {
                RowBlock rowBlock = (RowBlock) dictionaryBlock.getDictionary();
                List<Block> fieldBlocks = rowBlock.getFieldBlocks().stream()
                        .map(dictionaryBlock::createProjection)
                        .toList();
                return RowBlock.fromNotNullSuppressedFieldBlocks(
                        dictionaryBlock.getPositionCount(),
                        getNulls(dictionaryBlock),
                        coerceFields(fieldBlocks));
            }
            RowBlock rowBlock = (RowBlock) block;
            return RowBlock.fromNotNullSuppressedFieldBlocks(
                    rowBlock.getPositionCount(),
                    getNulls(rowBlock),
                    coerceFields(rowBlock.getFieldBlocks()));
        }

        private static Optional<boolean[]> getNulls(Block rowBlock)
        {
            if (!rowBlock.mayHaveNull()) {
                return Optional.empty();
            }

            boolean[] valueIsNull = new boolean[rowBlock.getPositionCount()];
            for (int i = 0; i < rowBlock.getPositionCount(); i++) {
                valueIsNull[i] = rowBlock.isNull(i);
            }
            return Optional.of(valueIsNull);
        }

        private Block[] coerceFields(List<Block> fields)
        {
            checkArgument(fields.size() == fieldCoercers.size());
            Block[] newFields = new Block[fieldCoercers.size()];
            for (int i = 0; i < fieldCoercers.size(); i++) {
                Optional<Function<Block, Block>> coercer = fieldCoercers.get(i);
                Block fieldBlock = fields.get(i);
                if (coercer.isPresent()) {
                    newFields[i] = coercer.get().apply(fieldBlock);
                }
                else {
                    newFields[i] = fieldBlock;
                }
            }
            return newFields;
        }
    }

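    // Converts TIMESTAMP_TZ_MILLIS values (packed UTC millis plus zone key) into plain epoch
    // microseconds in a LongArrayBlock; the zone information is dropped since values are already UTC.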
    private static class TimestampCoercer
            implements Function<Block, Block>
    {
        @Override
        public Block apply(Block block)
        {
            int positionCount = block.getPositionCount();
            long[] values = new long[positionCount];
            boolean mayHaveNulls = block.mayHaveNull();
            boolean[] valueIsNull = mayHaveNulls ? new boolean[positionCount] : null;

            for (int position = 0; position < positionCount; position++) {
                if (mayHaveNulls && block.isNull(position)) {
                    valueIsNull[position] = true;
                    continue;
                }
                values[position] = MILLISECONDS.toMicros(unpackMillisUtc(TIMESTAMP_TZ_MILLIS.getLong(block, position)));
            }
            return new LongArrayBlock(positionCount, Optional.ofNullable(valueIsNull), values);
        }
    }

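    // Defers coercion until the block is first loaded, then releases the reference to the source
    // block so it can be garbage collected.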
    private static final class CoercionLazyBlockLoader
            implements LazyBlockLoader
    {
        private final Function<Block, Block> coercer;
        private Block block;

        public CoercionLazyBlockLoader(Block block, Function<Block, Block> coercer)
        {
            this.block = requireNonNull(block, "block is null");
            this.coercer = requireNonNull(coercer, "coercer is null");
        }

        @Override
        public Block load()
        {
            checkState(block != null, "Already loaded");

            Block loaded = coercer.apply(block.getLoadedBlock());
            // clear reference to loader to free resources, since load was successful
            block = null;

            return loaded;
        }
    }
}