All downloads are free. Search and download functionality uses the official Maven repository.

io.trino.parquet.writer.ParquetWriters Maven / Gradle / Ivy

There is a newer version: 464
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.parquet.writer;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import io.trino.parquet.writer.valuewriter.BigintValueWriter;
import io.trino.parquet.writer.valuewriter.BinaryValueWriter;
import io.trino.parquet.writer.valuewriter.BooleanValueWriter;
import io.trino.parquet.writer.valuewriter.DateValueWriter;
import io.trino.parquet.writer.valuewriter.DoubleValueWriter;
import io.trino.parquet.writer.valuewriter.FixedLenByteArrayLongDecimalValueWriter;
import io.trino.parquet.writer.valuewriter.FixedLenByteArrayShortDecimalValueWriter;
import io.trino.parquet.writer.valuewriter.Int32ShortDecimalValueWriter;
import io.trino.parquet.writer.valuewriter.Int64ShortDecimalValueWriter;
import io.trino.parquet.writer.valuewriter.Int96TimestampValueWriter;
import io.trino.parquet.writer.valuewriter.IntegerValueWriter;
import io.trino.parquet.writer.valuewriter.PrimitiveValueWriter;
import io.trino.parquet.writer.valuewriter.RealValueWriter;
import io.trino.parquet.writer.valuewriter.TimeMicrosValueWriter;
import io.trino.parquet.writer.valuewriter.TimestampMillisValueWriter;
import io.trino.parquet.writer.valuewriter.TimestampNanosValueWriter;
import io.trino.parquet.writer.valuewriter.TimestampTzMicrosValueWriter;
import io.trino.parquet.writer.valuewriter.TimestampTzMillisValueWriter;
import io.trino.parquet.writer.valuewriter.TrinoValuesWriterFactory;
import io.trino.parquet.writer.valuewriter.UuidValueWriter;
import io.trino.spi.TrinoException;
import io.trino.spi.type.CharType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.Type;
import io.trino.spi.type.UuidType;
import io.trino.spi.type.VarbinaryType;
import io.trino.spi.type.VarcharType;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.format.CompressionCodec;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.TimeLogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.joda.time.DateTimeZone;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;

import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.parquet.writer.ParquetWriter.SUPPORTED_BLOOM_FILTER_TYPES;
import static io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter.newDefinitionLevelWriter;
import static io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter.newRepetitionLevelWriter;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.BooleanType.BOOLEAN;
import static io.trino.spi.type.DateType.DATE;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TimeType.TIME_MICROS;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MICROS;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS;
import static io.trino.spi.type.TimestampType.TIMESTAMP_NANOS;
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MICROS;
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS;
import static io.trino.spi.type.TinyintType.TINYINT;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;

final class ParquetWriters
{
    private static final int DEFAULT_DICTIONARY_PAGE_SIZE = 1024 * 1024;
    static final int BLOOM_FILTER_EXPECTED_ENTRIES = 100_000;

    private ParquetWriters() {}

    static PrimitiveValueWriter getValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType, Optional parquetTimeZone)
    {
        if (BOOLEAN.equals(type)) {
            return new BooleanValueWriter(valuesWriter, parquetType);
        }
        if (INTEGER.equals(type) || SMALLINT.equals(type) || TINYINT.equals(type)) {
            return new IntegerValueWriter(valuesWriter, type, parquetType);
        }
        if (BIGINT.equals(type)) {
            return new BigintValueWriter(valuesWriter, type, parquetType);
        }
        if (type instanceof DecimalType) {
            if (parquetType.getPrimitiveTypeName() == INT32) {
                return new Int32ShortDecimalValueWriter(valuesWriter, type, parquetType);
            }
            if (parquetType.getPrimitiveTypeName() == INT64) {
                return new Int64ShortDecimalValueWriter(valuesWriter, type, parquetType);
            }
            if (((DecimalType) type).isShort()) {
                return new FixedLenByteArrayShortDecimalValueWriter(valuesWriter, type, parquetType);
            }
            return new FixedLenByteArrayLongDecimalValueWriter(valuesWriter, type, parquetType);
        }
        if (DATE.equals(type)) {
            return new DateValueWriter(valuesWriter, parquetType);
        }
        if (TIME_MICROS.equals(type)) {
            verifyParquetType(type, parquetType, TimeLogicalTypeAnnotation.class, isTime(LogicalTypeAnnotation.TimeUnit.MICROS));
            return new TimeMicrosValueWriter(valuesWriter, parquetType);
        }
        if (type instanceof TimestampType) {
            if (parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96)) {
                checkArgument(parquetTimeZone.isPresent(), "parquetTimeZone must be provided for INT96 timestamps");
                return new Int96TimestampValueWriter(valuesWriter, type, parquetType, parquetTimeZone.get());
            }
            if (TIMESTAMP_MILLIS.equals(type)) {
                verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.MILLIS));
                return new TimestampMillisValueWriter(valuesWriter, type, parquetType);
            }
            if (TIMESTAMP_MICROS.equals(type)) {
                verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.MICROS));
                return new BigintValueWriter(valuesWriter, type, parquetType);
            }
            if (TIMESTAMP_NANOS.equals(type)) {
                verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.NANOS));
                return new TimestampNanosValueWriter(valuesWriter, type, parquetType);
            }
        }

        if (TIMESTAMP_TZ_MILLIS.equals(type)) {
            return new TimestampTzMillisValueWriter(valuesWriter, parquetType);
        }
        if (TIMESTAMP_TZ_MICROS.equals(type)) {
            return new TimestampTzMicrosValueWriter(valuesWriter, parquetType);
        }
        if (DOUBLE.equals(type)) {
            return new DoubleValueWriter(valuesWriter, parquetType);
        }
        if (REAL.equals(type)) {
            return new RealValueWriter(valuesWriter, parquetType);
        }
        if (type instanceof VarcharType || type instanceof CharType || type instanceof VarbinaryType) {
            // Binary writer is suitable also for char data, as UTF-8 encoding is used on both sides.
            return new BinaryValueWriter(valuesWriter, type, parquetType);
        }
        if (type instanceof UuidType) {
            return new UuidValueWriter(valuesWriter, parquetType);
        }
        throw new TrinoException(NOT_SUPPORTED, format("Unsupported type for Parquet writer: %s", type));
    }

    static List getColumnWriters(
            MessageType messageType,
            Map, Type> trinoTypes,
            CompressionCodec compressionCodec,
            ParquetWriterOptions writerOptions,
            Optional parquetTimeZone)
    {
        TrinoValuesWriterFactory valuesWriterFactory = new TrinoValuesWriterFactory(writerOptions.getMaxPageSize(), DEFAULT_DICTIONARY_PAGE_SIZE);
        WriteBuilder writeBuilder = new WriteBuilder(
                messageType,
                trinoTypes,
                valuesWriterFactory,
                compressionCodec,
                writerOptions,
                parquetTimeZone);
        ParquetTypeVisitor.visit(messageType, writeBuilder);
        return writeBuilder.build();
    }

    private static class WriteBuilder
            extends ParquetTypeVisitor
    {
        private final MessageType type;
        private final Map, Type> trinoTypes;
        private final TrinoValuesWriterFactory valuesWriterFactory;
        private final CompressionCodec compressionCodec;
        private final int maxPageSize;
        private final int pageValueCountLimit;
        private final Set bloomFilterColumns;
        private final Optional parquetTimeZone;
        private final ImmutableList.Builder builder = ImmutableList.builder();
        private final int maxBloomFilterSize;
        private final double bloomFilterFpp;

        WriteBuilder(
                MessageType messageType,
                Map, Type> trinoTypes,
                TrinoValuesWriterFactory valuesWriterFactory,
                CompressionCodec compressionCodec,
                ParquetWriterOptions writerOptions,
                Optional parquetTimeZone)
        {
            this.type = requireNonNull(messageType, "messageType is null");
            this.trinoTypes = requireNonNull(trinoTypes, "trinoTypes is null");
            this.valuesWriterFactory = requireNonNull(valuesWriterFactory, "valuesWriterFactory is null");
            this.compressionCodec = requireNonNull(compressionCodec, "compressionCodec is null");
            this.maxPageSize = writerOptions.getMaxPageSize();
            this.pageValueCountLimit = writerOptions.getMaxPageValueCount();
            this.maxBloomFilterSize = writerOptions.getMaxBloomFilterSize();
            this.bloomFilterColumns = requireNonNull(writerOptions.getBloomFilterColumns(), "bloomFilterColumns is null");
            this.bloomFilterFpp = writerOptions.getBLoomFilterFpp();
            this.parquetTimeZone = requireNonNull(parquetTimeZone, "parquetTimeZone is null");
        }

        List build()
        {
            return builder.build();
        }

        @Override
        public ColumnWriter message(MessageType message, List fields)
        {
            builder.addAll(fields);
            return super.message(message, fields);
        }

        @Override
        public ColumnWriter struct(GroupType struct, List fields)
        {
            String[] path = currentPath();
            int fieldDefinitionLevel = type.getMaxDefinitionLevel(path);
            return new StructColumnWriter(ImmutableList.copyOf(fields), fieldDefinitionLevel);
        }

        @Override
        public ColumnWriter list(GroupType array, ColumnWriter element)
        {
            String[] path = currentPath();
            int fieldDefinitionLevel = type.getMaxDefinitionLevel(path);
            int fieldRepetitionLevel = type.getMaxRepetitionLevel(path);
            return new ArrayColumnWriter(element, fieldDefinitionLevel, fieldRepetitionLevel);
        }

        @Override
        public ColumnWriter map(GroupType map, ColumnWriter key, ColumnWriter value)
        {
            String[] path = currentPath();
            int fieldDefinitionLevel = type.getMaxDefinitionLevel(path);
            int fieldRepetitionLevel = type.getMaxRepetitionLevel(path);
            return new MapColumnWriter(key, value, fieldDefinitionLevel, fieldRepetitionLevel);
        }

        @Override
        public ColumnWriter primitive(PrimitiveType primitive)
        {
            String[] path = currentPath();
            int fieldDefinitionLevel = type.getMaxDefinitionLevel(path);
            int fieldRepetitionLevel = type.getMaxRepetitionLevel(path);
            ColumnDescriptor columnDescriptor = new ColumnDescriptor(path, primitive, fieldRepetitionLevel, fieldDefinitionLevel);
            Type trinoType = requireNonNull(trinoTypes.get(ImmutableList.copyOf(path)), "Trino type is null");
            Optional bloomFilter = createBloomFilter(bloomFilterColumns, maxBloomFilterSize, bloomFilterFpp, columnDescriptor, trinoType);
            return new PrimitiveColumnWriter(
                    columnDescriptor,
                    getValueWriter(valuesWriterFactory.newValuesWriter(columnDescriptor, bloomFilter), trinoType, columnDescriptor.getPrimitiveType(), parquetTimeZone),
                    newDefinitionLevelWriter(columnDescriptor, maxPageSize),
                    newRepetitionLevelWriter(columnDescriptor, maxPageSize),
                    compressionCodec,
                    maxPageSize,
                    pageValueCountLimit,
                    bloomFilter);
        }

        private String[] currentPath()
        {
            String[] path = new String[fieldNames.size()];
            if (!fieldNames.isEmpty()) {
                Iterator iter = fieldNames.descendingIterator();
                for (int i = 0; iter.hasNext(); i += 1) {
                    path[i] = iter.next();
                }
            }
            return path;
        }

        private static Optional createBloomFilter(Set bloomFilterColumns, int maxBloomFilterSize, double bloomFilterFpp, ColumnDescriptor columnDescriptor, Type colummType)
        {
            if (!SUPPORTED_BLOOM_FILTER_TYPES.contains(colummType)) {
                return Optional.empty();
            }
            // TODO: Enable use of AdaptiveBlockSplitBloomFilter once parquet-mr 1.14.0 is released
            String dotPath = Joiner.on('.').join(columnDescriptor.getPath());
            if (bloomFilterColumns.contains(dotPath)) {
                int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(BLOOM_FILTER_EXPECTED_ENTRIES, bloomFilterFpp);
                return Optional.of(new BlockSplitBloomFilter(optimalNumOfBits / 8, maxBloomFilterSize));
            }
            return Optional.empty();
        }
    }

    private static  void verifyParquetType(Type type, PrimitiveType parquetType, Class annotationType, Predicate predicate)
    {
        checkArgument(
                annotationType.isInstance(parquetType.getLogicalTypeAnnotation()) &&
                        predicate.test(annotationType.cast(parquetType.getLogicalTypeAnnotation())),
                "Wrong Parquet type '%s' for Trino type '%s'", parquetType, type);
    }

    private static Predicate isTime(LogicalTypeAnnotation.TimeUnit precision)
    {
        requireNonNull(precision, "precision is null");
        return annotation -> annotation.getUnit() == precision &&
                // isAdjustedToUTC=false indicates Local semantics (timestamps not normalized to UTC)
                !annotation.isAdjustedToUTC();
    }

    private static Predicate isTimestamp(LogicalTypeAnnotation.TimeUnit precision)
    {
        requireNonNull(precision, "precision is null");
        return annotation -> annotation.getUnit() == precision &&
                // isAdjustedToUTC=false indicates Local semantics (timestamps not normalized to UTC)
                !annotation.isAdjustedToUTC();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy