io.trino.parquet.writer.ParquetWriters Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of trino-parquet Show documentation
Trino - Parquet file format support
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.parquet.writer;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import io.trino.parquet.writer.valuewriter.BigintValueWriter;
import io.trino.parquet.writer.valuewriter.BinaryValueWriter;
import io.trino.parquet.writer.valuewriter.BooleanValueWriter;
import io.trino.parquet.writer.valuewriter.DateValueWriter;
import io.trino.parquet.writer.valuewriter.DoubleValueWriter;
import io.trino.parquet.writer.valuewriter.FixedLenByteArrayLongDecimalValueWriter;
import io.trino.parquet.writer.valuewriter.FixedLenByteArrayShortDecimalValueWriter;
import io.trino.parquet.writer.valuewriter.Int32ShortDecimalValueWriter;
import io.trino.parquet.writer.valuewriter.Int64ShortDecimalValueWriter;
import io.trino.parquet.writer.valuewriter.Int96TimestampValueWriter;
import io.trino.parquet.writer.valuewriter.IntegerValueWriter;
import io.trino.parquet.writer.valuewriter.PrimitiveValueWriter;
import io.trino.parquet.writer.valuewriter.RealValueWriter;
import io.trino.parquet.writer.valuewriter.TimeMicrosValueWriter;
import io.trino.parquet.writer.valuewriter.TimestampMillisValueWriter;
import io.trino.parquet.writer.valuewriter.TimestampNanosValueWriter;
import io.trino.parquet.writer.valuewriter.TimestampTzMicrosValueWriter;
import io.trino.parquet.writer.valuewriter.TimestampTzMillisValueWriter;
import io.trino.parquet.writer.valuewriter.TrinoValuesWriterFactory;
import io.trino.parquet.writer.valuewriter.UuidValueWriter;
import io.trino.spi.TrinoException;
import io.trino.spi.type.CharType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.Type;
import io.trino.spi.type.UuidType;
import io.trino.spi.type.VarbinaryType;
import io.trino.spi.type.VarcharType;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.format.CompressionCodec;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.TimeLogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.joda.time.DateTimeZone;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.parquet.writer.ParquetWriter.SUPPORTED_BLOOM_FILTER_TYPES;
import static io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter.newDefinitionLevelWriter;
import static io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter.newRepetitionLevelWriter;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.BooleanType.BOOLEAN;
import static io.trino.spi.type.DateType.DATE;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TimeType.TIME_MICROS;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MICROS;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS;
import static io.trino.spi.type.TimestampType.TIMESTAMP_NANOS;
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MICROS;
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS;
import static io.trino.spi.type.TinyintType.TINYINT;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
final class ParquetWriters
{
private static final int DEFAULT_DICTIONARY_PAGE_SIZE = 1024 * 1024;
static final int BLOOM_FILTER_EXPECTED_ENTRIES = 100_000;
private ParquetWriters() {}
static PrimitiveValueWriter getValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType, Optional parquetTimeZone)
{
if (BOOLEAN.equals(type)) {
return new BooleanValueWriter(valuesWriter, parquetType);
}
if (INTEGER.equals(type) || SMALLINT.equals(type) || TINYINT.equals(type)) {
return new IntegerValueWriter(valuesWriter, type, parquetType);
}
if (BIGINT.equals(type)) {
return new BigintValueWriter(valuesWriter, type, parquetType);
}
if (type instanceof DecimalType) {
if (parquetType.getPrimitiveTypeName() == INT32) {
return new Int32ShortDecimalValueWriter(valuesWriter, type, parquetType);
}
if (parquetType.getPrimitiveTypeName() == INT64) {
return new Int64ShortDecimalValueWriter(valuesWriter, type, parquetType);
}
if (((DecimalType) type).isShort()) {
return new FixedLenByteArrayShortDecimalValueWriter(valuesWriter, type, parquetType);
}
return new FixedLenByteArrayLongDecimalValueWriter(valuesWriter, type, parquetType);
}
if (DATE.equals(type)) {
return new DateValueWriter(valuesWriter, parquetType);
}
if (TIME_MICROS.equals(type)) {
verifyParquetType(type, parquetType, TimeLogicalTypeAnnotation.class, isTime(LogicalTypeAnnotation.TimeUnit.MICROS));
return new TimeMicrosValueWriter(valuesWriter, parquetType);
}
if (type instanceof TimestampType) {
if (parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96)) {
checkArgument(parquetTimeZone.isPresent(), "parquetTimeZone must be provided for INT96 timestamps");
return new Int96TimestampValueWriter(valuesWriter, type, parquetType, parquetTimeZone.get());
}
if (TIMESTAMP_MILLIS.equals(type)) {
verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.MILLIS));
return new TimestampMillisValueWriter(valuesWriter, type, parquetType);
}
if (TIMESTAMP_MICROS.equals(type)) {
verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.MICROS));
return new BigintValueWriter(valuesWriter, type, parquetType);
}
if (TIMESTAMP_NANOS.equals(type)) {
verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.NANOS));
return new TimestampNanosValueWriter(valuesWriter, type, parquetType);
}
}
if (TIMESTAMP_TZ_MILLIS.equals(type)) {
return new TimestampTzMillisValueWriter(valuesWriter, parquetType);
}
if (TIMESTAMP_TZ_MICROS.equals(type)) {
return new TimestampTzMicrosValueWriter(valuesWriter, parquetType);
}
if (DOUBLE.equals(type)) {
return new DoubleValueWriter(valuesWriter, parquetType);
}
if (REAL.equals(type)) {
return new RealValueWriter(valuesWriter, parquetType);
}
if (type instanceof VarcharType || type instanceof CharType || type instanceof VarbinaryType) {
// Binary writer is suitable also for char data, as UTF-8 encoding is used on both sides.
return new BinaryValueWriter(valuesWriter, type, parquetType);
}
if (type instanceof UuidType) {
return new UuidValueWriter(valuesWriter, parquetType);
}
throw new TrinoException(NOT_SUPPORTED, format("Unsupported type for Parquet writer: %s", type));
}
static List getColumnWriters(
MessageType messageType,
Map, Type> trinoTypes,
CompressionCodec compressionCodec,
ParquetWriterOptions writerOptions,
Optional parquetTimeZone)
{
TrinoValuesWriterFactory valuesWriterFactory = new TrinoValuesWriterFactory(writerOptions.getMaxPageSize(), DEFAULT_DICTIONARY_PAGE_SIZE);
WriteBuilder writeBuilder = new WriteBuilder(
messageType,
trinoTypes,
valuesWriterFactory,
compressionCodec,
writerOptions,
parquetTimeZone);
ParquetTypeVisitor.visit(messageType, writeBuilder);
return writeBuilder.build();
}
private static class WriteBuilder
extends ParquetTypeVisitor
{
private final MessageType type;
private final Map, Type> trinoTypes;
private final TrinoValuesWriterFactory valuesWriterFactory;
private final CompressionCodec compressionCodec;
private final int maxPageSize;
private final int pageValueCountLimit;
private final Set bloomFilterColumns;
private final Optional parquetTimeZone;
private final ImmutableList.Builder builder = ImmutableList.builder();
private final int maxBloomFilterSize;
private final double bloomFilterFpp;
WriteBuilder(
MessageType messageType,
Map, Type> trinoTypes,
TrinoValuesWriterFactory valuesWriterFactory,
CompressionCodec compressionCodec,
ParquetWriterOptions writerOptions,
Optional parquetTimeZone)
{
this.type = requireNonNull(messageType, "messageType is null");
this.trinoTypes = requireNonNull(trinoTypes, "trinoTypes is null");
this.valuesWriterFactory = requireNonNull(valuesWriterFactory, "valuesWriterFactory is null");
this.compressionCodec = requireNonNull(compressionCodec, "compressionCodec is null");
this.maxPageSize = writerOptions.getMaxPageSize();
this.pageValueCountLimit = writerOptions.getMaxPageValueCount();
this.maxBloomFilterSize = writerOptions.getMaxBloomFilterSize();
this.bloomFilterColumns = requireNonNull(writerOptions.getBloomFilterColumns(), "bloomFilterColumns is null");
this.bloomFilterFpp = writerOptions.getBLoomFilterFpp();
this.parquetTimeZone = requireNonNull(parquetTimeZone, "parquetTimeZone is null");
}
List build()
{
return builder.build();
}
@Override
public ColumnWriter message(MessageType message, List fields)
{
builder.addAll(fields);
return super.message(message, fields);
}
@Override
public ColumnWriter struct(GroupType struct, List fields)
{
String[] path = currentPath();
int fieldDefinitionLevel = type.getMaxDefinitionLevel(path);
return new StructColumnWriter(ImmutableList.copyOf(fields), fieldDefinitionLevel);
}
@Override
public ColumnWriter list(GroupType array, ColumnWriter element)
{
String[] path = currentPath();
int fieldDefinitionLevel = type.getMaxDefinitionLevel(path);
int fieldRepetitionLevel = type.getMaxRepetitionLevel(path);
return new ArrayColumnWriter(element, fieldDefinitionLevel, fieldRepetitionLevel);
}
@Override
public ColumnWriter map(GroupType map, ColumnWriter key, ColumnWriter value)
{
String[] path = currentPath();
int fieldDefinitionLevel = type.getMaxDefinitionLevel(path);
int fieldRepetitionLevel = type.getMaxRepetitionLevel(path);
return new MapColumnWriter(key, value, fieldDefinitionLevel, fieldRepetitionLevel);
}
@Override
public ColumnWriter primitive(PrimitiveType primitive)
{
String[] path = currentPath();
int fieldDefinitionLevel = type.getMaxDefinitionLevel(path);
int fieldRepetitionLevel = type.getMaxRepetitionLevel(path);
ColumnDescriptor columnDescriptor = new ColumnDescriptor(path, primitive, fieldRepetitionLevel, fieldDefinitionLevel);
Type trinoType = requireNonNull(trinoTypes.get(ImmutableList.copyOf(path)), "Trino type is null");
Optional bloomFilter = createBloomFilter(bloomFilterColumns, maxBloomFilterSize, bloomFilterFpp, columnDescriptor, trinoType);
return new PrimitiveColumnWriter(
columnDescriptor,
getValueWriter(valuesWriterFactory.newValuesWriter(columnDescriptor, bloomFilter), trinoType, columnDescriptor.getPrimitiveType(), parquetTimeZone),
newDefinitionLevelWriter(columnDescriptor, maxPageSize),
newRepetitionLevelWriter(columnDescriptor, maxPageSize),
compressionCodec,
maxPageSize,
pageValueCountLimit,
bloomFilter);
}
private String[] currentPath()
{
String[] path = new String[fieldNames.size()];
if (!fieldNames.isEmpty()) {
Iterator iter = fieldNames.descendingIterator();
for (int i = 0; iter.hasNext(); i += 1) {
path[i] = iter.next();
}
}
return path;
}
private static Optional createBloomFilter(Set bloomFilterColumns, int maxBloomFilterSize, double bloomFilterFpp, ColumnDescriptor columnDescriptor, Type colummType)
{
if (!SUPPORTED_BLOOM_FILTER_TYPES.contains(colummType)) {
return Optional.empty();
}
// TODO: Enable use of AdaptiveBlockSplitBloomFilter once parquet-mr 1.14.0 is released
String dotPath = Joiner.on('.').join(columnDescriptor.getPath());
if (bloomFilterColumns.contains(dotPath)) {
int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(BLOOM_FILTER_EXPECTED_ENTRIES, bloomFilterFpp);
return Optional.of(new BlockSplitBloomFilter(optimalNumOfBits / 8, maxBloomFilterSize));
}
return Optional.empty();
}
}
private static void verifyParquetType(Type type, PrimitiveType parquetType, Class annotationType, Predicate predicate)
{
checkArgument(
annotationType.isInstance(parquetType.getLogicalTypeAnnotation()) &&
predicate.test(annotationType.cast(parquetType.getLogicalTypeAnnotation())),
"Wrong Parquet type '%s' for Trino type '%s'", parquetType, type);
}
private static Predicate isTime(LogicalTypeAnnotation.TimeUnit precision)
{
requireNonNull(precision, "precision is null");
return annotation -> annotation.getUnit() == precision &&
// isAdjustedToUTC=false indicates Local semantics (timestamps not normalized to UTC)
!annotation.isAdjustedToUTC();
}
private static Predicate isTimestamp(LogicalTypeAnnotation.TimeUnit precision)
{
requireNonNull(precision, "precision is null");
return annotation -> annotation.getUnit() == precision &&
// isAdjustedToUTC=false indicates Local semantics (timestamps not normalized to UTC)
!annotation.isAdjustedToUTC();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy