All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.facebook.presto.orc.metadata.OrcMetadataWriter Maven / Gradle / Ivy

There is a newer version: 0.290
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc.metadata;

import com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind;
import com.facebook.presto.orc.metadata.OrcType.OrcTypeKind;
import com.facebook.presto.orc.metadata.Stream.StreamKind;
import com.facebook.presto.orc.metadata.statistics.ColumnStatistics;
import com.facebook.presto.orc.metadata.statistics.StripeStatistics;
import com.facebook.presto.orc.proto.OrcProto;
import com.facebook.presto.orc.proto.OrcProto.RowIndexEntry;
import com.facebook.presto.orc.proto.OrcProto.Type;
import com.facebook.presto.orc.proto.OrcProto.Type.Builder;
import com.facebook.presto.orc.proto.OrcProto.UserMetadataItem;
import com.facebook.presto.orc.protobuf.ByteString;
import com.facebook.presto.orc.protobuf.MessageLite;
import com.google.common.collect.ImmutableList;
import com.google.common.io.CountingOutputStream;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;

import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.lang.Math.toIntExact;
import static java.util.stream.Collectors.toList;

public class OrcMetadataWriter
        implements MetadataWriter
{
    // see https://github.com/prestodb/orc-protobuf/blob/master/src/main/protobuf/orc_proto.proto
    private static final int PRESTO_WRITER_ID = 2;
    // in order to change this value, the master Apache ORC proto file must be updated
    private static final int ORC_WRITER_VERSION = 6;
    private static final List ORC_METADATA_VERSION = ImmutableList.of(0, 12);

    @Override
    public List getOrcMetadataVersion()
    {
        return ORC_METADATA_VERSION;
    }

    @Override
    public int writePostscript(SliceOutput output,
            int footerLength,
            int metadataLength,
            CompressionKind compression,
            int compressionBlockSize,
            Optional dwrfStripeCacheData)
            throws IOException
    {
        OrcProto.PostScript postScriptProtobuf = OrcProto.PostScript.newBuilder()
                .addAllVersion(ORC_METADATA_VERSION)
                .setFooterLength(footerLength)
                .setMetadataLength(metadataLength)
                .setCompression(toCompression(compression))
                .setCompressionBlockSize(compressionBlockSize)
                .setWriterVersion(ORC_WRITER_VERSION)
                .build();

        return writeProtobufObject(output, postScriptProtobuf);
    }

    @Override
    public int writeDwrfStripeCache(SliceOutput output, Optional dwrfStripeCacheData)
    {
        return 0;
    }

    @Override
    public int writeMetadata(SliceOutput output, Metadata metadata)
            throws IOException
    {
        OrcProto.Metadata metadataProtobuf = OrcProto.Metadata.newBuilder()
                .addAllStripeStats(metadata.getStripeStatsList().stream()
                        .map(OrcMetadataWriter::toStripeStatistics)
                        .collect(toList()))
                .build();

        return writeProtobufObject(output, metadataProtobuf);
    }

    private static OrcProto.StripeStatistics toStripeStatistics(StripeStatistics stripeStatistics)
    {
        return OrcProto.StripeStatistics.newBuilder()
                .addAllColStats(stripeStatistics.getColumnStatistics().stream()
                        .map(OrcMetadataWriter::toColumnStatistics)
                        .collect(toList()))
                .build();
    }

    @Override
    public int writeFooter(SliceOutput output, Footer footer)
            throws IOException
    {
        OrcProto.Footer footerProtobuf = OrcProto.Footer.newBuilder()
                .setWriter(PRESTO_WRITER_ID)
                .setNumberOfRows(footer.getNumberOfRows())
                .setRowIndexStride(footer.getRowsInRowGroup())
                .addAllStripes(footer.getStripes().stream()
                        .map(OrcMetadataWriter::toStripeInformation)
                        .collect(toList()))
                .addAllTypes(footer.getTypes().stream()
                        .map(OrcMetadataWriter::toType)
                        .collect(toList()))
                .addAllStatistics(footer.getFileStats().stream()
                        .map(OrcMetadataWriter::toColumnStatistics)
                        .collect(toList()))
                .addAllMetadata(footer.getUserMetadata().entrySet().stream()
                        .map(OrcMetadataWriter::toUserMetadata)
                        .collect(toList()))
                .build();

        return writeProtobufObject(output, footerProtobuf);
    }

    private static OrcProto.StripeInformation toStripeInformation(StripeInformation stripe)
    {
        return OrcProto.StripeInformation.newBuilder()
                .setNumberOfRows(stripe.getNumberOfRows())
                .setOffset(stripe.getOffset())
                .setIndexLength(stripe.getIndexLength())
                .setDataLength(stripe.getDataLength())
                .setFooterLength(stripe.getFooterLength())
                .build();
    }

    private static Type toType(OrcType type)
    {
        Builder builder = Type.newBuilder()
                .setKind(toTypeKind(type.getOrcTypeKind()))
                .addAllSubtypes(type.getFieldTypeIndexes())
                .addAllFieldNames(type.getFieldNames())
                .addAllAttributes(toStringPairList(type.getAttributes()));

        if (type.getLength().isPresent()) {
            builder.setMaximumLength(type.getLength().get());
        }
        if (type.getPrecision().isPresent()) {
            builder.setPrecision(type.getPrecision().get());
        }
        if (type.getScale().isPresent()) {
            builder.setScale(type.getScale().get());
        }
        return builder.build();
    }

    private static OrcProto.Type.Kind toTypeKind(OrcTypeKind orcTypeKind)
    {
        switch (orcTypeKind) {
            case BOOLEAN:
                return OrcProto.Type.Kind.BOOLEAN;
            case BYTE:
                return OrcProto.Type.Kind.BYTE;
            case SHORT:
                return OrcProto.Type.Kind.SHORT;
            case INT:
                return OrcProto.Type.Kind.INT;
            case LONG:
                return OrcProto.Type.Kind.LONG;
            case DECIMAL:
                return OrcProto.Type.Kind.DECIMAL;
            case FLOAT:
                return OrcProto.Type.Kind.FLOAT;
            case DOUBLE:
                return OrcProto.Type.Kind.DOUBLE;
            case STRING:
                return OrcProto.Type.Kind.STRING;
            case VARCHAR:
                return OrcProto.Type.Kind.VARCHAR;
            case CHAR:
                return OrcProto.Type.Kind.CHAR;
            case BINARY:
                return OrcProto.Type.Kind.BINARY;
            case DATE:
                return OrcProto.Type.Kind.DATE;
            case TIMESTAMP:
                return OrcProto.Type.Kind.TIMESTAMP;
            case LIST:
                return OrcProto.Type.Kind.LIST;
            case MAP:
                return OrcProto.Type.Kind.MAP;
            case STRUCT:
                return OrcProto.Type.Kind.STRUCT;
            case UNION:
                return OrcProto.Type.Kind.UNION;
        }
        throw new IllegalArgumentException("Unsupported type: " + orcTypeKind);
    }

    private static List toStringPairList(Map attributes)
    {
        return attributes.entrySet().stream()
                .map(entry -> OrcProto.StringPair.newBuilder()
                        .setKey(entry.getKey())
                        .setValue(entry.getValue())
                        .build())
                .collect(toImmutableList());
    }

    private static OrcProto.ColumnStatistics toColumnStatistics(ColumnStatistics columnStatistics)
    {
        OrcProto.ColumnStatistics.Builder builder = OrcProto.ColumnStatistics.newBuilder();

        if (columnStatistics.hasNumberOfValues()) {
            builder.setNumberOfValues(columnStatistics.getNumberOfValues());
        }

        if (columnStatistics.getBooleanStatistics() != null) {
            builder.setBucketStatistics(OrcProto.BucketStatistics.newBuilder()
                    .addCount(columnStatistics.getBooleanStatistics().getTrueValueCount())
                    .build());
        }

        if (columnStatistics.getIntegerStatistics() != null) {
            OrcProto.IntegerStatistics.Builder integerStatistics = OrcProto.IntegerStatistics.newBuilder()
                    .setMinimum(columnStatistics.getIntegerStatistics().getMin())
                    .setMaximum(columnStatistics.getIntegerStatistics().getMax());
            if (columnStatistics.getIntegerStatistics().getSum() != null) {
                integerStatistics.setSum(columnStatistics.getIntegerStatistics().getSum());
            }
            builder.setIntStatistics(integerStatistics.build());
        }

        if (columnStatistics.getDoubleStatistics() != null) {
            builder.setDoubleStatistics(OrcProto.DoubleStatistics.newBuilder()
                    .setMinimum(columnStatistics.getDoubleStatistics().getMin())
                    .setMaximum(columnStatistics.getDoubleStatistics().getMax())
                    .build());
        }

        if (columnStatistics.getStringStatistics() != null) {
            OrcProto.StringStatistics.Builder statisticsBuilder = OrcProto.StringStatistics.newBuilder();
            if (columnStatistics.getStringStatistics().getMin() != null) {
                statisticsBuilder.setMinimumBytes(ByteString.copyFrom(columnStatistics.getStringStatistics().getMin().getBytes()));
            }
            if (columnStatistics.getStringStatistics().getMax() != null) {
                statisticsBuilder.setMaximumBytes(ByteString.copyFrom(columnStatistics.getStringStatistics().getMax().getBytes()));
            }
            statisticsBuilder.setSum(columnStatistics.getStringStatistics().getSum());
            builder.setStringStatistics(statisticsBuilder.build());
        }

        if (columnStatistics.getDateStatistics() != null) {
            builder.setDateStatistics(OrcProto.DateStatistics.newBuilder()
                    .setMinimum(columnStatistics.getDateStatistics().getMin())
                    .setMaximum(columnStatistics.getDateStatistics().getMax())
                    .build());
        }

        if (columnStatistics.getDecimalStatistics() != null) {
            builder.setDecimalStatistics(OrcProto.DecimalStatistics.newBuilder()
                    .setMinimum(columnStatistics.getDecimalStatistics().getMin().toString())
                    .setMaximum(columnStatistics.getDecimalStatistics().getMax().toString())
                    .build());
        }

        if (columnStatistics.getBinaryStatistics() != null) {
            builder.setBinaryStatistics(OrcProto.BinaryStatistics.newBuilder()
                    .setSum(columnStatistics.getBinaryStatistics().getSum())
                    .build());
        }

        return builder.build();
    }

    private static UserMetadataItem toUserMetadata(Entry entry)
    {
        return OrcProto.UserMetadataItem.newBuilder()
                .setName(entry.getKey())
                .setValue(ByteString.copyFrom(entry.getValue().getBytes()))
                .build();
    }

    @Override
    public int writeStripeFooter(SliceOutput output, StripeFooter footer)
            throws IOException
    {
        OrcProto.StripeFooter footerProtobuf = OrcProto.StripeFooter.newBuilder()
                .addAllStreams(footer.getStreams().stream()
                        .map(OrcMetadataWriter::toStream)
                        .collect(toList()))
                .addAllColumns(footer.getColumnEncodings().entrySet().stream()
                        .sorted(Entry.comparingByKey())
                        .map(entry -> toColumnEncoding(entry.getValue()))
                        .collect(toList()))
                .build();

        return writeProtobufObject(output, footerProtobuf);
    }

    private static OrcProto.Stream toStream(Stream stream)
    {
        return OrcProto.Stream.newBuilder()
                .setColumn(stream.getColumn())
                .setKind(toStreamKind(stream.getStreamKind()))
                .setLength(stream.getLength())
                .build();
    }

    private static OrcProto.Stream.Kind toStreamKind(StreamKind streamKind)
    {
        switch (streamKind) {
            case PRESENT:
                return OrcProto.Stream.Kind.PRESENT;
            case DATA:
                return OrcProto.Stream.Kind.DATA;
            case LENGTH:
                return OrcProto.Stream.Kind.LENGTH;
            case DICTIONARY_DATA:
                return OrcProto.Stream.Kind.DICTIONARY_DATA;
            case DICTIONARY_COUNT:
                return OrcProto.Stream.Kind.DICTIONARY_COUNT;
            case SECONDARY:
                return OrcProto.Stream.Kind.SECONDARY;
            case ROW_INDEX:
                return OrcProto.Stream.Kind.ROW_INDEX;
        }
        throw new IllegalArgumentException("Unsupported stream kind: " + streamKind);
    }

    private static OrcProto.ColumnEncoding toColumnEncoding(ColumnEncoding columnEncodings)
    {
        checkArgument(
                !columnEncodings.getAdditionalSequenceEncodings().isPresent(),
                "Writing columns with non-zero sequence IDs is not supported in ORC: " + columnEncodings);

        return OrcProto.ColumnEncoding.newBuilder()
                .setKind(toColumnEncoding(columnEncodings.getColumnEncodingKind()))
                .setDictionarySize(columnEncodings.getDictionarySize())
                .build();
    }

    private static OrcProto.ColumnEncoding.Kind toColumnEncoding(ColumnEncodingKind columnEncodingKind)
    {
        switch (columnEncodingKind) {
            case DIRECT:
                return OrcProto.ColumnEncoding.Kind.DIRECT;
            case DICTIONARY:
                return OrcProto.ColumnEncoding.Kind.DICTIONARY;
            case DIRECT_V2:
                return OrcProto.ColumnEncoding.Kind.DIRECT_V2;
            case DICTIONARY_V2:
                return OrcProto.ColumnEncoding.Kind.DICTIONARY_V2;
        }
        throw new IllegalArgumentException("Unsupported column encoding kind: " + columnEncodingKind);
    }

    @Override
    public int writeRowIndexes(SliceOutput output, List rowGroupIndexes)
            throws IOException
    {
        OrcProto.RowIndex rowIndexProtobuf = OrcProto.RowIndex.newBuilder()
                .addAllEntry(rowGroupIndexes.stream()
                        .map(OrcMetadataWriter::toRowGroupIndex)
                        .collect(toList()))
                .build();
        return writeProtobufObject(output, rowIndexProtobuf);
    }

    private static RowIndexEntry toRowGroupIndex(RowGroupIndex rowGroupIndex)
    {
        return OrcProto.RowIndexEntry.newBuilder()
                .addAllPositions(rowGroupIndex.getPositions().stream()
                        .map(Integer::longValue)
                        .collect(toList()))
                .setStatistics(toColumnStatistics(rowGroupIndex.getColumnStatistics()))
                .build();
    }

    private static OrcProto.CompressionKind toCompression(CompressionKind compressionKind)
    {
        switch (compressionKind) {
            case NONE:
                return OrcProto.CompressionKind.NONE;
            case ZLIB:
                return OrcProto.CompressionKind.ZLIB;
            case SNAPPY:
                return OrcProto.CompressionKind.SNAPPY;
            case LZ4:
                return OrcProto.CompressionKind.LZ4;
            case ZSTD:
                return OrcProto.CompressionKind.ZSTD;
        }
        throw new IllegalArgumentException("Unsupported compression kind: " + compressionKind);
    }

    private static int writeProtobufObject(OutputStream output, MessageLite object)
            throws IOException
    {
        CountingOutputStream countingOutput = new CountingOutputStream(output);
        object.writeTo(countingOutput);
        return toIntExact(countingOutput.getCount());
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy