
com.facebook.presto.orc.metadata.DwrfMetadataWriter Maven / Gradle / Ivy
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.orc.metadata;
import com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind;
import com.facebook.presto.orc.metadata.OrcType.OrcTypeKind;
import com.facebook.presto.orc.metadata.Stream.StreamKind;
import com.facebook.presto.orc.metadata.statistics.ColumnStatistics;
import com.facebook.presto.orc.metadata.statistics.DoubleStatistics;
import com.facebook.presto.orc.metadata.statistics.IntegerStatistics;
import com.facebook.presto.orc.metadata.statistics.MapStatisticsEntry;
import com.facebook.presto.orc.proto.DwrfProto;
import com.facebook.presto.orc.proto.DwrfProto.RowIndexEntry;
import com.facebook.presto.orc.proto.DwrfProto.Type;
import com.facebook.presto.orc.proto.DwrfProto.Type.Builder;
import com.facebook.presto.orc.proto.DwrfProto.UserMetadataItem;
import com.facebook.presto.orc.protobuf.ByteString;
import com.facebook.presto.orc.protobuf.MessageLite;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.CountingOutputStream;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.airlift.slice.Slices.utf8Slice;
import static java.lang.Math.toIntExact;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toList;
public class DwrfMetadataWriter
implements MetadataWriter
{
private static final int DWRF_WRITER_VERSION = 1;
public static final Map STATIC_METADATA = ImmutableMap.builder()
.put("orc.writer.name", utf8Slice("presto"))
.put("orc.writer.version", utf8Slice(String.valueOf(DWRF_WRITER_VERSION)))
.build();
@Override
public List getOrcMetadataVersion()
{
// DWRF does not have a version field
return ImmutableList.of();
}
@Override
public int writePostscript(SliceOutput output,
int footerLength,
int metadataLength,
CompressionKind compression,
int compressionBlockSize,
Optional dwrfStripeCacheData)
throws IOException
{
DwrfProto.PostScript.Builder postScriptBuilder = DwrfProto.PostScript.newBuilder()
.setFooterLength(footerLength)
.setWriterVersion(DWRF_WRITER_VERSION)
.setCompression(toCompression(compression))
.setCompressionBlockSize(compressionBlockSize);
dwrfStripeCacheData.ifPresent(cache -> {
postScriptBuilder.setCacheMode(toStripeCacheMode(cache.getDwrfStripeCacheMode()));
postScriptBuilder.setCacheSize(cache.getDwrfStripeCacheSize());
});
DwrfProto.PostScript postScriptProtobuf = postScriptBuilder.build();
return writeProtobufObject(output, postScriptProtobuf);
}
@Override
public int writeDwrfStripeCache(SliceOutput output, Optional dwrfStripeCacheData)
{
int size = 0;
if (dwrfStripeCacheData.isPresent()) {
DwrfStripeCacheData cache = dwrfStripeCacheData.get();
size = cache.getDwrfStripeCacheSize();
output.writeBytes(cache.getDwrfStripeCacheSlice(), 0, size);
}
return size;
}
@Override
public int writeMetadata(SliceOutput output, Metadata metadata)
{
return 0;
}
@Override
public int writeFooter(SliceOutput output, Footer footer)
throws IOException
{
DwrfProto.Footer.Builder footerProtobuf = DwrfProto.Footer.newBuilder()
.setNumberOfRows(footer.getNumberOfRows())
.setRowIndexStride(footer.getRowsInRowGroup())
.addAllStripes(footer.getStripes().stream()
.map(DwrfMetadataWriter::toStripeInformation)
.collect(toImmutableList()))
.addAllTypes(footer.getTypes().stream()
.map(DwrfMetadataWriter::toType)
.collect(toImmutableList()))
.addAllStatistics(footer.getFileStats().stream()
.map(DwrfMetadataWriter::toColumnStatistics)
.collect(toImmutableList()))
.addAllMetadata(footer.getUserMetadata().entrySet().stream()
.map(DwrfMetadataWriter::toUserMetadata)
.collect(toImmutableList()))
.addAllMetadata(STATIC_METADATA.entrySet().stream()
.map(DwrfMetadataWriter::toUserMetadata)
.collect(toImmutableList()));
if (footer.getEncryption().isPresent()) {
footerProtobuf.setEncryption(toEncryption(footer.getEncryption().get()));
}
if (footer.getRawSize().isPresent()) {
footerProtobuf.setRawDataSize(footer.getRawSize().getAsLong());
}
if (footer.getDwrfStripeCacheOffsets().isPresent()) {
footerProtobuf.addAllStripeCacheOffsets(footer.getDwrfStripeCacheOffsets().get());
}
return writeProtobufObject(output, footerProtobuf.build());
}
@VisibleForTesting
static DwrfProto.StripeInformation toStripeInformation(StripeInformation stripe)
{
DwrfProto.StripeInformation.Builder builder = DwrfProto.StripeInformation.newBuilder()
.setNumberOfRows(stripe.getNumberOfRows())
.setOffset(stripe.getOffset())
.setIndexLength(stripe.getIndexLength())
.setDataLength(stripe.getDataLength())
.setFooterLength(stripe.getFooterLength())
.addAllKeyMetadata(stripe.getKeyMetadata().stream()
.map(ByteString::copyFrom)
.collect(toImmutableList()));
if (stripe.getRawDataSize().isPresent()) {
builder.setRawDataSize(stripe.getRawDataSize().getAsLong());
}
return builder.build();
}
private static Type toType(OrcType type)
{
Builder builder = Type.newBuilder()
.setKind(toTypeKind(type.getOrcTypeKind()))
.addAllSubtypes(type.getFieldTypeIndexes())
.addAllFieldNames(type.getFieldNames());
return builder.build();
}
private static Type.Kind toTypeKind(OrcTypeKind orcTypeKind)
{
switch (orcTypeKind) {
case BOOLEAN:
return Type.Kind.BOOLEAN;
case BYTE:
return Type.Kind.BYTE;
case SHORT:
return Type.Kind.SHORT;
case INT:
return Type.Kind.INT;
case LONG:
return Type.Kind.LONG;
case FLOAT:
return Type.Kind.FLOAT;
case DOUBLE:
return Type.Kind.DOUBLE;
case STRING:
case VARCHAR:
return Type.Kind.STRING;
case BINARY:
return Type.Kind.BINARY;
case TIMESTAMP:
case TIMESTAMP_MICROSECONDS:
return Type.Kind.TIMESTAMP;
case LIST:
return Type.Kind.LIST;
case MAP:
return Type.Kind.MAP;
case STRUCT:
return Type.Kind.STRUCT;
case UNION:
return Type.Kind.UNION;
}
throw new IllegalArgumentException("Unsupported type: " + orcTypeKind);
}
public static DwrfProto.ColumnStatistics toColumnStatistics(ColumnStatistics columnStatistics)
{
DwrfProto.ColumnStatistics.Builder builder = DwrfProto.ColumnStatistics.newBuilder();
if (columnStatistics.hasNumberOfValues()) {
builder.setNumberOfValues(columnStatistics.getNumberOfValues());
}
if (columnStatistics.hasRawSize()) {
builder.setRawSize(columnStatistics.getRawSize());
}
if (columnStatistics.hasStorageSize()) {
builder.setSize(columnStatistics.getStorageSize());
}
if (columnStatistics.getBooleanStatistics() != null) {
builder.setBucketStatistics(DwrfProto.BucketStatistics.newBuilder()
.addCount(columnStatistics.getBooleanStatistics().getTrueValueCount())
.build());
}
IntegerStatistics integerStatistics = columnStatistics.getIntegerStatistics();
if (integerStatistics != null) {
DwrfProto.IntegerStatistics.Builder dwrfIntegerStatistics = DwrfProto.IntegerStatistics.newBuilder()
.setMinimum(integerStatistics.getMinPrimitive())
.setMaximum(integerStatistics.getMaxPrimitive());
if (integerStatistics.hasSum()) {
dwrfIntegerStatistics.setSum(integerStatistics.getSumPrimitive());
}
builder.setIntStatistics(dwrfIntegerStatistics.build());
}
DoubleStatistics doubleStatistics = columnStatistics.getDoubleStatistics();
if (doubleStatistics != null) {
builder.setDoubleStatistics(DwrfProto.DoubleStatistics.newBuilder()
.setMinimum(doubleStatistics.getMinPrimitive())
.setMaximum(doubleStatistics.getMaxPrimitive())
.build());
}
if (columnStatistics.getStringStatistics() != null) {
DwrfProto.StringStatistics.Builder statisticsBuilder = DwrfProto.StringStatistics.newBuilder();
if (columnStatistics.getStringStatistics().getMin() != null) {
statisticsBuilder.setMinimumBytes(ByteString.copyFrom(columnStatistics.getStringStatistics().getMin().getBytes()));
}
if (columnStatistics.getStringStatistics().getMax() != null) {
statisticsBuilder.setMaximumBytes(ByteString.copyFrom(columnStatistics.getStringStatistics().getMax().getBytes()));
}
statisticsBuilder.setSum(columnStatistics.getStringStatistics().getSum());
builder.setStringStatistics(statisticsBuilder.build());
}
if (columnStatistics.getBinaryStatistics() != null) {
builder.setBinaryStatistics(DwrfProto.BinaryStatistics.newBuilder()
.setSum(columnStatistics.getBinaryStatistics().getSum())
.build());
}
if (columnStatistics.getMapStatistics() != null) {
DwrfProto.MapStatistics.Builder statisticsBuilder = DwrfProto.MapStatistics.newBuilder();
for (MapStatisticsEntry entry : columnStatistics.getMapStatistics().getEntries()) {
statisticsBuilder.addStats(DwrfProto.MapEntryStatistics.newBuilder()
.setKey(entry.getKey())
.setStats(toColumnStatistics(entry.getColumnStatistics()))
.build());
}
builder.setMapStatistics(statisticsBuilder.build());
}
return builder.build();
}
private static UserMetadataItem toUserMetadata(Entry entry)
{
return UserMetadataItem.newBuilder()
.setName(entry.getKey())
.setValue(ByteString.copyFrom(entry.getValue().getBytes()))
.build();
}
@Override
public int writeStripeFooter(SliceOutput output, StripeFooter footer)
throws IOException
{
DwrfProto.StripeFooter footerProtobuf = DwrfProto.StripeFooter.newBuilder()
.addAllStreams(footer.getStreams().stream()
.map(DwrfMetadataWriter::toStream)
.collect(toImmutableList()))
.addAllColumns(toColumnEncodings(footer.getColumnEncodings()))
.addAllEncryptedGroups(footer.getStripeEncryptionGroups().stream()
.map(group -> ByteString.copyFrom(group.getBytes()))
.collect(toImmutableList()))
.build();
return writeProtobufObject(output, footerProtobuf);
}
@VisibleForTesting
static DwrfProto.Stream toStream(Stream stream)
{
DwrfProto.Stream.Builder streamBuilder = DwrfProto.Stream.newBuilder()
.setColumn(stream.getColumn())
.setSequence(stream.getSequence())
.setKind(toStreamKind(stream.getStreamKind()))
.setLength(stream.getLength())
.setUseVInts(stream.isUseVInts());
stream.getOffset().ifPresent(streamBuilder::setOffset);
return streamBuilder.build();
}
@VisibleForTesting
static DwrfProto.Stream.Kind toStreamKind(StreamKind streamKind)
{
switch (streamKind) {
case PRESENT:
return DwrfProto.Stream.Kind.PRESENT;
case DATA:
return DwrfProto.Stream.Kind.DATA;
case SECONDARY:
return DwrfProto.Stream.Kind.NANO_DATA;
case LENGTH:
return DwrfProto.Stream.Kind.LENGTH;
case DICTIONARY_DATA:
return DwrfProto.Stream.Kind.DICTIONARY_DATA;
case DICTIONARY_COUNT:
return DwrfProto.Stream.Kind.DICTIONARY_COUNT;
case ROW_INDEX:
return DwrfProto.Stream.Kind.ROW_INDEX;
case IN_MAP:
return DwrfProto.Stream.Kind.IN_MAP;
}
throw new IllegalArgumentException("Unsupported stream kind: " + streamKind);
}
public static List toColumnEncodings(Map columnEncodingsByNodeId)
{
ImmutableList.Builder columnEncodings = ImmutableList.builder();
for (Entry entry : columnEncodingsByNodeId.entrySet()) {
int nodeId = entry.getKey();
ColumnEncoding columnEncoding = entry.getValue();
if (columnEncoding.getAdditionalSequenceEncodings().isPresent()) {
Map sequences = columnEncoding.getAdditionalSequenceEncodings().get();
for (Entry sequenceEntry : sequences.entrySet()) {
int sequence = sequenceEntry.getKey();
DwrfSequenceEncoding sequenceEncoding = sequenceEntry.getValue();
columnEncodings.add(toColumnEncoding(nodeId, sequence, sequenceEncoding));
}
}
else {
columnEncodings.add(toColumnEncoding(nodeId, columnEncoding));
}
}
return columnEncodings.build();
}
public static DwrfProto.ColumnEncoding toColumnEncoding(int nodeId, ColumnEncoding columnEncoding)
{
checkArgument(
!columnEncoding.getAdditionalSequenceEncodings().isPresent(),
"Non-zero sequence IDs for column encoding %s",
columnEncoding);
return DwrfProto.ColumnEncoding.newBuilder()
.setKind(toColumnEncodingKind(columnEncoding.getColumnEncodingKind()))
.setDictionarySize(columnEncoding.getDictionarySize())
.setColumn(nodeId)
.setSequence(0)
.build();
}
public static DwrfProto.ColumnEncoding toColumnEncoding(int nodeId, int sequence, DwrfSequenceEncoding sequenceEncoding)
{
ColumnEncoding columnEncoding = sequenceEncoding.getValueEncoding();
requireNonNull(sequenceEncoding, "sequenceEncoding is null");
checkArgument(
!columnEncoding.getAdditionalSequenceEncodings().isPresent(),
"Non-zero sequence IDs for column encoding %s",
columnEncoding);
return DwrfProto.ColumnEncoding.newBuilder()
.setKind(toColumnEncodingKind(columnEncoding.getColumnEncodingKind()))
.setDictionarySize(columnEncoding.getDictionarySize())
.setColumn(nodeId)
.setSequence(sequence)
.setKey(sequenceEncoding.getKey())
.build();
}
private static DwrfProto.ColumnEncoding.Kind toColumnEncodingKind(ColumnEncodingKind columnEncodingKind)
{
switch (columnEncodingKind) {
case DIRECT:
return DwrfProto.ColumnEncoding.Kind.DIRECT;
case DICTIONARY:
return DwrfProto.ColumnEncoding.Kind.DICTIONARY;
case DWRF_MAP_FLAT:
return DwrfProto.ColumnEncoding.Kind.MAP_FLAT;
}
throw new IllegalArgumentException("Unsupported column encoding kind: " + columnEncodingKind);
}
@Override
public int writeRowIndexes(SliceOutput output, List rowGroupIndexes)
throws IOException
{
DwrfProto.RowIndex rowIndexProtobuf = DwrfProto.RowIndex.newBuilder()
.addAllEntry(rowGroupIndexes.stream()
.map(DwrfMetadataWriter::toRowGroupIndex)
.collect(toImmutableList()))
.build();
return writeProtobufObject(output, rowIndexProtobuf);
}
private static RowIndexEntry toRowGroupIndex(RowGroupIndex rowGroupIndex)
{
RowIndexEntry.Builder builder = RowIndexEntry.newBuilder();
for (int position : rowGroupIndex.getPositions()) {
builder.addPositions(position);
}
return builder.setStatistics(toColumnStatistics(rowGroupIndex.getColumnStatistics())).build();
}
private static DwrfProto.CompressionKind toCompression(CompressionKind compressionKind)
{
switch (compressionKind) {
case NONE:
return DwrfProto.CompressionKind.NONE;
case ZLIB:
return DwrfProto.CompressionKind.ZLIB;
case SNAPPY:
return DwrfProto.CompressionKind.SNAPPY;
case LZ4:
return DwrfProto.CompressionKind.LZ4;
case ZSTD:
return DwrfProto.CompressionKind.ZSTD;
}
throw new IllegalArgumentException("Unsupported compression kind: " + compressionKind);
}
private static DwrfProto.Encryption toEncryption(DwrfEncryption encryption)
{
return DwrfProto.Encryption.newBuilder()
.setKeyProvider(toKeyProvider(encryption.getKeyProvider()))
.addAllEncryptionGroups(encryption.getEncryptionGroups().stream()
.map(group -> toEncryptionGroup(group))
.collect(toImmutableList()))
.build();
}
private static DwrfProto.Encryption.KeyProvider toKeyProvider(KeyProvider keyProvider)
{
switch (keyProvider) {
case CRYPTO_SERVICE:
return DwrfProto.Encryption.KeyProvider.CRYPTO_SERVICE;
case UNKNOWN:
return DwrfProto.Encryption.KeyProvider.UNKNOWN;
default:
throw new UnsupportedOperationException(format("unknown key provider: %s", keyProvider));
}
}
private static DwrfProto.EncryptionGroup toEncryptionGroup(EncryptionGroup encryptionGroup)
{
return DwrfProto.EncryptionGroup.newBuilder()
.addAllNodes(encryptionGroup.getNodes())
.addAllStatistics(encryptionGroup.getStatistics().stream()
.map(statsSlice -> ByteString.copyFrom(statsSlice.getBytes()))
.collect(toImmutableList()))
.build();
}
public static DwrfProto.StripeEncryptionGroup toStripeEncryptionGroup(StripeEncryptionGroup stripeEncryptionGroup)
{
return DwrfProto.StripeEncryptionGroup.newBuilder()
.addAllStreams(stripeEncryptionGroup.getStreams().stream()
.map(DwrfMetadataWriter::toStream)
.collect(toImmutableList()))
.addAllEncoding(toColumnEncodings(stripeEncryptionGroup.getColumnEncodings()))
.build();
}
public static DwrfProto.FileStatistics toFileStatistics(List columnStatistics)
{
List dwrfColumnStatistics = columnStatistics.stream()
.map(DwrfMetadataWriter::toColumnStatistics)
.collect(toList());
return DwrfProto.FileStatistics.newBuilder()
.addAllStatistics(dwrfColumnStatistics)
.build();
}
private static DwrfProto.StripeCacheMode toStripeCacheMode(DwrfStripeCacheMode dwrfStripeCacheMode)
{
switch (dwrfStripeCacheMode) {
case NONE:
return DwrfProto.StripeCacheMode.NA;
case INDEX:
return DwrfProto.StripeCacheMode.INDEX;
case FOOTER:
return DwrfProto.StripeCacheMode.FOOTER;
case INDEX_AND_FOOTER:
return DwrfProto.StripeCacheMode.BOTH;
}
throw new IllegalArgumentException("Unsupported mode: " + dwrfStripeCacheMode);
}
private static int writeProtobufObject(OutputStream output, MessageLite object)
throws IOException
{
CountingOutputStream countingOutput = new CountingOutputStream(output);
object.writeTo(countingOutput);
return toIntExact(countingOutput.getCount());
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy