/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.orc.metadata;
import com.facebook.presto.common.RuntimeStats;
import com.facebook.presto.common.RuntimeUnit;
import com.facebook.presto.orc.DwrfDataEncryptor;
import com.facebook.presto.orc.DwrfEncryptionProvider;
import com.facebook.presto.orc.DwrfKeyProvider;
import com.facebook.presto.orc.EncryptionLibrary;
import com.facebook.presto.orc.OrcCorruptionException;
import com.facebook.presto.orc.OrcDataSource;
import com.facebook.presto.orc.OrcDataSourceId;
import com.facebook.presto.orc.OrcDecompressor;
import com.facebook.presto.orc.OrcReaderOptions;
import com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind;
import com.facebook.presto.orc.metadata.OrcType.OrcTypeKind;
import com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion;
import com.facebook.presto.orc.metadata.Stream.StreamKind;
import com.facebook.presto.orc.metadata.statistics.BinaryStatistics;
import com.facebook.presto.orc.metadata.statistics.BooleanStatistics;
import com.facebook.presto.orc.metadata.statistics.ColumnStatistics;
import com.facebook.presto.orc.metadata.statistics.DoubleStatistics;
import com.facebook.presto.orc.metadata.statistics.HiveBloomFilter;
import com.facebook.presto.orc.metadata.statistics.IntegerStatistics;
import com.facebook.presto.orc.metadata.statistics.MapStatistics;
import com.facebook.presto.orc.metadata.statistics.MapStatisticsEntry;
import com.facebook.presto.orc.metadata.statistics.StringStatistics;
import com.facebook.presto.orc.proto.DwrfProto;
import com.facebook.presto.orc.protobuf.ByteString;
import com.facebook.presto.orc.protobuf.CodedInputStream;
import com.facebook.presto.orc.stream.OrcInputStream;
import com.facebook.presto.orc.stream.SharedBuffer;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSortedMap;
import com.sun.management.ThreadMXBean;
import io.airlift.slice.BasicSliceInput;
import io.airlift.slice.Slice;
import java.io.IOException;
import java.io.InputStream;
import java.lang.management.ManagementFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.OptionalLong;
import java.util.SortedMap;
import java.util.stream.IntStream;
import static com.facebook.presto.orc.NoopOrcAggregatedMemoryContext.NOOP_ORC_AGGREGATED_MEMORY_CONTEXT;
import static com.facebook.presto.orc.NoopOrcLocalMemoryContext.NOOP_ORC_LOCAL_MEMORY_CONTEXT;
import static com.facebook.presto.orc.metadata.CompressionKind.LZ4;
import static com.facebook.presto.orc.metadata.CompressionKind.NONE;
import static com.facebook.presto.orc.metadata.CompressionKind.SNAPPY;
import static com.facebook.presto.orc.metadata.CompressionKind.ZLIB;
import static com.facebook.presto.orc.metadata.CompressionKind.ZSTD;
import static com.facebook.presto.orc.metadata.DwrfMetadataWriter.STATIC_METADATA;
import static com.facebook.presto.orc.metadata.OrcMetadataReader.byteStringToSlice;
import static com.facebook.presto.orc.metadata.OrcMetadataReader.maxStringTruncateToValidRange;
import static com.facebook.presto.orc.metadata.OrcMetadataReader.minStringTruncateToValidRange;
import static com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion.ORC_HIVE_8732;
import static com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion.ORIGINAL;
import static com.facebook.presto.orc.metadata.statistics.ColumnStatistics.createColumnStatistics;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;
public class DwrfMetadataReader
implements MetadataReader
{
private static final ThreadMXBean THREAD_MX_BEAN = (ThreadMXBean) ManagementFactory.getThreadMXBean();
private final RuntimeStats runtimeStats;
private final boolean readMapStatistics;
public DwrfMetadataReader(RuntimeStats runtimeStats, OrcReaderOptions readerOptions)
{
this.runtimeStats = requireNonNull(runtimeStats, "runtimeStats is null");
requireNonNull(readerOptions, "readerOptions is null");
this.readMapStatistics = readerOptions.readMapStatistics();
}
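// Parses the DWRF PostScript protobuf from the given buffer, derives the Hive writer version,
// and picks up the optional stripe cache length and mode; CPU time spent is reported as a runtime metric.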
@Override
public PostScript readPostScript(byte[] data, int offset, int length)
throws IOException
{
long cpuStart = THREAD_MX_BEAN.getCurrentThreadCpuTime();
CodedInputStream input = CodedInputStream.newInstance(data, offset, length);
DwrfProto.PostScript postScript = DwrfProto.PostScript.parseFrom(input);
HiveWriterVersion writerVersion = postScript.hasWriterVersion() && postScript.getWriterVersion() > 0 ? ORC_HIVE_8732 : ORIGINAL;
OptionalInt stripeCacheLength = OptionalInt.empty();
Optional<DwrfStripeCacheMode> stripeCacheMode = Optional.empty();
if (postScript.hasCacheSize() && postScript.hasCacheMode()) {
stripeCacheLength = OptionalInt.of(postScript.getCacheSize());
stripeCacheMode = Optional.of(toStripeCacheMode(postScript.getCacheMode()));
}
runtimeStats.addMetricValue("DwrfReadPostScriptTimeNanos", RuntimeUnit.NANO, THREAD_MX_BEAN.getCurrentThreadCpuTime() - cpuStart);
return new PostScript(
ImmutableList.of(),
postScript.getFooterLength(),
0,
toCompression(postScript.getCompression()),
postScript.getCompressionBlockSize(),
writerVersion,
stripeCacheLength,
stripeCacheMode);
}
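// DWRF files have no separate Metadata section to read, so an empty Metadata object is returned.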
@Override
public Metadata readMetadata(HiveWriterVersion hiveWriterVersion, InputStream inputStream)
{
return new Metadata(ImmutableList.of());
}
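// Parses the file footer: file-level column statistics, stripe information, the type tree, user
// metadata, optional encryption metadata, and stripe cache offsets. For encrypted files the
// encrypted file statistics are decrypted and merged with the unencrypted ones.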
@Override
public Footer readFooter(HiveWriterVersion hiveWriterVersion,
InputStream inputStream,
DwrfEncryptionProvider dwrfEncryptionProvider,
DwrfKeyProvider dwrfKeyProvider,
OrcDataSource orcDataSource,
Optional<OrcDecompressor> decompressor)
throws IOException
{
long cpuStart = THREAD_MX_BEAN.getCurrentThreadCpuTime();
CodedInputStream input = CodedInputStream.newInstance(inputStream);
DwrfProto.Footer footer = DwrfProto.Footer.parseFrom(input);
List<ColumnStatistics> fileStats = toColumnStatistics(hiveWriterVersion, footer.getStatisticsList(), false);
List<StripeInformation> fileStripes = toStripeInformation(footer.getStripesList());
List<OrcType> types = toType(footer.getTypesList());
Optional<DwrfEncryption> encryption = footer.hasEncryption() ? Optional.of(toEncryption(footer.getEncryption())) : Optional.empty();
Optional<List<Integer>> stripeCacheOffsets = Optional.of(footer.getStripeCacheOffsetsList());
if (encryption.isPresent()) {
Map<Integer, Slice> keys = dwrfKeyProvider.getIntermediateKeys(types);
EncryptionLibrary encryptionLibrary = dwrfEncryptionProvider.getEncryptionLibrary(encryption.get().getKeyProvider());
fileStats = decryptAndCombineFileStatistics(hiveWriterVersion, encryption.get(), encryptionLibrary, fileStats, fileStripes, keys, orcDataSource, decompressor);
}
runtimeStats.addMetricValue("DwrfReadFooterTimeNanos", RuntimeUnit.NANO, THREAD_MX_BEAN.getCurrentThreadCpuTime() - cpuStart);
OptionalLong rawSize = footer.hasRawDataSize() ? OptionalLong.of(footer.getRawDataSize()) : OptionalLong.empty();
return new Footer(
footer.getNumberOfRows(),
footer.getRowIndexStride(),
rawSize,
fileStripes,
types,
fileStats,
toUserMetadata(footer.getMetadataList()),
encryption,
stripeCacheOffsets);
}
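// Decrypts file-level statistics of encrypted columns and overlays them onto the plain file statistics.
// Only nodes present in nodeToIntermediateKeys (i.e. requested by the projection) are decrypted; the DEK
// for each encryption group is taken from the footer key metadata or, if absent there, from the first stripe.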
private List<ColumnStatistics> decryptAndCombineFileStatistics(HiveWriterVersion hiveWriterVersion,
DwrfEncryption dwrfEncryption,
EncryptionLibrary encryptionLibrary,
List<ColumnStatistics> fileStats,
List<StripeInformation> fileStripes,
Map<Integer, Slice> nodeToIntermediateKeys,
OrcDataSource orcDataSource,
Optional<OrcDecompressor> decompressor)
{
requireNonNull(dwrfEncryption, "dwrfEncryption is null");
requireNonNull(encryptionLibrary, "encryptionLibrary is null");
if (nodeToIntermediateKeys.isEmpty() || fileStats.isEmpty()) {
return fileStats;
}
ColumnStatistics[] decryptedFileStats = fileStats.toArray(new ColumnStatistics[0]);
List<EncryptionGroup> encryptionGroups = dwrfEncryption.getEncryptionGroups();
List<byte[]> stripeKeys = null;
if (!fileStripes.isEmpty() && !fileStripes.get(0).getKeyMetadata().isEmpty()) {
stripeKeys = fileStripes.get(0).getKeyMetadata();
checkState(stripeKeys.size() == encryptionGroups.size(),
"Number of keys in the first stripe must be the same as the number of encryption groups");
}
// If a node has child nodes, its whole subtree is encrypted and only the parent node is added
// to the encryption group.
for (int groupIdx = 0; groupIdx < encryptionGroups.size(); groupIdx++) {
EncryptionGroup encryptionGroup = encryptionGroups.get(groupIdx);
DwrfDataEncryptor decryptor = null;
List<Integer> nodes = encryptionGroup.getNodes();
for (int i = 0; i < nodes.size(); i++) {
Integer nodeId = nodes.get(i);
// do decryption only for those nodes that are requested (part of the projection)
if (!nodeToIntermediateKeys.containsKey(nodeId)) {
continue;
}
if (decryptor == null) {
// The DEK for the FileStats can be stored in the footer, in the first stripe, or in both.
// The key in the footer takes priority over the key in the first stripe.
byte[] encryptedDataKeyWithMeta = null;
if (encryptionGroup.getKeyMetadata().isPresent()) {
encryptedDataKeyWithMeta = encryptionGroup.getKeyMetadata().get().byteArray();
}
else if (stripeKeys != null) {
encryptedDataKeyWithMeta = stripeKeys.get(groupIdx);
}
checkState(encryptedDataKeyWithMeta != null, "DEK for %s encryption group is null", groupIdx);
// decrypt the DEK which is encrypted using the IEK passed into a record reader
byte[] intermediateKey = nodeToIntermediateKeys.get(nodeId).byteArray();
byte[] dataKey = encryptionLibrary.decryptKey(intermediateKey, encryptedDataKeyWithMeta, 0, encryptedDataKeyWithMeta.length);
decryptor = new DwrfDataEncryptor(dataKey, encryptionLibrary);
}
// decrypt the FileStats
Slice encryptedFileStats = encryptionGroup.getStatistics().get(i);
try (OrcInputStream inputStream = new OrcInputStream(
orcDataSource.getId(),
// Memory is not accounted as the buffer is expected to be tiny and will be immediately discarded
new SharedBuffer(NOOP_ORC_LOCAL_MEMORY_CONTEXT),
new BasicSliceInput(encryptedFileStats),
decompressor,
Optional.of(decryptor),
NOOP_ORC_AGGREGATED_MEMORY_CONTEXT,
encryptedFileStats.length())) {
CodedInputStream input = CodedInputStream.newInstance(inputStream);
DwrfProto.FileStatistics nodeStats = DwrfProto.FileStatistics.parseFrom(input);
// FileStatistics contains ColumnStatistics for the node and all its child nodes (subtree)
for (int statsIdx = 0; statsIdx < nodeStats.getStatisticsCount(); statsIdx++) {
decryptedFileStats[nodeId + statsIdx] =
toColumnStatistics(hiveWriterVersion, nodeStats.getStatistics(statsIdx), false, null);
}
}
catch (IOException e) {
throw new OrcCorruptionException(e, orcDataSource.getId(), "Failed to read or decrypt FileStatistics for node %s", nodeId);
}
}
}
return ImmutableList.copyOf(decryptedFileStats);
}
private static DwrfEncryption toEncryption(DwrfProto.Encryption encryption)
{
KeyProvider keyProvider = toKeyProvider(encryption.getKeyProvider());
List<EncryptionGroup> encryptionGroups = toEncryptionGroups(encryption.getEncryptionGroupsList());
return new DwrfEncryption(keyProvider, encryptionGroups);
}
private static List<EncryptionGroup> toEncryptionGroups(List<DwrfProto.EncryptionGroup> encryptionGroups)
{
ImmutableList.Builder<EncryptionGroup> encryptionGroupBuilder = ImmutableList.builderWithExpectedSize(encryptionGroups.size());
for (DwrfProto.EncryptionGroup dwrfEncryptionGroup : encryptionGroups) {
encryptionGroupBuilder.add(new EncryptionGroup(
dwrfEncryptionGroup.getNodesList(),
dwrfEncryptionGroup.hasKeyMetadata() ? Optional.of(byteStringToSlice(dwrfEncryptionGroup.getKeyMetadata())) : Optional.empty(),
dwrfEncryptionGroup.getStatisticsList().stream()
.map(OrcMetadataReader::byteStringToSlice)
.collect(toImmutableList())));
}
return encryptionGroupBuilder.build();
}
private static KeyProvider toKeyProvider(DwrfProto.Encryption.KeyProvider keyProvider)
{
switch (keyProvider) {
case CRYPTO_SERVICE:
return KeyProvider.CRYPTO_SERVICE;
default:
return KeyProvider.UNKNOWN;
}
}
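// Converts stripe information; stripes that omit key metadata inherit it from the closest preceding
// stripe that carried it.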
private static List<StripeInformation> toStripeInformation(List<DwrfProto.StripeInformation> stripeInformationList)
{
ImmutableList.Builder<StripeInformation> stripeInfoBuilder = ImmutableList.builderWithExpectedSize(stripeInformationList.size());
List<byte[]> previousKeyMetadata = ImmutableList.of();
for (DwrfProto.StripeInformation dwrfStripeInfo : stripeInformationList) {
StripeInformation prestoStripeInfo = toStripeInformation(dwrfStripeInfo, previousKeyMetadata);
stripeInfoBuilder.add(prestoStripeInfo);
previousKeyMetadata = prestoStripeInfo.getKeyMetadata();
}
return stripeInfoBuilder.build();
}
private static StripeInformation toStripeInformation(DwrfProto.StripeInformation stripeInformation, List<byte[]> previousKeyMetadata)
{
List<byte[]> keyMetadata = stripeInformation.getKeyMetadataList().stream()
.map(ByteString::toByteArray)
.collect(toImmutableList());
if (keyMetadata.isEmpty()) {
keyMetadata = previousKeyMetadata;
}
OptionalLong rawDataSize = stripeInformation.hasRawDataSize() ? OptionalLong.of(stripeInformation.getRawDataSize()) : OptionalLong.empty();
return new StripeInformation(
stripeInformation.getNumberOfRows(),
stripeInformation.getOffset(),
stripeInformation.getIndexLength(),
stripeInformation.getDataLength(),
stripeInformation.getFooterLength(),
rawDataSize,
keyMetadata);
}
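// Parses a stripe footer: its streams, per-column encodings, and the opaque blobs of the encrypted groups.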
@Override
public StripeFooter readStripeFooter(OrcDataSourceId orcDataSourceId, List<OrcType> types, InputStream inputStream)
throws IOException
{
long cpuStart = THREAD_MX_BEAN.getCurrentThreadCpuTime();
CodedInputStream input = CodedInputStream.newInstance(inputStream);
DwrfProto.StripeFooter stripeFooter = DwrfProto.StripeFooter.parseFrom(input);
runtimeStats.addMetricValue("DwrfReadStripeFooterTimeNanos", RuntimeUnit.NANO, THREAD_MX_BEAN.getCurrentThreadCpuTime() - cpuStart);
return new StripeFooter(
toStream(orcDataSourceId, stripeFooter.getStreamsList()),
toColumnEncoding(types, stripeFooter.getColumnsList()),
stripeFooter.getEncryptedGroupsList().stream()
.map(OrcMetadataReader::byteStringToSlice)
.collect(toImmutableList()));
}
private static Stream toStream(OrcDataSourceId orcDataSourceId, DwrfProto.Stream stream)
{
// reader doesn't support streams larger than 2GB
if (stream.getLength() > Integer.MAX_VALUE) {
throw new OrcCorruptionException(
orcDataSourceId,
"Stream size %s of one of the streams for column %s is larger than supported size %s",
stream.getLength(),
stream.getColumn(),
Integer.MAX_VALUE);
}
return new Stream(
stream.getColumn(),
toStreamKind(stream.getKind()),
toIntExact(stream.getLength()),
stream.getUseVInts(),
stream.getSequence(),
stream.hasOffset() ? Optional.of(stream.getOffset()) : Optional.empty());
}
private static List<Stream> toStream(OrcDataSourceId orcDataSourceId, List<DwrfProto.Stream> streams)
{
return streams.stream()
.map((stream -> toStream(orcDataSourceId, stream)))
.collect(toImmutableList());
}
private static DwrfSequenceEncoding toSequenceEncoding(OrcType type, DwrfProto.ColumnEncoding columnEncoding)
{
return new DwrfSequenceEncoding(
columnEncoding.getKey(),
new ColumnEncoding(
toColumnEncodingKind(type.getOrcTypeKind(), columnEncoding.getKind()),
columnEncoding.getDictionarySize()));
}
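// Builds the encoding for a single column. Sequence 0 is the column's main encoding; non-zero
// sequences (used by flat maps) are collected into the additional sequence encodings.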
private static ColumnEncoding toColumnEncoding(OrcType type, List<DwrfProto.ColumnEncoding> columnEncodings)
{
DwrfProto.ColumnEncoding sequence0 = null;
ImmutableSortedMap.Builder<Integer, DwrfSequenceEncoding> builder = ImmutableSortedMap.naturalOrder();
for (DwrfProto.ColumnEncoding columnEncoding : columnEncodings) {
if (columnEncoding.getSequence() == 0) {
sequence0 = columnEncoding;
}
else {
builder.put(columnEncoding.getSequence(), toSequenceEncoding(type, columnEncoding));
}
}
SortedMap<Integer, DwrfSequenceEncoding> nonZeroSequences = builder.build();
Optional<SortedMap<Integer, DwrfSequenceEncoding>> nonZeroEncodingsOptional =
nonZeroSequences.isEmpty() ? Optional.empty() : Optional.of(nonZeroSequences);
if (sequence0 != null) {
return new ColumnEncoding(toColumnEncodingKind(type.getOrcTypeKind(), sequence0.getKind()), sequence0.getDictionarySize(), nonZeroEncodingsOptional);
}
else {
// This is the case when the value node of a FLAT_MAP has no encoding for sequence 0
return new ColumnEncoding(ColumnEncodingKind.DWRF_DIRECT, 0, nonZeroEncodingsOptional);
}
}
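// Groups the protobuf encodings by column and builds one ColumnEncoding per column.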
private static Map<Integer, ColumnEncoding> toColumnEncoding(List<OrcType> types, List<DwrfProto.ColumnEncoding> columnEncodings)
{
Map<Integer, List<DwrfProto.ColumnEncoding>> groupedColumnEncodings = new HashMap<>(columnEncodings.size());
for (int i = 0; i < columnEncodings.size(); i++) {
DwrfProto.ColumnEncoding columnEncoding = columnEncodings.get(i);
int column = columnEncoding.getColumn();
// DWRF prior to version 6.0.8 doesn't set the value of column, infer it from the index
if (!columnEncoding.hasColumn()) {
column = i;
}
groupedColumnEncodings.computeIfAbsent(column, key -> new ArrayList<>()).add(columnEncoding);
}
ImmutableMap.Builder<Integer, ColumnEncoding> resultBuilder = ImmutableMap.builderWithExpectedSize(groupedColumnEncodings.size());
for (Map.Entry<Integer, List<DwrfProto.ColumnEncoding>> entry : groupedColumnEncodings.entrySet()) {
OrcType type = types.get(entry.getKey());
resultBuilder.put(
entry.getKey(),
toColumnEncoding(type, entry.getValue()));
}
return resultBuilder.build();
}
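// Parses the row group indexes of a stripe and attaches the matching bloom filter to each entry
// when bloom filters are provided.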
@Override
public List<RowGroupIndex> readRowIndexes(HiveWriterVersion hiveWriterVersion, InputStream inputStream, List<HiveBloomFilter> bloomFilters)
throws IOException
{
long cpuStart = THREAD_MX_BEAN.getCurrentThreadCpuTime();
CodedInputStream input = CodedInputStream.newInstance(inputStream);
DwrfProto.RowIndex rowIndex = DwrfProto.RowIndex.parseFrom(input);
runtimeStats.addMetricValue("DwrfReadRowIndexesTimeNanos", RuntimeUnit.NANO, THREAD_MX_BEAN.getCurrentThreadCpuTime() - cpuStart);
return IntStream.range(0, rowIndex.getEntryCount())
.mapToObj(i -> toRowGroupIndex(hiveWriterVersion, rowIndex.getEntry(i), bloomFilters == null || bloomFilters.isEmpty() ? null : bloomFilters.get(i)))
.collect(toImmutableList());
}
@Override
public List<HiveBloomFilter> readBloomFilterIndexes(InputStream inputStream)
{
// DWRF does not have bloom filters
return ImmutableList.of();
}
private RowGroupIndex toRowGroupIndex(HiveWriterVersion hiveWriterVersion, DwrfProto.RowIndexEntry rowIndexEntry, HiveBloomFilter bloomFilter)
{
List<Long> positionsList = rowIndexEntry.getPositionsList();
int[] positions = new int[positionsList.size()];
for (int index = 0; index < positionsList.size(); index++) {
long longPosition = positionsList.get(index);
int intPosition = (int) longPosition;
checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index);
positions[index] = intPosition;
}
return new RowGroupIndex(positions, toColumnStatistics(hiveWriterVersion, rowIndexEntry.getStatistics(), true, bloomFilter));
}
private List<ColumnStatistics> toColumnStatistics(HiveWriterVersion hiveWriterVersion, List<DwrfProto.ColumnStatistics> columnStatistics, boolean isRowGroup)
{
if (columnStatistics == null) {
return ImmutableList.of();
}
return columnStatistics.stream()
.map(statistics -> toColumnStatistics(hiveWriterVersion, statistics, isRowGroup, null))
.collect(toImmutableList());
}
private Map<String, Slice> toUserMetadata(List<DwrfProto.UserMetadataItem> metadataList)
{
ImmutableMap.Builder<String, Slice> mapBuilder = ImmutableMap.builder();
for (DwrfProto.UserMetadataItem item : metadataList) {
// skip static metadata added by the writer framework
if (!STATIC_METADATA.containsKey(item.getName())) {
mapBuilder.put(item.getName(), byteStringToSlice(item.getValue()));
}
}
return mapBuilder.build();
}
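// Converts protobuf column statistics into typed statistics; map statistics are converted only when
// reading them is enabled via OrcReaderOptions.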
@VisibleForTesting
ColumnStatistics toColumnStatistics(HiveWriterVersion hiveWriterVersion, DwrfProto.ColumnStatistics statistics, boolean isRowGroup, HiveBloomFilter bloomFilter)
{
return createColumnStatistics(
statistics.getNumberOfValues(),
statistics.hasRawSize() ? statistics.getRawSize() : null,
statistics.hasSize() ? statistics.getSize() : null,
statistics.hasBucketStatistics() ? toBooleanStatistics(statistics.getBucketStatistics()) : null,
statistics.hasIntStatistics() ? toIntegerStatistics(statistics.getIntStatistics()) : null,
statistics.hasDoubleStatistics() ? toDoubleStatistics(statistics.getDoubleStatistics()) : null,
statistics.hasStringStatistics() ? toStringStatistics(hiveWriterVersion, statistics.getStringStatistics(), isRowGroup) : null,
null,
null,
statistics.hasBinaryStatistics() ? toBinaryStatistics(statistics.getBinaryStatistics()) : null,
readMapStatistics && statistics.hasMapStatistics() ? toMapStatistics(statistics.getMapStatistics(), hiveWriterVersion, isRowGroup, bloomFilter) : null,
bloomFilter);
}
private static BooleanStatistics toBooleanStatistics(DwrfProto.BucketStatistics bucketStatistics)
{
if (bucketStatistics.getCountCount() == 0) {
return null;
}
return new BooleanStatistics(bucketStatistics.getCount(0));
}
private static IntegerStatistics toIntegerStatistics(DwrfProto.IntegerStatistics integerStatistics)
{
return new IntegerStatistics(
integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null,
integerStatistics.hasMaximum() ? integerStatistics.getMaximum() : null,
integerStatistics.hasSum() ? integerStatistics.getSum() : null);
}
private static DoubleStatistics toDoubleStatistics(DwrfProto.DoubleStatistics doubleStatistics)
{
// if either min, max, or sum is NaN, ignore the stat
if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) ||
(doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum())) ||
(doubleStatistics.hasSum() && Double.isNaN(doubleStatistics.getSum()))) {
return null;
}
return new DoubleStatistics(
doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null,
doubleStatistics.hasMaximum() ? doubleStatistics.getMaximum() : null);
}
@VisibleForTesting
static StringStatistics toStringStatistics(HiveWriterVersion hiveWriterVersion, DwrfProto.StringStatistics stringStatistics, boolean isRowGroup)
{
if (hiveWriterVersion == ORIGINAL && !isRowGroup) {
return null;
}
Slice maximum = stringStatistics.hasMaximum() ? maxStringTruncateToValidRange(byteStringToSlice(stringStatistics.getMaximumBytes()), hiveWriterVersion) : null;
Slice minimum = stringStatistics.hasMinimum() ? minStringTruncateToValidRange(byteStringToSlice(stringStatistics.getMinimumBytes()), hiveWriterVersion) : null;
long sum = stringStatistics.hasSum() ? stringStatistics.getSum() : 0;
return new StringStatistics(minimum, maximum, sum);
}
private static BinaryStatistics toBinaryStatistics(DwrfProto.BinaryStatistics binaryStatistics)
{
if (!binaryStatistics.hasSum()) {
return null;
}
return new BinaryStatistics(binaryStatistics.getSum());
}
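// Converts map statistics: one ColumnStatistics entry per map key.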
private MapStatistics toMapStatistics(DwrfProto.MapStatistics mapStatistics, HiveWriterVersion hiveWriterVersion, boolean isRowGroup, HiveBloomFilter bloomFilter)
{
ImmutableList.Builder<MapStatisticsEntry> mapStatisticsEntries = ImmutableList.builderWithExpectedSize(mapStatistics.getStatsCount());
for (DwrfProto.MapEntryStatistics mapEntryStatistics : mapStatistics.getStatsList()) {
DwrfProto.ColumnStatistics dwrfStatistics = mapEntryStatistics.getStats();
ColumnStatistics columnStatistics = toColumnStatistics(hiveWriterVersion, dwrfStatistics, isRowGroup, bloomFilter);
DwrfProto.KeyInfo key = mapEntryStatistics.getKey();
mapStatisticsEntries.add(new MapStatisticsEntry(key, columnStatistics));
}
return new MapStatistics(mapStatisticsEntries.build());
}
private static OrcType toType(DwrfProto.Type type)
{
return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList(), Optional.empty(), Optional.empty(), Optional.empty());
}
private static List<OrcType> toType(List<DwrfProto.Type> types)
{
return types.stream()
.map(DwrfMetadataReader::toType)
.collect(toImmutableList());
}
private static OrcTypeKind toTypeKind(DwrfProto.Type.Kind kind)
{
switch (kind) {
case BOOLEAN:
return OrcTypeKind.BOOLEAN;
case BYTE:
return OrcTypeKind.BYTE;
case SHORT:
return OrcTypeKind.SHORT;
case INT:
return OrcTypeKind.INT;
case LONG:
return OrcTypeKind.LONG;
case FLOAT:
return OrcTypeKind.FLOAT;
case DOUBLE:
return OrcTypeKind.DOUBLE;
case STRING:
return OrcTypeKind.STRING;
case BINARY:
return OrcTypeKind.BINARY;
case TIMESTAMP:
return OrcTypeKind.TIMESTAMP;
case LIST:
return OrcTypeKind.LIST;
case MAP:
return OrcTypeKind.MAP;
case STRUCT:
return OrcTypeKind.STRUCT;
case UNION:
return OrcTypeKind.UNION;
default:
throw new IllegalArgumentException(kind + " data type not implemented yet");
}
}
private static StreamKind toStreamKind(DwrfProto.Stream.Kind kind)
{
switch (kind) {
case PRESENT:
return StreamKind.PRESENT;
case DATA:
return StreamKind.DATA;
case LENGTH:
return StreamKind.LENGTH;
case DICTIONARY_DATA:
return StreamKind.DICTIONARY_DATA;
case DICTIONARY_COUNT:
return StreamKind.DICTIONARY_COUNT;
case NANO_DATA:
return StreamKind.SECONDARY;
case ROW_INDEX:
return StreamKind.ROW_INDEX;
case IN_DICTIONARY:
return StreamKind.IN_DICTIONARY;
case STRIDE_DICTIONARY:
return StreamKind.ROW_GROUP_DICTIONARY;
case STRIDE_DICTIONARY_LENGTH:
return StreamKind.ROW_GROUP_DICTIONARY_LENGTH;
case IN_MAP:
return StreamKind.IN_MAP;
default:
throw new IllegalArgumentException(kind + " stream type not implemented yet");
}
}
private static ColumnEncodingKind toColumnEncodingKind(OrcTypeKind type, DwrfProto.ColumnEncoding.Kind kind)
{
switch (kind) {
case DIRECT:
if (type == OrcTypeKind.SHORT || type == OrcTypeKind.INT || type == OrcTypeKind.LONG) {
return ColumnEncodingKind.DWRF_DIRECT;
}
else {
return ColumnEncodingKind.DIRECT;
}
case DICTIONARY:
return ColumnEncodingKind.DICTIONARY;
case MAP_FLAT:
return ColumnEncodingKind.DWRF_MAP_FLAT;
default:
throw new IllegalArgumentException(kind + " stream encoding not implemented yet");
}
}
private static CompressionKind toCompression(DwrfProto.CompressionKind compression)
{
switch (compression) {
case NONE:
return NONE;
case ZLIB:
return ZLIB;
case SNAPPY:
return SNAPPY;
case LZ4:
return LZ4;
case ZSTD:
return ZSTD;
default:
throw new IllegalArgumentException(compression + " compression not implemented yet");
}
}
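// Maps the protobuf stripe cache mode to the reader enum; unrecognized values fall back to NONE.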
static DwrfStripeCacheMode toStripeCacheMode(DwrfProto.StripeCacheMode mode)
{
switch (mode) {
case INDEX:
return DwrfStripeCacheMode.INDEX;
case FOOTER:
return DwrfStripeCacheMode.FOOTER;
case BOTH:
return DwrfStripeCacheMode.INDEX_AND_FOOTER;
default:
return DwrfStripeCacheMode.NONE;
}
}
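// Parses a stripe encryption group: the streams and column encodings it carries for the encrypted columns.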
public static StripeEncryptionGroup toStripeEncryptionGroup(OrcDataSourceId orcDataSourceId, InputStream inputStream, List<OrcType> types)
throws IOException
{
CodedInputStream codedInputStream = CodedInputStream.newInstance(inputStream);
DwrfProto.StripeEncryptionGroup stripeEncryptionGroup = DwrfProto.StripeEncryptionGroup.parseFrom(codedInputStream);
List<Stream> encryptedStreams = toStream(orcDataSourceId, stripeEncryptionGroup.getStreamsList());
return new StripeEncryptionGroup(
encryptedStreams,
toColumnEncoding(types, stripeEncryptionGroup.getEncodingList()));
}
}