/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.format.converter;
import static java.util.Optional.empty;
import static java.util.Optional.of;
import static org.apache.parquet.format.Util.readColumnMetaData;
import static org.apache.parquet.format.Util.readFileMetaData;
import static org.apache.parquet.format.Util.writeColumnMetaData;
import static org.apache.parquet.format.Util.writePageHeader;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.CorruptStatistics;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.Preconditions;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.crypto.AesCipher;
import org.apache.parquet.crypto.AesGcmEncryptor;
import org.apache.parquet.crypto.InternalColumnEncryptionSetup;
import org.apache.parquet.crypto.InternalFileDecryptor;
import org.apache.parquet.crypto.InternalFileEncryptor;
import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType;
import org.apache.parquet.crypto.ParquetCryptoRuntimeException;
import org.apache.parquet.crypto.TagVerificationException;
import org.apache.parquet.format.BlockCipher;
import org.apache.parquet.format.BloomFilterAlgorithm;
import org.apache.parquet.format.BloomFilterCompression;
import org.apache.parquet.format.BloomFilterHash;
import org.apache.parquet.format.BloomFilterHeader;
import org.apache.parquet.format.BoundaryOrder;
import org.apache.parquet.format.BsonType;
import org.apache.parquet.format.ColumnChunk;
import org.apache.parquet.format.ColumnCryptoMetaData;
import org.apache.parquet.format.ColumnIndex;
import org.apache.parquet.format.ColumnMetaData;
import org.apache.parquet.format.ColumnOrder;
import org.apache.parquet.format.CompressionCodec;
import org.apache.parquet.format.ConvertedType;
import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.DataPageHeaderV2;
import org.apache.parquet.format.DateType;
import org.apache.parquet.format.DecimalType;
import org.apache.parquet.format.DictionaryPageHeader;
import org.apache.parquet.format.Encoding;
import org.apache.parquet.format.EncryptionWithColumnKey;
import org.apache.parquet.format.EnumType;
import org.apache.parquet.format.FieldRepetitionType;
import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.Float16Type;
import org.apache.parquet.format.IntType;
import org.apache.parquet.format.JsonType;
import org.apache.parquet.format.KeyValue;
import org.apache.parquet.format.ListType;
import org.apache.parquet.format.LogicalType;
import org.apache.parquet.format.MapType;
import org.apache.parquet.format.MicroSeconds;
import org.apache.parquet.format.MilliSeconds;
import org.apache.parquet.format.NanoSeconds;
import org.apache.parquet.format.NullType;
import org.apache.parquet.format.OffsetIndex;
import org.apache.parquet.format.PageEncodingStats;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageLocation;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.RowGroup;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.format.SizeStatistics;
import org.apache.parquet.format.SplitBlockAlgorithm;
import org.apache.parquet.format.Statistics;
import org.apache.parquet.format.StringType;
import org.apache.parquet.format.TimeType;
import org.apache.parquet.format.TimeUnit;
import org.apache.parquet.format.TimestampType;
import org.apache.parquet.format.Type;
import org.apache.parquet.format.TypeDefinedOrder;
import org.apache.parquet.format.UUIDType;
import org.apache.parquet.format.Uncompressed;
import org.apache.parquet.format.XxHash;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.FileMetaData.EncryptionType;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.internal.column.columnindex.BinaryTruncator;
import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder;
import org.apache.parquet.internal.hadoop.metadata.IndexReference;
import org.apache.parquet.io.InvalidFileOffsetException;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.ColumnOrder.ColumnOrderName;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type.Repetition;
import org.apache.parquet.schema.TypeVisitor;
import org.apache.parquet.schema.Types;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// TODO: This file has become too long!
// TODO: Lets split it up: https://issues.apache.org/jira/browse/PARQUET-310
public class ParquetMetadataConverter {
private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder();
public static final MetadataFilter NO_FILTER = new NoFilter();
public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k
private static final Logger LOG = LoggerFactory.getLogger(ParquetMetadataConverter.class);
private static final LogicalTypeConverterVisitor LOGICAL_TYPE_ANNOTATION_VISITOR =
new LogicalTypeConverterVisitor();
private static final ConvertedTypeConverterVisitor CONVERTED_TYPE_CONVERTER_VISITOR =
new ConvertedTypeConverterVisitor();
private final int statisticsTruncateLength;
private final boolean useSignedStringMinMax;
public ParquetMetadataConverter() {
this(false);
}
public ParquetMetadataConverter(int statisticsTruncateLength) {
this(false, statisticsTruncateLength);
}
/**
* @param conf a configuration
* @deprecated will be removed in 2.0.0; use {@code ParquetMetadataConverter(ParquetReadOptions)}
*/
@Deprecated
public ParquetMetadataConverter(Configuration conf) {
this(conf.getBoolean("parquet.strings.signed-min-max.enabled", false));
}
public ParquetMetadataConverter(ParquetReadOptions options) {
this(options.useSignedStringMinMax());
}
private ParquetMetadataConverter(boolean useSignedStringMinMax) {
this(useSignedStringMinMax, ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
private ParquetMetadataConverter(boolean useSignedStringMinMax, int statisticsTruncateLength) {
if (statisticsTruncateLength <= 0) {
throw new IllegalArgumentException("Truncate length should be greater than 0");
}
this.useSignedStringMinMax = useSignedStringMinMax;
this.statisticsTruncateLength = statisticsTruncateLength;
}
// NOTE: this cache is for memory savings, not cpu savings, and is used to de-duplicate
// sets of encodings. It is important that all collections inserted to this cache be
// immutable and have thread-safe read-only access. This can be achieved by wrapping
// an unsynchronized collection in Collections.unmodifiable*(), and making sure to not
// keep any references to the original collection.
private static final ConcurrentHashMap<
Set<org.apache.parquet.column.Encoding>, Set<org.apache.parquet.column.Encoding>>
cachedEncodingSets = new ConcurrentHashMap<
Set<org.apache.parquet.column.Encoding>, Set<org.apache.parquet.column.Encoding>>();
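/**
 * Converts the internal {@link ParquetMetadata} representation into the Thrift
 * {@link FileMetaData} structure that is serialized into the file footer.
 */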
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
return toParquetMetadata(currentVersion, parquetMetadata, null);
}
public FileMetaData toParquetMetadata(
int currentVersion, ParquetMetadata parquetMetadata, InternalFileEncryptor fileEncryptor) {
List<BlockMetaData> blocks = parquetMetadata.getBlocks();
List<RowGroup> rowGroups = new ArrayList<RowGroup>();
long numRows = 0;
long preBlockStartPos = 0;
long preBlockCompressedSize = 0;
for (BlockMetaData block : blocks) {
numRows += block.getRowCount();
long blockStartPos = block.getStartingPos();
// first block
if (blockStartPos == 4) {
preBlockStartPos = 0;
preBlockCompressedSize = 0;
}
if (preBlockStartPos != 0) {
Preconditions.checkState(
blockStartPos >= preBlockStartPos + preBlockCompressedSize,
"Invalid block starting position: %s",
blockStartPos);
}
preBlockStartPos = blockStartPos;
preBlockCompressedSize = block.getCompressedSize();
addRowGroup(parquetMetadata, rowGroups, block, fileEncryptor);
}
FileMetaData fileMetaData = new FileMetaData(
currentVersion,
toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
numRows,
rowGroups);
Set<Entry<String, String>> keyValues =
parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
for (Entry<String, String> keyValue : keyValues) {
addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
}
fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
fileMetaData.setColumn_orders(
getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));
return fileMetaData;
}
private List<ColumnOrder> getColumnOrders(MessageType schema) {
List<ColumnOrder> columnOrders = new ArrayList<>();
// Currently, only TypeDefinedOrder is supported, so we create a column order for each column with
// TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders.
for (int i = 0, n = schema.getPaths().size(); i < n; ++i) {
ColumnOrder columnOrder = new ColumnOrder();
columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER);
columnOrders.add(columnOrder);
}
return columnOrders;
}
// Visible for testing
List<SchemaElement> toParquetSchema(MessageType schema) {
List<SchemaElement> result = new ArrayList<SchemaElement>();
addToList(result, schema);
return result;
}
private void addToList(final List<SchemaElement> result, org.apache.parquet.schema.Type field) {
field.accept(new TypeVisitor() {
@Override
public void visit(PrimitiveType primitiveType) {
SchemaElement element = new SchemaElement(primitiveType.getName());
element.setRepetition_type(toParquetRepetition(primitiveType.getRepetition()));
element.setType(getType(primitiveType.getPrimitiveTypeName()));
if (primitiveType.getLogicalTypeAnnotation() != null) {
element.setConverted_type(convertToConvertedType(primitiveType.getLogicalTypeAnnotation()));
element.setLogicalType(convertToLogicalType(primitiveType.getLogicalTypeAnnotation()));
}
if (primitiveType.getDecimalMetadata() != null) {
element.setPrecision(primitiveType.getDecimalMetadata().getPrecision());
element.setScale(primitiveType.getDecimalMetadata().getScale());
}
if (primitiveType.getTypeLength() > 0) {
element.setType_length(primitiveType.getTypeLength());
}
if (primitiveType.getId() != null) {
element.setField_id(primitiveType.getId().intValue());
}
result.add(element);
}
@Override
public void visit(MessageType messageType) {
SchemaElement element = new SchemaElement(messageType.getName());
if (messageType.getId() != null) {
element.setField_id(messageType.getId().intValue());
}
visitChildren(result, messageType.asGroupType(), element);
}
@Override
public void visit(GroupType groupType) {
SchemaElement element = new SchemaElement(groupType.getName());
element.setRepetition_type(toParquetRepetition(groupType.getRepetition()));
if (groupType.getLogicalTypeAnnotation() != null) {
element.setConverted_type(convertToConvertedType(groupType.getLogicalTypeAnnotation()));
element.setLogicalType(convertToLogicalType(groupType.getLogicalTypeAnnotation()));
}
if (groupType.getId() != null) {
element.setField_id(groupType.getId().intValue());
}
visitChildren(result, groupType, element);
}
private void visitChildren(final List<SchemaElement> result, GroupType groupType, SchemaElement element) {
element.setNum_children(groupType.getFieldCount());
result.add(element);
for (org.apache.parquet.schema.Type field : groupType.getFields()) {
addToList(result, field);
}
}
});
}
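// Logical type information is written in both representations for compatibility: the newer
// LogicalType union and, where an equivalent exists, the legacy ConvertedType enum.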
LogicalType convertToLogicalType(LogicalTypeAnnotation logicalTypeAnnotation) {
return logicalTypeAnnotation.accept(LOGICAL_TYPE_ANNOTATION_VISITOR).orElse(null);
}
ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) {
return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null);
}
static org.apache.parquet.format.TimeUnit convertUnit(LogicalTypeAnnotation.TimeUnit unit) {
switch (unit) {
case MICROS:
return org.apache.parquet.format.TimeUnit.MICROS(new MicroSeconds());
case MILLIS:
return org.apache.parquet.format.TimeUnit.MILLIS(new MilliSeconds());
case NANOS:
return TimeUnit.NANOS(new NanoSeconds());
default:
throw new RuntimeException("Unknown time unit " + unit);
}
}
private static class ConvertedTypeConverterVisitor
implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<ConvertedType> {
@Override
public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
return of(ConvertedType.UTF8);
}
@Override
public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
return of(ConvertedType.MAP);
}
@Override
public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
return of(ConvertedType.LIST);
}
@Override
public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
return of(ConvertedType.ENUM);
}
@Override
public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
return of(ConvertedType.DECIMAL);
}
@Override
public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
return of(ConvertedType.DATE);
}
@Override
public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
switch (timeLogicalType.getUnit()) {
case MILLIS:
return of(ConvertedType.TIME_MILLIS);
case MICROS:
return of(ConvertedType.TIME_MICROS);
case NANOS:
return empty();
default:
throw new RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType());
}
}
@Override
public Optional visit(
LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
switch (timestampLogicalType.getUnit()) {
case MICROS:
return of(ConvertedType.TIMESTAMP_MICROS);
case MILLIS:
return of(ConvertedType.TIMESTAMP_MILLIS);
case NANOS:
return empty();
default:
throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType());
}
}
@Override
public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
boolean signed = intLogicalType.isSigned();
switch (intLogicalType.getBitWidth()) {
case 8:
return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8);
case 16:
return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16);
case 32:
return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32);
case 64:
return of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64);
default:
throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType());
}
}
@Override
public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
return of(ConvertedType.JSON);
}
@Override
public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
return of(ConvertedType.BSON);
}
@Override
public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
return of(ConvertedType.INTERVAL);
}
@Override
public Optional visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) {
return of(ConvertedType.MAP_KEY_VALUE);
}
}
private static class LogicalTypeConverterVisitor
implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<LogicalType> {
@Override
public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
return of(LogicalType.STRING(new StringType()));
}
@Override
public Optional visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
return of(LogicalType.MAP(new MapType()));
}
@Override
public Optional visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
return of(LogicalType.LIST(new ListType()));
}
@Override
public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
return of(LogicalType.ENUM(new EnumType()));
}
@Override
public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
return of(LogicalType.DECIMAL(
new DecimalType(decimalLogicalType.getScale(), decimalLogicalType.getPrecision())));
}
@Override
public Optional visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
return of(LogicalType.DATE(new DateType()));
}
@Override
public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
return of(LogicalType.TIME(
new TimeType(timeLogicalType.isAdjustedToUTC(), convertUnit(timeLogicalType.getUnit()))));
}
@Override
public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
return of(LogicalType.TIMESTAMP(new TimestampType(
timestampLogicalType.isAdjustedToUTC(), convertUnit(timestampLogicalType.getUnit()))));
}
@Override
public Optional visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
return of(LogicalType.INTEGER(new IntType((byte) intLogicalType.getBitWidth(), intLogicalType.isSigned())));
}
@Override
public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
return of(LogicalType.JSON(new JsonType()));
}
@Override
public Optional visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
return of(LogicalType.BSON(new BsonType()));
}
@Override
public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) {
return of(LogicalType.UUID(new UUIDType()));
}
@Override
public Optional visit(LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) {
return of(LogicalType.FLOAT16(new Float16Type()));
}
@Override
public Optional visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
return of(LogicalType.UNKNOWN(new NullType()));
}
}
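/**
 * Converts a {@link BlockMetaData} into a Thrift {@link RowGroup} and appends it to rowGroups.
 * When a file encryptor is provided, per-column metadata may be serialized and encrypted
 * separately instead of being stored in plaintext in the footer.
 */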
private void addRowGroup(
ParquetMetadata parquetMetadata,
List<RowGroup> rowGroups,
BlockMetaData block,
InternalFileEncryptor fileEncryptor) {
// rowGroup.total_byte_size = ;
List<ColumnChunkMetaData> columns = block.getColumns();
List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
int rowGroupOrdinal = rowGroups.size();
int columnOrdinal = -1;
ByteArrayOutputStream tempOutStream = null;
for (ColumnChunkMetaData columnMetaData : columns) {
ColumnChunk columnChunk =
new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
columnChunk.file_path = block.getPath(); // they are in the same file for now
InternalColumnEncryptionSetup columnSetup = null;
boolean writeCryptoMetadata = false;
boolean encryptMetaData = false;
ColumnPath path = columnMetaData.getPath();
if (null != fileEncryptor) {
columnOrdinal++;
columnSetup = fileEncryptor.getColumnSetup(path, false, columnOrdinal);
writeCryptoMetadata = columnSetup.isEncrypted();
encryptMetaData = fileEncryptor.encryptColumnMetaData(columnSetup);
}
ColumnMetaData metaData = new ColumnMetaData(
getType(columnMetaData.getType()),
toFormatEncodings(columnMetaData.getEncodings()),
Arrays.asList(columnMetaData.getPath().toArray()),
toFormatCodec(columnMetaData.getCodec()),
columnMetaData.getValueCount(),
columnMetaData.getTotalUncompressedSize(),
columnMetaData.getTotalSize(),
columnMetaData.getFirstDataPageOffset());
if (columnMetaData.getEncodingStats() != null
&& columnMetaData.getEncodingStats().hasDictionaryPages()) {
metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
}
long bloomFilterOffset = columnMetaData.getBloomFilterOffset();
if (bloomFilterOffset >= 0) {
metaData.setBloom_filter_offset(bloomFilterOffset);
}
int bloomFilterLength = columnMetaData.getBloomFilterLength();
if (bloomFilterLength >= 0) {
metaData.setBloom_filter_length(bloomFilterLength);
}
if (columnMetaData.getStatistics() != null
&& !columnMetaData.getStatistics().isEmpty()) {
metaData.setStatistics(
toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
}
if (columnMetaData.getEncodingStats() != null) {
metaData.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
}
if (columnMetaData.getSizeStatistics() != null
&& columnMetaData.getSizeStatistics().isValid()) {
metaData.setSize_statistics(toParquetSizeStatistics(columnMetaData.getSizeStatistics()));
}
if (!encryptMetaData) {
columnChunk.setMeta_data(metaData);
} else {
// Serialize and encrypt ColumnMetadata separately
byte[] columnMetaDataAAD = AesCipher.createModuleAAD(
fileEncryptor.getFileAAD(),
ModuleType.ColumnMetaData,
rowGroupOrdinal,
columnSetup.getOrdinal(),
-1);
if (null == tempOutStream) {
tempOutStream = new ByteArrayOutputStream();
} else {
tempOutStream.reset();
}
try {
writeColumnMetaData(metaData, tempOutStream, columnSetup.getMetaDataEncryptor(), columnMetaDataAAD);
} catch (IOException e) {
throw new ParquetCryptoRuntimeException(
"Failed to serialize and encrypt ColumnMetadata for " + columnMetaData.getPath(), e);
}
columnChunk.setEncrypted_column_metadata(tempOutStream.toByteArray());
// Keep redacted metadata version
if (!fileEncryptor.isFooterEncrypted()) {
ColumnMetaData metaDataRedacted = metaData.deepCopy();
if (metaDataRedacted.isSetStatistics()) metaDataRedacted.unsetStatistics();
if (metaDataRedacted.isSetEncoding_stats()) metaDataRedacted.unsetEncoding_stats();
columnChunk.setMeta_data(metaDataRedacted);
}
}
if (writeCryptoMetadata) {
columnChunk.setCrypto_metadata(columnSetup.getColumnCryptoMetaData());
}
// columnChunk.meta_data.index_page_offset = ;
// columnChunk.meta_data.key_value_metadata = ; // nothing yet
IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
if (columnIndexRef != null) {
columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
columnChunk.setColumn_index_length(columnIndexRef.getLength());
}
IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
if (offsetIndexRef != null) {
columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
columnChunk.setOffset_index_length(offsetIndexRef.getLength());
}
parquetColumns.add(columnChunk);
}
RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
rowGroup.setFile_offset(block.getStartingPos());
rowGroup.setTotal_compressed_size(block.getCompressedSize());
rowGroup.setOrdinal((short) rowGroupOrdinal);
rowGroups.add(rowGroup);
}
private List<Encoding> toFormatEncodings(Set<org.apache.parquet.column.Encoding> encodings) {
List<Encoding> converted = new ArrayList<Encoding>(encodings.size());
for (org.apache.parquet.column.Encoding encoding : encodings) {
converted.add(getEncoding(encoding));
}
return converted;
}
// Visible for testing
Set<org.apache.parquet.column.Encoding> fromFormatEncodings(List<Encoding> encodings) {
Set<org.apache.parquet.column.Encoding> converted = new HashSet<org.apache.parquet.column.Encoding>();
for (Encoding encoding : encodings) {
converted.add(getEncoding(encoding));
}
// make converted unmodifiable, drop reference to modifiable copy
converted = Collections.unmodifiableSet(converted);
// atomically update the cache
Set<org.apache.parquet.column.Encoding> cached = cachedEncodingSets.putIfAbsent(converted, converted);
if (cached == null) {
// cached == null signifies that converted was *not* in the cache previously
// so we can return converted instead of throwing it away, it has now
// been cached
cached = converted;
}
return cached;
}
private CompressionCodecName fromFormatCodec(CompressionCodec codec) {
return CompressionCodecName.valueOf(codec.toString());
}
private CompressionCodec toFormatCodec(CompressionCodecName codec) {
return CompressionCodec.valueOf(codec.toString());
}
public org.apache.parquet.column.Encoding getEncoding(Encoding encoding) {
return org.apache.parquet.column.Encoding.valueOf(encoding.name());
}
public Encoding getEncoding(org.apache.parquet.column.Encoding encoding) {
return Encoding.valueOf(encoding.name());
}
public EncodingStats convertEncodingStats(List<PageEncodingStats> stats) {
if (stats == null) {
return null;
}
EncodingStats.Builder builder = new EncodingStats.Builder();
for (PageEncodingStats stat : stats) {
switch (stat.getPage_type()) {
case DATA_PAGE_V2:
builder.withV2Pages();
// falls through
case DATA_PAGE:
builder.addDataEncoding(getEncoding(stat.getEncoding()), stat.getCount());
break;
case DICTIONARY_PAGE:
builder.addDictEncoding(getEncoding(stat.getEncoding()), stat.getCount());
break;
}
}
return builder.build();
}
public List<PageEncodingStats> convertEncodingStats(EncodingStats stats) {
if (stats == null) {
return null;
}
List<PageEncodingStats> formatStats = new ArrayList<PageEncodingStats>();
for (org.apache.parquet.column.Encoding encoding : stats.getDictionaryEncodings()) {
formatStats.add(new PageEncodingStats(
PageType.DICTIONARY_PAGE, getEncoding(encoding), stats.getNumDictionaryPagesEncodedAs(encoding)));
}
PageType dataPageType = (stats.usesV2Pages() ? PageType.DATA_PAGE_V2 : PageType.DATA_PAGE);
for (org.apache.parquet.column.Encoding encoding : stats.getDataEncodings()) {
formatStats.add(new PageEncodingStats(
dataPageType, getEncoding(encoding), stats.getNumDataPagesEncodedAs(encoding)));
}
return formatStats;
}
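/**
 * Converts column statistics into the Thrift {@link Statistics} struct. Binary min/max values
 * may be truncated, and statistics exceeding {@code MAX_STATS_SIZE} are dropped entirely rather
 * than truncated (see the comment inside the overload below).
 */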
public static Statistics toParquetStatistics(org.apache.parquet.column.statistics.Statistics stats) {
return toParquetStatistics(stats, ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
public static Statistics toParquetStatistics(
org.apache.parquet.column.statistics.Statistics stats, int truncateLength) {
Statistics formatStats = new Statistics();
// Don't write stats larger than the max size rather than truncating. The
// rationale is that some engines may use the minimum value in the page as
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
if (!stats.isEmpty() && withinLimit(stats, truncateLength)) {
formatStats.setNull_count(stats.getNumNulls());
if (stats.hasNonNullValue()) {
byte[] min;
byte[] max;
if (stats instanceof BinaryStatistics && truncateLength != Integer.MAX_VALUE) {
BinaryTruncator truncator = BinaryTruncator.getTruncator(stats.type());
min = tuncateMin(truncator, truncateLength, stats.getMinBytes());
max = tuncateMax(truncator, truncateLength, stats.getMaxBytes());
} else {
min = stats.getMinBytes();
max = stats.getMaxBytes();
}
// Fill the former min-max statistics only if the comparison logic is
// signed so the logic of V1 and V2 stats is the same (which is
// trivially true for equal min-max values)
if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) {
formatStats.setMin(min);
formatStats.setMax(max);
}
if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) {
formatStats.setMin_value(min);
formatStats.setMax_value(max);
}
}
}
return formatStats;
}
private static boolean withinLimit(org.apache.parquet.column.statistics.Statistics stats, int truncateLength) {
if (stats.isSmallerThan(MAX_STATS_SIZE)) {
return true;
}
if (!(stats instanceof BinaryStatistics)) {
return false;
}
BinaryStatistics binaryStatistics = (BinaryStatistics) stats;
return binaryStatistics.isSmallerThanWithTruncation(MAX_STATS_SIZE, truncateLength);
}
private static byte[] tuncateMin(BinaryTruncator truncator, int truncateLength, byte[] input) {
return truncator
.truncateMin(Binary.fromConstantByteArray(input), truncateLength)
.getBytes();
}
private static byte[] tuncateMax(BinaryTruncator truncator, int truncateLength, byte[] input) {
return truncator
.truncateMax(Binary.fromConstantByteArray(input), truncateLength)
.getBytes();
}
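// min_value/max_value are only usable when the column order is TYPE_DEFINED_ORDER.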
private static boolean isMinMaxStatsSupported(PrimitiveType type) {
return type.columnOrder().getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER;
}
/**
* @param statistics parquet format statistics
* @param type a primitive type name
* @return the statistics
* @deprecated will be removed in 2.0.0.
*/
@Deprecated
public static org.apache.parquet.column.statistics.Statistics fromParquetStatistics(
Statistics statistics, PrimitiveTypeName type) {
return fromParquetStatistics(null, statistics, type);
}
/**
* @param createdBy the created-by string from the file
* @param statistics parquet format statistics
* @param type a primitive type name
* @return the statistics
* @deprecated will be removed in 2.0.0.
*/
@Deprecated
public static org.apache.parquet.column.statistics.Statistics fromParquetStatistics(
String createdBy, Statistics statistics, PrimitiveTypeName type) {
return fromParquetStatisticsInternal(
createdBy,
statistics,
new PrimitiveType(Repetition.OPTIONAL, type, "fake_type"),
defaultSortOrder(type));
}
// Visible for testing
static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal(
String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) {
// create stats object based on the column type
org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type);
if (formatStats != null) {
// Use the new V2 min-max statistics over the former one if it is filled
if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) {
byte[] min = formatStats.min_value.array();
byte[] max = formatStats.max_value.array();
if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) {
statsBuilder.withMin(min);
statsBuilder.withMax(max);
}
} else {
boolean isSet = formatStats.isSetMax() && formatStats.isSetMin();
boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false;
boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder;
// NOTE: See docs in CorruptStatistics for explanation of why this check is needed
// The sort order is checked to avoid returning min/max stats that are not
// valid with the type's sort order. In previous releases, all stats were
// aggregated using a signed byte-wise ordering, which isn't valid for all the
// types (e.g. strings, decimals etc.).
if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName())
&& (sortOrdersMatch || maxEqualsMin)) {
if (isSet) {
statsBuilder.withMin(formatStats.min.array());
statsBuilder.withMax(formatStats.max.array());
}
}
}
if (formatStats.isSetNull_count()) {
statsBuilder.withNumNulls(formatStats.null_count);
}
}
return statsBuilder.build();
}
public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(
String createdBy, Statistics statistics, PrimitiveType type) {
SortOrder expectedOrder = overrideSortOrderToSigned(type) ? SortOrder.SIGNED : sortOrder(type);
return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder);
}
/**
* Sort order for page and column statistics. Types are associated with sort
* orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
* aggregated using a sort order. As of parquet-format version 2.3.1, the
* order used to aggregate stats is always SIGNED and is not stored in the
* Parquet file. These stats are discarded for types that need unsigned.
*
* See PARQUET-686.
*/
enum SortOrder {
SIGNED,
UNSIGNED,
UNKNOWN
}
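// Logical type annotations treated as "string-like" by overrideSortOrderToSigned below.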
private static final Set<Class> STRING_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
LogicalTypeAnnotation.StringLogicalTypeAnnotation.class,
LogicalTypeAnnotation.EnumLogicalTypeAnnotation.class,
LogicalTypeAnnotation.JsonLogicalTypeAnnotation.class,
LogicalTypeAnnotation.Float16LogicalTypeAnnotation.class)));
/**
* Returns whether to use signed order min and max with a type. It is safe to
* use signed min and max when the type is a string type and contains only
* ASCII characters (where the sign bit was 0). This checks whether the type
* is a string type and uses {@code useSignedStringMinMax} to determine if
* only ASCII characters were written.
*
* @param type a primitive type with a logical type annotation
* @return true if signed order min/max can be used with this type
*/
private boolean overrideSortOrderToSigned(PrimitiveType type) {
// even if the override is set, only return stats for string-ish types
// a null type annotation is considered string-ish because some writers
// failed to use the UTF8 annotation.
LogicalTypeAnnotation annotation = type.getLogicalTypeAnnotation();
return useSignedStringMinMax
&& PrimitiveTypeName.BINARY == type.getPrimitiveTypeName()
&& (annotation == null || STRING_TYPES.contains(annotation.getClass()));
}
/**
* @param primitive a primitive physical type
* @return the default sort order used when the logical type is not known
*/
private static SortOrder defaultSortOrder(PrimitiveTypeName primitive) {
switch (primitive) {
case BOOLEAN:
case INT32:
case INT64:
case FLOAT:
case DOUBLE:
return SortOrder.SIGNED;
case BINARY:
case FIXED_LEN_BYTE_ARRAY:
return SortOrder.UNSIGNED;
}
return SortOrder.UNKNOWN;
}
/**
* @param primitive a primitive type with a logical type annotation
* @return the "correct" sort order of the type that applications assume
*/
private static SortOrder sortOrder(PrimitiveType primitive) {
LogicalTypeAnnotation annotation = primitive.getLogicalTypeAnnotation();
if (annotation != null) {
return annotation
.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<SortOrder>() {
@Override
public Optional visit(
LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
return intLogicalType.isSigned() ? of(SortOrder.SIGNED) : of(SortOrder.UNSIGNED);
}
@Override
public Optional visit(
LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
return of(SortOrder.UNKNOWN);
}
@Override
public Optional visit(
LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
return of(SortOrder.SIGNED);
}
@Override
public Optional visit(
LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
return of(SortOrder.UNSIGNED);
}
@Override
public Optional visit(
LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
return of(SortOrder.UNSIGNED);
}
@Override
public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) {
return of(SortOrder.UNSIGNED);
}
@Override
public Optional visit(
LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
return of(SortOrder.UNSIGNED);
}
@Override
public Optional visit(
LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
return of(SortOrder.UNSIGNED);
}
@Override
public Optional visit(
LogicalTypeAnnotation.Float16LogicalTypeAnnotation float16LogicalType) {
return of(SortOrder.SIGNED);
}
@Override
public Optional visit(
LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
return of(SortOrder.UNKNOWN);
}
@Override
public Optional visit(
LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) {
return of(SortOrder.UNKNOWN);
}
@Override
public Optional visit(
LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
return of(SortOrder.UNKNOWN);
}
@Override
public Optional visit(
LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
return of(SortOrder.UNKNOWN);
}
@Override
public Optional visit(
LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
return of(SortOrder.SIGNED);
}
@Override
public Optional visit(
LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
return of(SortOrder.SIGNED);
}
})
.orElse(defaultSortOrder(primitive.getPrimitiveTypeName()));
}
return defaultSortOrder(primitive.getPrimitiveTypeName());
}
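// Mapping between the Thrift physical Type and the internal PrimitiveTypeName (both directions).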
public PrimitiveTypeName getPrimitive(Type type) {
switch (type) {
case BYTE_ARRAY: // TODO: rename BINARY and remove this switch
return PrimitiveTypeName.BINARY;
case INT64:
return PrimitiveTypeName.INT64;
case INT32:
return PrimitiveTypeName.INT32;
case BOOLEAN:
return PrimitiveTypeName.BOOLEAN;
case FLOAT:
return PrimitiveTypeName.FLOAT;
case DOUBLE:
return PrimitiveTypeName.DOUBLE;
case INT96:
return PrimitiveTypeName.INT96;
case FIXED_LEN_BYTE_ARRAY:
return PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
default:
throw new RuntimeException("Unknown type " + type);
}
}
// Visible for testing
Type getType(PrimitiveTypeName type) {
switch (type) {
case INT64:
return Type.INT64;
case INT32:
return Type.INT32;
case BOOLEAN:
return Type.BOOLEAN;
case BINARY:
return Type.BYTE_ARRAY;
case FLOAT:
return Type.FLOAT;
case DOUBLE:
return Type.DOUBLE;
case INT96:
return Type.INT96;
case FIXED_LEN_BYTE_ARRAY:
return Type.FIXED_LEN_BYTE_ARRAY;
default:
throw new RuntimeException("Unknown primitive type " + type);
}
}
// Visible for testing
LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) {
switch (type) {
case UTF8:
return LogicalTypeAnnotation.stringType();
case MAP:
return LogicalTypeAnnotation.mapType();
case MAP_KEY_VALUE:
return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance();
case LIST:
return LogicalTypeAnnotation.listType();
case ENUM:
return LogicalTypeAnnotation.enumType();
case DECIMAL:
int scale = (schemaElement == null ? 0 : schemaElement.scale);
int precision = (schemaElement == null ? 0 : schemaElement.precision);
return LogicalTypeAnnotation.decimalType(scale, precision);
case DATE:
return LogicalTypeAnnotation.dateType();
case TIME_MILLIS:
return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
case TIME_MICROS:
return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
case TIMESTAMP_MILLIS:
return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
case TIMESTAMP_MICROS:
return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
case INTERVAL:
return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance();
case INT_8:
return LogicalTypeAnnotation.intType(8, true);
case INT_16:
return LogicalTypeAnnotation.intType(16, true);
case INT_32:
return LogicalTypeAnnotation.intType(32, true);
case INT_64:
return LogicalTypeAnnotation.intType(64, true);
case UINT_8:
return LogicalTypeAnnotation.intType(8, false);
case UINT_16:
return LogicalTypeAnnotation.intType(16, false);
case UINT_32:
return LogicalTypeAnnotation.intType(32, false);
case UINT_64:
return LogicalTypeAnnotation.intType(64, false);
case JSON:
return LogicalTypeAnnotation.jsonType();
case BSON:
return LogicalTypeAnnotation.bsonType();
default:
throw new RuntimeException(
"Can't convert converted type to logical type, unknown converted type " + type);
}
}
LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) {
switch (type.getSetField()) {
case MAP:
return LogicalTypeAnnotation.mapType();
case BSON:
return LogicalTypeAnnotation.bsonType();
case DATE:
return LogicalTypeAnnotation.dateType();
case ENUM:
return LogicalTypeAnnotation.enumType();
case JSON:
return LogicalTypeAnnotation.jsonType();
case LIST:
return LogicalTypeAnnotation.listType();
case TIME:
TimeType time = type.getTIME();
return LogicalTypeAnnotation.timeType(time.isAdjustedToUTC, convertTimeUnit(time.unit));
case STRING:
return LogicalTypeAnnotation.stringType();
case DECIMAL:
DecimalType decimal = type.getDECIMAL();
return LogicalTypeAnnotation.decimalType(decimal.scale, decimal.precision);
case INTEGER:
IntType integer = type.getINTEGER();
return LogicalTypeAnnotation.intType(integer.bitWidth, integer.isSigned);
case UNKNOWN:
return null;
case TIMESTAMP:
TimestampType timestamp = type.getTIMESTAMP();
return LogicalTypeAnnotation.timestampType(timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit));
case UUID:
return LogicalTypeAnnotation.uuidType();
case FLOAT16:
return LogicalTypeAnnotation.float16Type();
default:
throw new RuntimeException("Unknown logical type " + type);
}
}
private LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) {
switch (unit.getSetField()) {
case MICROS:
return LogicalTypeAnnotation.TimeUnit.MICROS;
case MILLIS:
return LogicalTypeAnnotation.TimeUnit.MILLIS;
case NANOS:
return LogicalTypeAnnotation.TimeUnit.NANOS;
default:
throw new RuntimeException("Unknown time unit " + unit);
}
}
private static void addKeyValue(FileMetaData fileMetaData, String key, String value) {
KeyValue keyValue = new KeyValue(key);
keyValue.value = value;
fileMetaData.addToKey_value_metadata(keyValue);
}
private static interface MetadataFilterVisitor<T, E extends Throwable> {
T visit(NoFilter filter) throws E;
T visit(SkipMetadataFilter filter) throws E;
T visit(RangeMetadataFilter filter) throws E;
T visit(OffsetMetadataFilter filter) throws E;
}
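/**
 * A filter applied while reading the footer; it controls which row groups are kept
 * (all of them, none of them, or only those matching a byte range or a set of start offsets).
 */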
public abstract static class MetadataFilter {
private MetadataFilter() {}
abstract <T, E extends Throwable> T accept(MetadataFilterVisitor<T, E> visitor) throws E;
}
/**
* [ startOffset, endOffset )
*
* @param startOffset a start offset (inclusive)
* @param endOffset an end offset (exclusive)
* @return a range filter from the offsets
*/
public static MetadataFilter range(long startOffset, long endOffset) {
return new RangeMetadataFilter(startOffset, endOffset);
}
public static MetadataFilter offsets(long... offsets) {
Set<Long> set = new HashSet<Long>();
for (long offset : offsets) {
set.add(offset);
}
return new OffsetMetadataFilter(set);
}
private static final class NoFilter extends MetadataFilter {
private NoFilter() {}
@Override
<T, E extends Throwable> T accept(MetadataFilterVisitor<T, E> visitor) throws E {
return visitor.visit(this);
}
@Override
public String toString() {
return "NO_FILTER";
}
}
private static final class SkipMetadataFilter extends MetadataFilter {
private SkipMetadataFilter() {}
@Override
<T, E extends Throwable> T accept(MetadataFilterVisitor<T, E> visitor) throws E {
return visitor.visit(this);
}
@Override
public String toString() {
return "SKIP_ROW_GROUPS";
}
}
/**
* [ startOffset, endOffset )
*/
// Visible for testing
static final class RangeMetadataFilter extends MetadataFilter {
final long startOffset;
final long endOffset;
RangeMetadataFilter(long startOffset, long endOffset) {
super();
this.startOffset = startOffset;
this.endOffset = endOffset;
}
@Override
<T, E extends Throwable> T accept(MetadataFilterVisitor<T, E> visitor) throws E {
return visitor.visit(this);
}
public boolean contains(long offset) {
return offset >= this.startOffset && offset < this.endOffset;
}
@Override
public String toString() {
return "range(s:" + startOffset + ", e:" + endOffset + ")";
}
}
static final class OffsetMetadataFilter extends MetadataFilter {
private final Set<Long> offsets;
public OffsetMetadataFilter(Set<Long> offsets) {
this.offsets = offsets;
}
public boolean contains(long offset) {
return offsets.contains(offset);
}
@Override
<T, E extends Throwable> T accept(MetadataFilterVisitor<T, E> visitor) throws E {
return visitor.visit(this);
}
}
@Deprecated
public ParquetMetadata readParquetMetadata(InputStream from) throws IOException {
return readParquetMetadata(from, NO_FILTER);
}
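// Keeps a row group if the midpoint of its byte range falls inside the filter range, so that
// each row group is assigned to exactly one non-overlapping split.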
// Visible for testing
static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) {
List<RowGroup> rowGroups = metaData.getRow_groups();
List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
long preStartIndex = 0;
long preCompressedSize = 0;
boolean firstColumnWithMetadata = true;
if (rowGroups != null && !rowGroups.isEmpty()) {
firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data();
}
for (RowGroup rowGroup : rowGroups) {
long totalSize = 0;
long startIndex;
ColumnChunk columnChunk = rowGroup.getColumns().get(0);
if (firstColumnWithMetadata) {
startIndex = getOffset(columnChunk);
} else {
assert rowGroup.isSetFile_offset();
assert rowGroup.isSetTotal_compressed_size();
// the file_offset of first block always holds the truth, while other blocks don't :
// see PARQUET-2078 for details
startIndex = rowGroup.getFile_offset();
if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) {
// first row group's offset is always 4
if (preStartIndex == 0) {
startIndex = 4;
} else {
// use minStartIndex(imprecise in case of padding, but good enough for filtering)
startIndex = preStartIndex + preCompressedSize;
}
}
preStartIndex = startIndex;
preCompressedSize = rowGroup.getTotal_compressed_size();
}
if (rowGroup.isSetTotal_compressed_size()) {
totalSize = rowGroup.getTotal_compressed_size();
} else {
for (ColumnChunk col : rowGroup.getColumns()) {
totalSize += col.getMeta_data().getTotal_compressed_size();
}
}
long midPoint = startIndex + totalSize / 2;
if (filter.contains(midPoint)) {
newRowGroups.add(rowGroup);
}
}
metaData.setRow_groups(newRowGroups);
return metaData;
}
private static boolean invalidFileOffset(long startIndex, long preStartIndex, long preCompressedSize) {
boolean invalid = false;
assert preStartIndex <= startIndex;
// checking the first rowGroup
if (preStartIndex == 0 && startIndex != 4) {
invalid = true;
return invalid;
}
// calculate start index for other blocks
long minStartIndex = preStartIndex + preCompressedSize;
if (startIndex < minStartIndex) {
// a bad offset detected, try first column's offset
// can not use minStartIndex in case of padding
invalid = true;
}
return invalid;
}
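// Keeps a row group if its start offset is one of the offsets requested by the filter.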
// Visible for testing
static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
List<RowGroup> rowGroups = metaData.getRow_groups();
List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
long preStartIndex = 0;
long preCompressedSize = 0;
boolean firstColumnWithMetadata = true;
if (rowGroups != null && !rowGroups.isEmpty()) {
firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data();
}
for (RowGroup rowGroup : rowGroups) {
long startIndex;
ColumnChunk columnChunk = rowGroup.getColumns().get(0);
if (firstColumnWithMetadata) {
startIndex = getOffset(columnChunk);
} else {
assert rowGroup.isSetFile_offset();
assert rowGroup.isSetTotal_compressed_size();
// the file_offset of first block always holds the truth, while other blocks don't :
// see PARQUET-2078 for details
startIndex = rowGroup.getFile_offset();
if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) {
// first row group's offset is always 4
if (preStartIndex == 0) {
startIndex = 4;
} else {
throw new InvalidFileOffsetException("corrupted RowGroup.file_offset found, "
+ "please use file range instead of block offset for split.");
}
}
preStartIndex = startIndex;
preCompressedSize = rowGroup.getTotal_compressed_size();
}
if (filter.contains(startIndex)) {
newRowGroups.add(rowGroup);
}
}
metaData.setRow_groups(newRowGroups);
return metaData;
}
static long getOffset(RowGroup rowGroup) {
if (rowGroup.isSetFile_offset()) {
return rowGroup.getFile_offset();
}
return getOffset(rowGroup.getColumns().get(0));
}
// Visible for testing
static long getOffset(ColumnChunk columnChunk) {
ColumnMetaData md = columnChunk.getMeta_data();
long offset = md.getData_page_offset();
if (md.isSetDictionary_page_offset() && offset > md.getDictionary_page_offset()) {
offset = md.getDictionary_page_offset();
}
return offset;
}
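/**
 * Verifies the signature of a plaintext footer: re-encrypts the serialized footer with the
 * footer signing key and compares the resulting GCM tag against the tag stored in the file.
 */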
private static void verifyFooterIntegrity(
InputStream from, InternalFileDecryptor fileDecryptor, int combinedFooterLength) throws IOException {
byte[] nonce = new byte[AesCipher.NONCE_LENGTH];
from.read(nonce);
byte[] gcmTag = new byte[AesCipher.GCM_TAG_LENGTH];
from.read(gcmTag);
AesGcmEncryptor footerSigner = fileDecryptor.createSignedFooterEncryptor();
int footerSignatureLength = AesCipher.NONCE_LENGTH + AesCipher.GCM_TAG_LENGTH;
byte[] serializedFooter = new byte[combinedFooterLength - footerSignatureLength];
// Resetting to the beginning of the footer
from.reset();
from.read(serializedFooter);
byte[] signedFooterAAD = AesCipher.createFooterAAD(fileDecryptor.getFileAAD());
byte[] encryptedFooterBytes = footerSigner.encrypt(false, serializedFooter, nonce, signedFooterAAD);
byte[] calculatedTag = new byte[AesCipher.GCM_TAG_LENGTH];
System.arraycopy(
encryptedFooterBytes,
encryptedFooterBytes.length - AesCipher.GCM_TAG_LENGTH,
calculatedTag,
0,
AesCipher.GCM_TAG_LENGTH);
if (!Arrays.equals(gcmTag, calculatedTag)) {
throw new TagVerificationException("Signature mismatch in plaintext footer");
}
}
public ParquetMetadata readParquetMetadata(final InputStream from, MetadataFilter filter) throws IOException {
return readParquetMetadata(from, filter, null, false, 0);
}
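// Computes, for each row group, the index of its first row within the file by accumulating num_rows.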
private Map<RowGroup, Long> generateRowGroupOffsets(FileMetaData metaData) {
Map<RowGroup, Long> rowGroupOrdinalToRowIdx = new HashMap<>();
List<RowGroup> rowGroups = metaData.getRow_groups();
if (rowGroups != null) {
long rowIdxSum = 0;
for (int i = 0; i < rowGroups.size(); i++) {
rowGroupOrdinalToRowIdx.put(rowGroups.get(i), rowIdxSum);
rowIdxSum += rowGroups.get(i).getNum_rows();
}
}
return rowGroupOrdinalToRowIdx;
}
/**
* A container for [[FileMetaData]] and [[RowGroup]] to ROW_INDEX offset map.
*/
private class FileMetaDataAndRowGroupOffsetInfo {
final FileMetaData fileMetadata;
final Map<RowGroup, Long> rowGroupToRowIndexOffsetMap;
public FileMetaDataAndRowGroupOffsetInfo(
FileMetaData fileMetadata, Map<RowGroup, Long> rowGroupToRowIndexOffsetMap) {
this.fileMetadata = fileMetadata;
this.rowGroupToRowIndexOffsetMap = rowGroupToRowIndexOffsetMap;
}
}
public ParquetMetadata readParquetMetadata(
final InputStream fromInputStream,
MetadataFilter filter,
final InternalFileDecryptor fileDecryptor,
final boolean encryptedFooter,
final int combinedFooterLength)
throws IOException {
final BlockCipher.Decryptor footerDecryptor = (encryptedFooter ? fileDecryptor.fetchFooterDecryptor() : null);
final byte[] encryptedFooterAAD =
(encryptedFooter ? AesCipher.createFooterAAD(fileDecryptor.getFileAAD()) : null);
// Mark the beginning of the footer for verifyFooterIntegrity
final InputStream from;
if (fileDecryptor != null && fileDecryptor.checkFooterIntegrity()) {
// fromInputStream should already support marking but let's be on the safe side
if (!fromInputStream.markSupported()) {
from = new BufferedInputStream(fromInputStream, combinedFooterLength);
} else {
from = fromInputStream;
}
from.mark(combinedFooterLength);
} else {
from = fromInputStream;
}
FileMetaDataAndRowGroupOffsetInfo fileMetaDataAndRowGroupInfo =
filter.accept(new MetadataFilterVisitor<FileMetaDataAndRowGroupOffsetInfo, IOException>() {
@Override
public FileMetaDataAndRowGroupOffsetInfo visit(NoFilter filter) throws IOException {
FileMetaData fileMetadata = readFileMetaData(from, footerDecryptor, encryptedFooterAAD);
return new FileMetaDataAndRowGroupOffsetInfo(
fileMetadata, generateRowGroupOffsets(fileMetadata));
}
@Override
public FileMetaDataAndRowGroupOffsetInfo visit(SkipMetadataFilter filter) throws IOException {
FileMetaData fileMetadata = readFileMetaData(from, true, footerDecryptor, encryptedFooterAAD);
return new FileMetaDataAndRowGroupOffsetInfo(
fileMetadata, generateRowGroupOffsets(fileMetadata));
}
@Override
public FileMetaDataAndRowGroupOffsetInfo visit(OffsetMetadataFilter filter) throws IOException {
FileMetaData fileMetadata = readFileMetaData(from, footerDecryptor, encryptedFooterAAD);
// We must generate the map *before* filtering because it modifies `fileMetadata`.
Map<RowGroup, Long> rowGroupToRowIndexOffsetMap = generateRowGroupOffsets(fileMetadata);
FileMetaData filteredFileMetadata = filterFileMetaDataByStart(fileMetadata, filter);
return new FileMetaDataAndRowGroupOffsetInfo(filteredFileMetadata, rowGroupToRowIndexOffsetMap);
}
@Override
public FileMetaDataAndRowGroupOffsetInfo visit(RangeMetadataFilter filter) throws IOException {
FileMetaData fileMetadata = readFileMetaData(from, footerDecryptor, encryptedFooterAAD);
// We must generate the map *before* filtering because it modifies `fileMetadata`.
Map<RowGroup, Long> rowGroupToRowIndexOffsetMap = generateRowGroupOffsets(fileMetadata);
FileMetaData filteredFileMetadata = filterFileMetaDataByMidpoint(fileMetadata, filter);
return new FileMetaDataAndRowGroupOffsetInfo(filteredFileMetadata, rowGroupToRowIndexOffsetMap);
}
});
FileMetaData fileMetaData = fileMetaDataAndRowGroupInfo.fileMetadata;
Map<RowGroup, Long> rowGroupToRowIndexOffsetMap = fileMetaDataAndRowGroupInfo.rowGroupToRowIndexOffsetMap;
LOG.debug("{}", fileMetaData);
if (!encryptedFooter && null != fileDecryptor) {
if (!fileMetaData.isSetEncryption_algorithm()) { // Plaintext file
fileDecryptor.setPlaintextFile();
// Done to detect files that were not encrypted by mistake
if (!fileDecryptor.plaintextFilesAllowed()) {
throw new ParquetCryptoRuntimeException("Applying decryptor on plaintext file");
}
} else { // Encrypted file with plaintext footer
// if no fileDecryptor, can still read plaintext columns
fileDecryptor.setFileCryptoMetaData(
fileMetaData.getEncryption_algorithm(), false, fileMetaData.getFooter_signing_key_metadata());
if (fileDecryptor.checkFooterIntegrity()) {
verifyFooterIntegrity(from, fileDecryptor, combinedFooterLength);
}
}
}
ParquetMetadata parquetMetadata =
fromParquetMetadata(fileMetaData, fileDecryptor, encryptedFooter, rowGroupToRowIndexOffsetMap);
if (LOG.isDebugEnabled()) LOG.debug(ParquetMetadata.toPrettyJSON(parquetMetadata));
return parquetMetadata;
}
public ColumnChunkMetaData buildColumnChunkMetaData(
ColumnMetaData metaData, ColumnPath columnPath, PrimitiveType type, String createdBy) {
return ColumnChunkMetaData.get(
columnPath,
type,
fromFormatCodec(metaData.codec),
convertEncodingStats(metaData.getEncoding_stats()),
fromFormatEncodings(metaData.encodings),
fromParquetStatistics(createdBy, metaData.statistics, type),
metaData.data_page_offset,
metaData.dictionary_page_offset,
metaData.num_values,
metaData.total_compressed_size,
metaData.total_uncompressed_size,
fromParquetSizeStatistics(metaData.size_statistics, type));
}
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
return fromParquetMetadata(parquetMetadata, null, false);
}
public ParquetMetadata fromParquetMetadata(
FileMetaData parquetMetadata, InternalFileDecryptor fileDecryptor, boolean encryptedFooter)
throws IOException {
return fromParquetMetadata(parquetMetadata, fileDecryptor, encryptedFooter, new HashMap<RowGroup, Long>());
}
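/**
 * Converts the Thrift {@link FileMetaData} read from the footer back into the internal
 * {@link ParquetMetadata}, decrypting column metadata where necessary (or deferring decryption
 * for columns encrypted with a column-specific key until they are actually projected).
 */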
public ParquetMetadata fromParquetMetadata(
FileMetaData parquetMetadata,
InternalFileDecryptor fileDecryptor,
boolean encryptedFooter,
Map<RowGroup, Long> rowGroupToRowIndexOffsetMap)
throws IOException {
MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
List<RowGroup> row_groups = parquetMetadata.getRow_groups();
if (row_groups != null) {
for (RowGroup rowGroup : row_groups) {
BlockMetaData blockMetaData = new BlockMetaData();
blockMetaData.setRowCount(rowGroup.getNum_rows());
blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
if (rowGroupToRowIndexOffsetMap.containsKey(rowGroup)) {
blockMetaData.setRowIndexOffset(rowGroupToRowIndexOffsetMap.get(rowGroup));
}
// not set in legacy files
if (rowGroup.isSetOrdinal()) {
blockMetaData.setOrdinal(rowGroup.getOrdinal());
}
List<ColumnChunk> columns = rowGroup.getColumns();
String filePath = columns.get(0).getFile_path();
int columnOrdinal = -1;
for (ColumnChunk columnChunk : columns) {
columnOrdinal++;
if ((filePath == null && columnChunk.getFile_path() != null)
|| (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
throw new ParquetDecodingException(
"all column chunks of the same row group must be in the same file for now");
}
ColumnMetaData metaData = columnChunk.meta_data;
ColumnCryptoMetaData cryptoMetaData = columnChunk.getCrypto_metadata();
ColumnChunkMetaData column = null;
ColumnPath columnPath = null;
boolean lazyMetadataDecryption = false;
if (null == cryptoMetaData) { // Plaintext column
columnPath = getPath(metaData);
if (null != fileDecryptor && !fileDecryptor.plaintextFile()) {
// mark this column as plaintext in encrypted file decryptor
fileDecryptor.setColumnCryptoMetadata(
columnPath, false, false, (byte[]) null, columnOrdinal);
}
} else { // Encrypted column
boolean encryptedWithFooterKey = cryptoMetaData.isSetENCRYPTION_WITH_FOOTER_KEY();
if (encryptedWithFooterKey) { // Column encrypted with footer key
if (null == fileDecryptor) {
throw new ParquetCryptoRuntimeException(
"Column encrypted with footer key: No keys available");
}
if (null == metaData) {
throw new ParquetCryptoRuntimeException(
"ColumnMetaData not set in Encryption with Footer key");
}
columnPath = getPath(metaData);
if (!encryptedFooter) { // Unencrypted footer. Decrypt full column metadata, using footer
// key
ByteArrayInputStream tempInputStream =
new ByteArrayInputStream(columnChunk.getEncrypted_column_metadata());
byte[] columnMetaDataAAD = AesCipher.createModuleAAD(
fileDecryptor.getFileAAD(),
ModuleType.ColumnMetaData,
rowGroup.getOrdinal(),
columnOrdinal,
-1);
try {
metaData = readColumnMetaData(
tempInputStream, fileDecryptor.fetchFooterDecryptor(), columnMetaDataAAD);
} catch (IOException e) {
throw new ParquetCryptoRuntimeException(
columnPath + ". Failed to decrypt column metadata", e);
}
}
fileDecryptor.setColumnCryptoMetadata(columnPath, true, true, (byte[]) null, columnOrdinal);
} else { // Column encrypted with column key
// setColumnCryptoMetadata triggers KMS interaction, hence delayed until this column is
// projected
lazyMetadataDecryption = true;
}
}
String createdBy = parquetMetadata.getCreated_by();
if (!lazyMetadataDecryption) { // full column metadata (with stats) is available
column = buildColumnChunkMetaData(
metaData,
columnPath,
messageType.getType(columnPath.toArray()).asPrimitiveType(),
createdBy);
column.setRowGroupOrdinal(rowGroup.getOrdinal());
if (metaData.isSetBloom_filter_offset()) {
column.setBloomFilterOffset(metaData.getBloom_filter_offset());
}
if (metaData.isSetBloom_filter_length()) {
column.setBloomFilterLength(metaData.getBloom_filter_length());
}
} else { // column encrypted with column key
// Metadata will be decrypted later, if this column is accessed
EncryptionWithColumnKey columnKeyStruct = cryptoMetaData.getENCRYPTION_WITH_COLUMN_KEY();
List<String> pathList = columnKeyStruct.getPath_in_schema();
byte[] columnKeyMetadata = columnKeyStruct.getKey_metadata();
columnPath = ColumnPath.get(pathList.toArray(new String[pathList.size()]));
byte[] encryptedMetadataBuffer = columnChunk.getEncrypted_column_metadata();
column = ColumnChunkMetaData.getWithEncryptedMetadata(
this,
columnPath,
messageType.getType(columnPath.toArray()).asPrimitiveType(),
encryptedMetadataBuffer,
columnKeyMetadata,
fileDecryptor,
rowGroup.getOrdinal(),
columnOrdinal,
createdBy);
}
column.setColumnIndexReference(toColumnIndexReference(columnChunk));
column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
// TODO
// index_page_offset
// key_value_metadata
blockMetaData.addColumn(column);
}
blockMetaData.setPath(filePath);
blocks.add(blockMetaData);
}
}
Map<String, String> keyValueMetaData = new HashMap<>();
List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
if (key_value_metadata != null) {
for (KeyValue keyValue : key_value_metadata) {
keyValueMetaData.put(keyValue.key, keyValue.value);
}
}
EncryptionType encryptionType;
if (encryptedFooter) {
encryptionType = EncryptionType.ENCRYPTED_FOOTER;
} else if (parquetMetadata.isSetEncryption_algorithm()) {
encryptionType = EncryptionType.PLAINTEXT_FOOTER;
} else {
encryptionType = EncryptionType.UNENCRYPTED;
}
return new ParquetMetadata(
new org.apache.parquet.hadoop.metadata.FileMetaData(
messageType, keyValueMetaData, parquetMetadata.getCreated_by(), encryptionType, fileDecryptor),
blocks);
}
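// Offsets/lengths of the column and offset indexes are optional; return null when they are absent.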
private static IndexReference toColumnIndexReference(ColumnChunk columnChunk) {
if (columnChunk.isSetColumn_index_offset() && columnChunk.isSetColumn_index_length()) {
return new IndexReference(columnChunk.getColumn_index_offset(), columnChunk.getColumn_index_length());
}
return null;
}
private static IndexReference toOffsetIndexReference(ColumnChunk columnChunk) {
if (columnChunk.isSetOffset_index_offset() && columnChunk.isSetOffset_index_length()) {
return new IndexReference(columnChunk.getOffset_index_offset(), columnChunk.getOffset_index_length());
}
return null;
}
private static ColumnPath getPath(ColumnMetaData metaData) {
String[] path = metaData.path_in_schema.toArray(new String[0]);
return ColumnPath.get(path);
}
// Visible for testing
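// Rebuilds the parquet-mr MessageType from the flattened Thrift schema list; the first element is the root.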
MessageType fromParquetSchema(List<SchemaElement> schema, List<ColumnOrder> columnOrders) {
Iterator<SchemaElement> iterator = schema.iterator();
SchemaElement root = iterator.next();
Types.MessageTypeBuilder builder = Types.buildMessage();
if (root.isSetField_id()) {
builder.id(root.field_id);
}
buildChildren(builder, iterator, root.getNum_children(), columnOrders, 0);
return builder.named(root.name);
}
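/**
 * Recursively consumes {@code childrenCount} elements from the flattened schema iterator, attaching
 * primitives (with length/precision/scale and column order) and nested groups to the given builder.
 * When both a converted type and a logical type are present but disagree, the converted type wins.
 */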
private void buildChildren(
Types.GroupBuilder builder,
Iterator<SchemaElement> schema,
int childrenCount,
List<ColumnOrder> columnOrders,
int columnCount) {
for (int i = 0; i < childrenCount; i++) {
SchemaElement schemaElement = schema.next();
// Create Parquet Type.
Types.Builder childBuilder;
if (schemaElement.type != null) {
Types.PrimitiveBuilder primitiveBuilder = builder.primitive(
getPrimitive(schemaElement.type), fromParquetRepetition(schemaElement.repetition_type));
if (schemaElement.isSetType_length()) {
primitiveBuilder.length(schemaElement.type_length);
}
if (schemaElement.isSetPrecision()) {
primitiveBuilder.precision(schemaElement.precision);
}
if (schemaElement.isSetScale()) {
primitiveBuilder.scale(schemaElement.scale);
}
if (columnOrders != null) {
org.apache.parquet.schema.ColumnOrder columnOrder =
fromParquetColumnOrder(columnOrders.get(columnCount));
// As per parquet-format 2.4.0 there is no UNDEFINED entry in the column order list, so fall back to the
// undefined column order here for types whose ordering is not supported (INT96, INTERVAL).
if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER
&& (schemaElement.type == Type.INT96
|| schemaElement.converted_type == ConvertedType.INTERVAL)) {
columnOrder = org.apache.parquet.schema.ColumnOrder.undefined();
}
primitiveBuilder.columnOrder(columnOrder);
}
childBuilder = primitiveBuilder;
} else {
childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type));
buildChildren(
(Types.GroupBuilder) childBuilder,
schema,
schemaElement.num_children,
columnOrders,
columnCount);
}
if (schemaElement.isSetLogicalType()) {
childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType));
}
if (schemaElement.isSetConverted_type()) {
OriginalType originalType = getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement)
.toOriginalType();
OriginalType newOriginalType = (schemaElement.isSetLogicalType()
&& getLogicalTypeAnnotation(schemaElement.logicalType) != null)
? getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType()
: null;
if (!originalType.equals(newOriginalType)) {
if (newOriginalType != null) {
LOG.warn(
"Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). Using value in converted type.",
schemaElement.converted_type,
schemaElement.logicalType);
}
childBuilder.as(originalType);
}
}
if (schemaElement.isSetField_id()) {
childBuilder.id(schemaElement.field_id);
}
childBuilder.named(schemaElement.name);
++columnCount;
}
}
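// Repetition enums share names across parquet-format and parquet-mr, so these conversions are name lookups.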
// Visible for testing
FieldRepetitionType toParquetRepetition(Repetition repetition) {
return FieldRepetitionType.valueOf(repetition.name());
}
// Visible for testing
Repetition fromParquetRepetition(FieldRepetitionType repetition) {
return Repetition.valueOf(repetition.name());
}
private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(ColumnOrder columnOrder) {
if (columnOrder.isSetTYPE_ORDER()) {
return org.apache.parquet.schema.ColumnOrder.typeDefined();
}
// Other column orders are not yet supported by this API; treat them as undefined.
return org.apache.parquet.schema.ColumnOrder.undefined();
}
@Deprecated
public void writeDataPageHeader(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding,
OutputStream to)
throws IOException {
writePageHeader(
newDataPageHeader(uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding),
to);
}
// Statistics are no longer saved in page headers
@Deprecated
public void writeDataPageHeader(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.statistics.Statistics statistics,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding,
OutputStream to)
throws IOException {
writePageHeader(
newDataPageHeader(uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding),
to);
}
private PageHeader newDataPageHeader(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding) {
PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize);
pageHeader.setData_page_header(new DataPageHeader(
valueCount, getEncoding(valuesEncoding), getEncoding(dlEncoding), getEncoding(rlEncoding)));
return pageHeader;
}
private PageHeader newDataPageHeader(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding,
int crc) {
PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize);
pageHeader.setCrc(crc);
pageHeader.setData_page_header(new DataPageHeader(
valueCount, getEncoding(valuesEncoding), getEncoding(dlEncoding), getEncoding(rlEncoding)));
return pageHeader;
}
// Statistics are no longer saved in page headers
@Deprecated
public void writeDataPageV2Header(
int uncompressedSize,
int compressedSize,
int valueCount,
int nullCount,
int rowCount,
org.apache.parquet.column.statistics.Statistics statistics,
org.apache.parquet.column.Encoding dataEncoding,
int rlByteLength,
int dlByteLength,
OutputStream to)
throws IOException {
writePageHeader(
newDataPageV2Header(
uncompressedSize,
compressedSize,
valueCount,
nullCount,
rowCount,
dataEncoding,
rlByteLength,
dlByteLength),
to);
}
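/**
 * Writes a v1 data page header; overloads add an optional CRC and optional header encryption
 * (via a {@link BlockCipher.Encryptor} and a page-header AAD).
 */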
public void writeDataPageV1Header(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding,
OutputStream to)
throws IOException {
writeDataPageV1Header(
uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding, to, null, null);
}
public void writeDataPageV1Header(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding,
OutputStream to,
BlockCipher.Encryptor blockEncryptor,
byte[] pageHeaderAAD)
throws IOException {
writePageHeader(
newDataPageHeader(uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding),
to,
blockEncryptor,
pageHeaderAAD);
}
public void writeDataPageV1Header(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding,
int crc,
OutputStream to)
throws IOException {
writeDataPageV1Header(
uncompressedSize,
compressedSize,
valueCount,
rlEncoding,
dlEncoding,
valuesEncoding,
crc,
to,
null,
null);
}
public void writeDataPageV1Header(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding,
int crc,
OutputStream to,
BlockCipher.Encryptor blockEncryptor,
byte[] pageHeaderAAD)
throws IOException {
writePageHeader(
newDataPageHeader(
uncompressedSize, compressedSize, valueCount, rlEncoding, dlEncoding, valuesEncoding, crc),
to,
blockEncryptor,
pageHeaderAAD);
}
public void writeDataPageV2Header(
int uncompressedSize,
int compressedSize,
int valueCount,
int nullCount,
int rowCount,
org.apache.parquet.column.Encoding dataEncoding,
int rlByteLength,
int dlByteLength,
OutputStream to)
throws IOException {
writeDataPageV2Header(
uncompressedSize,
compressedSize,
valueCount,
nullCount,
rowCount,
dataEncoding,
rlByteLength,
dlByteLength,
to,
null,
null);
}
public void writeDataPageV2Header(
int uncompressedSize,
int compressedSize,
int valueCount,
int nullCount,
int rowCount,
org.apache.parquet.column.Encoding dataEncoding,
int rlByteLength,
int dlByteLength,
OutputStream to,
BlockCipher.Encryptor blockEncryptor,
byte[] pageHeaderAAD)
throws IOException {
writePageHeader(
newDataPageV2Header(
uncompressedSize,
compressedSize,
valueCount,
nullCount,
rowCount,
dataEncoding,
rlByteLength,
dlByteLength),
to,
blockEncryptor,
pageHeaderAAD);
}
private PageHeader newDataPageV2Header(
int uncompressedSize,
int compressedSize,
int valueCount,
int nullCount,
int rowCount,
org.apache.parquet.column.Encoding dataEncoding,
int rlByteLength,
int dlByteLength) {
DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2(
valueCount, nullCount, rowCount, getEncoding(dataEncoding), dlByteLength, rlByteLength);
PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize);
pageHeader.setData_page_header_v2(dataPageHeaderV2);
return pageHeader;
}
public void writeDataPageV2Header(
int uncompressedSize,
int compressedSize,
int valueCount,
int nullCount,
int rowCount,
org.apache.parquet.column.Encoding dataEncoding,
int rlByteLength,
int dlByteLength,
int crc,
OutputStream to,
BlockCipher.Encryptor blockEncryptor,
byte[] pageHeaderAAD)
throws IOException {
writePageHeader(
newDataPageV2Header(
uncompressedSize,
compressedSize,
valueCount,
nullCount,
rowCount,
dataEncoding,
rlByteLength,
dlByteLength,
crc),
to,
blockEncryptor,
pageHeaderAAD);
}
private PageHeader newDataPageV2Header(
int uncompressedSize,
int compressedSize,
int valueCount,
int nullCount,
int rowCount,
org.apache.parquet.column.Encoding dataEncoding,
int rlByteLength,
int dlByteLength,
int crc) {
DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2(
valueCount, nullCount, rowCount, getEncoding(dataEncoding), dlByteLength, rlByteLength);
PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize);
pageHeader.setData_page_header_v2(dataPageHeaderV2);
pageHeader.setCrc(crc);
return pageHeader;
}
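/**
 * Writes a dictionary page header; overloads add an optional CRC and optional header encryption.
 */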
public void writeDictionaryPageHeader(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding valuesEncoding,
OutputStream to)
throws IOException {
writeDictionaryPageHeader(uncompressedSize, compressedSize, valueCount, valuesEncoding, to, null, null);
}
public void writeDictionaryPageHeader(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding valuesEncoding,
OutputStream to,
BlockCipher.Encryptor blockEncryptor,
byte[] pageHeaderAAD)
throws IOException {
PageHeader pageHeader = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize);
pageHeader.setDictionary_page_header(new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding)));
writePageHeader(pageHeader, to, blockEncryptor, pageHeaderAAD);
}
public void writeDictionaryPageHeader(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding valuesEncoding,
int crc,
OutputStream to)
throws IOException {
writeDictionaryPageHeader(uncompressedSize, compressedSize, valueCount, valuesEncoding, crc, to, null, null);
}
public void writeDictionaryPageHeader(
int uncompressedSize,
int compressedSize,
int valueCount,
org.apache.parquet.column.Encoding valuesEncoding,
int crc,
OutputStream to,
BlockCipher.Encryptor blockEncryptor,
byte[] pageHeaderAAD)
throws IOException {
PageHeader pageHeader = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize);
pageHeader.setCrc(crc);
pageHeader.setDictionary_page_header(new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding)));
writePageHeader(pageHeader, to, blockEncryptor, pageHeaderAAD);
}
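// Boundary order conversions between the parquet-format and parquet-mr column index representations.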
private static BoundaryOrder toParquetBoundaryOrder(
org.apache.parquet.internal.column.columnindex.BoundaryOrder boundaryOrder) {
switch (boundaryOrder) {
case ASCENDING:
return BoundaryOrder.ASCENDING;
case DESCENDING:
return BoundaryOrder.DESCENDING;
case UNORDERED:
return BoundaryOrder.UNORDERED;
default:
throw new IllegalArgumentException("Unsupported boundary order: " + boundaryOrder);
}
}
private static org.apache.parquet.internal.column.columnindex.BoundaryOrder fromParquetBoundaryOrder(
BoundaryOrder boundaryOrder) {
switch (boundaryOrder) {
case ASCENDING:
return org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING;
case DESCENDING:
return org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING;
case UNORDERED:
return org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED;
default:
throw new IllegalArgumentException("Unsupported boundary order: " + boundaryOrder);
}
}
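/**
 * Converts a parquet-mr column index to its Thrift form; returns null when the primitive type does not
 * support min/max statistics or no column index was built. Level histograms are only set when non-empty.
 */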
public static ColumnIndex toParquetColumnIndex(
PrimitiveType type, org.apache.parquet.internal.column.columnindex.ColumnIndex columnIndex) {
if (!isMinMaxStatsSupported(type) || columnIndex == null) {
return null;
}
ColumnIndex parquetColumnIndex = new ColumnIndex(
columnIndex.getNullPages(),
columnIndex.getMinValues(),
columnIndex.getMaxValues(),
toParquetBoundaryOrder(columnIndex.getBoundaryOrder()));
parquetColumnIndex.setNull_counts(columnIndex.getNullCounts());
List<Long> repLevelHistogram = columnIndex.getRepetitionLevelHistogram();
if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {
parquetColumnIndex.setRepetition_level_histograms(repLevelHistogram);
}
List<Long> defLevelHistogram = columnIndex.getDefinitionLevelHistogram();
if (defLevelHistogram != null && !defLevelHistogram.isEmpty()) {
parquetColumnIndex.setDefinition_level_histograms(defLevelHistogram);
}
return parquetColumnIndex;
}
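/**
 * Rebuilds a parquet-mr column index from its Thrift form; returns null when min/max statistics are not
 * supported for the type, mirroring the write side.
 */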
public static org.apache.parquet.internal.column.columnindex.ColumnIndex fromParquetColumnIndex(
PrimitiveType type, ColumnIndex parquetColumnIndex) {
if (!isMinMaxStatsSupported(type)) {
return null;
}
return ColumnIndexBuilder.build(
type,
fromParquetBoundaryOrder(parquetColumnIndex.getBoundary_order()),
parquetColumnIndex.getNull_pages(),
parquetColumnIndex.getNull_counts(),
parquetColumnIndex.getMin_values(),
parquetColumnIndex.getMax_values(),
parquetColumnIndex.getRepetition_level_histograms(),
parquetColumnIndex.getDefinition_level_histograms());
}
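/**
 * Converts a parquet-mr offset index to its Thrift form. The unencoded byte-array size list is written
 * only if every page reports it; a partially populated list is dropped.
 */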
public static OffsetIndex toParquetOffsetIndex(
org.apache.parquet.internal.column.columnindex.OffsetIndex offsetIndex) {
List<PageLocation> pageLocations = new ArrayList<>(offsetIndex.getPageCount());
List<Long> unencodedByteArrayDataBytes = new ArrayList<>(offsetIndex.getPageCount());
for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) {
pageLocations.add(new PageLocation(
offsetIndex.getOffset(i), offsetIndex.getCompressedPageSize(i), offsetIndex.getFirstRowIndex(i)));
Optional<Long> unencodedByteArrayDataType = offsetIndex.getUnencodedByteArrayDataBytes(i);
if (unencodedByteArrayDataType.isPresent() && unencodedByteArrayDataBytes.size() == i) {
unencodedByteArrayDataBytes.add(unencodedByteArrayDataType.get());
}
}
OffsetIndex parquetOffsetIndex = new OffsetIndex(pageLocations);
if (unencodedByteArrayDataBytes.size() == pageLocations.size()) {
// Do not add the field if we are missing that from any page.
parquetOffsetIndex.setUnencoded_byte_array_data_bytes(unencodedByteArrayDataBytes);
}
return parquetOffsetIndex;
}
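/**
 * Rebuilds a parquet-mr offset index from its Thrift form, carrying the per-page unencoded byte-array
 * sizes only when the Thrift list is present and matches the page count.
 */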
public static org.apache.parquet.internal.column.columnindex.OffsetIndex fromParquetOffsetIndex(
OffsetIndex parquetOffsetIndex) {
boolean hasUnencodedByteArrayDataBytes = parquetOffsetIndex.isSetUnencoded_byte_array_data_bytes()
&& parquetOffsetIndex.unencoded_byte_array_data_bytes.size()
== parquetOffsetIndex.page_locations.size();
OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder();
for (int i = 0; i < parquetOffsetIndex.page_locations.size(); ++i) {
PageLocation pageLocation = parquetOffsetIndex.page_locations.get(i);
Optional<Long> unencodedByteArrayDataBytes = hasUnencodedByteArrayDataBytes
? Optional.of(parquetOffsetIndex.unencoded_byte_array_data_bytes.get(i))
: Optional.empty();
builder.add(
pageLocation.getOffset(),
pageLocation.getCompressed_page_size(),
pageLocation.getFirst_row_index(),
unencodedByteArrayDataBytes);
}
return builder.build();
}
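/**
 * Builds the Thrift {@link BloomFilterHeader} for a bloom filter. Only the BLOCK algorithm, XXH64 hash
 * and UNCOMPRESSED compression are supported; any other combination raises an IllegalArgumentException.
 */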
public static BloomFilterHeader toBloomFilterHeader(
org.apache.parquet.column.values.bloomfilter.BloomFilter bloomFilter) {
BloomFilterAlgorithm algorithm = null;
BloomFilterHash hashStrategy = null;
BloomFilterCompression compression = null;
if (bloomFilter.getAlgorithm() == BloomFilter.Algorithm.BLOCK) {
algorithm = BloomFilterAlgorithm.BLOCK(new SplitBlockAlgorithm());
}
if (bloomFilter.getHashStrategy() == BloomFilter.HashStrategy.XXH64) {
hashStrategy = BloomFilterHash.XXHASH(new XxHash());
}
if (bloomFilter.getCompression() == BloomFilter.Compression.UNCOMPRESSED) {
compression = BloomFilterCompression.UNCOMPRESSED(new Uncompressed());
}
if (algorithm != null && hashStrategy != null && compression != null) {
return new BloomFilterHeader(bloomFilter.getBitsetSize(), algorithm, hashStrategy, compression);
} else {
throw new IllegalArgumentException(String.format(
"Failed to build thrift structure for BloomFilterHeader," + "algorithm=%s, hash=%s, compression=%s",
bloomFilter.getAlgorithm(), bloomFilter.getHashStrategy(), bloomFilter.getCompression()));
}
}
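// Size statistics (unencoded byte-array bytes and level histograms) round-trip between the Thrift and
// parquet-mr representations; null input yields null output.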
public static org.apache.parquet.column.statistics.SizeStatistics fromParquetSizeStatistics(
SizeStatistics statistics, PrimitiveType type) {
if (statistics == null) {
return null;
}
return new org.apache.parquet.column.statistics.SizeStatistics(
type,
statistics.getUnencoded_byte_array_data_bytes(),
statistics.getRepetition_level_histogram(),
statistics.getDefinition_level_histogram());
}
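// Writes the optional unencoded byte-array size only when present; histograms are always copied.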
public static SizeStatistics toParquetSizeStatistics(org.apache.parquet.column.statistics.SizeStatistics stats) {
if (stats == null) {
return null;
}
SizeStatistics formatStats = new SizeStatistics();
if (stats.getUnencodedByteArrayDataBytes().isPresent()) {
formatStats.setUnencoded_byte_array_data_bytes(
stats.getUnencodedByteArrayDataBytes().get());
}
formatStats.setRepetition_level_histogram(stats.getRepetitionLevelHistogram());
formatStats.setDefinition_level_histogram(stats.getDefinitionLevelHistogram());
return formatStats;
}
}