/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.prestosql.parquet.reader;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.CorruptStatistics;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.format.ColumnChunk;
import org.apache.parquet.format.ColumnMetaData;
import org.apache.parquet.format.ConvertedType;
import org.apache.parquet.format.Encoding;
import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.KeyValue;
import org.apache.parquet.format.RowGroup;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.format.Statistics;
import org.apache.parquet.format.Type;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type.Repetition;
import org.apache.parquet.schema.Types;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;

import static io.prestosql.parquet.ParquetValidationUtils.validateParquet;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.apache.parquet.format.Util.readFileMetaData;
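
/**
 * Reads a Parquet file footer and converts it into parquet-mr {@link ParquetMetadata},
 * including row group ({@link BlockMetaData}) and column chunk metadata.
 *
 * <p>A minimal usage sketch; obtaining the file length from {@code FileStatus} is just one
 * option and is not part of this class:
 * <pre>{@code
 * FileStatus status = fileSystem.getFileStatus(path);
 * ParquetMetadata metadata = MetadataReader.readFooter(fileSystem, path, status.getLen());
 * for (BlockMetaData block : metadata.getBlocks()) {
 *     // e.g. inspect block.getRowCount() and block.getColumns()
 * }
 * }</pre>
 */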
public final class MetadataReader
{
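// The footer ends with a 4-byte little-endian length of the Thrift metadata, followed by the "PAR1" magic.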
private static final int PARQUET_METADATA_LENGTH = 4;
private static final byte[] MAGIC = "PAR1".getBytes(US_ASCII);
private static final ParquetMetadataConverter PARQUET_METADATA_CONVERTER = new ParquetMetadataConverter();
private MetadataReader() {}
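
// Convenience overload: opens the file and delegates to the stream-based readFooter below.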
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file, long fileSize)
throws IOException
{
try (FSDataInputStream inputStream = fileSystem.open(file)) {
return readFooter(inputStream, file, fileSize);
}
}
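
// Reads the footer from the tail of the file: validates the trailing magic and metadata length,
// then decodes the Thrift FileMetaData and converts it into parquet-mr metadata objects.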
public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
throws IOException
{
// Parquet File Layout:
//
// MAGIC
// variable: Data
// variable: Metadata
// 4 bytes: MetadataLength
// MAGIC
validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
long metadataLengthIndex = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;
InputStream footerStream = readFully(inputStream, metadataLengthIndex, PARQUET_METADATA_LENGTH + MAGIC.length);
int metadataLength = readIntLittleEndian(footerStream);
byte[] magic = new byte[MAGIC.length];
footerStream.read(magic);
validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));
long metadataIndex = metadataLengthIndex - metadataLength;
validateParquet(
metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
"Corrupted Parquet file: %s metadata index: %s out of range",
file,
metadataIndex);
InputStream metadataStream = readFully(inputStream, metadataIndex, metadataLength);
FileMetaData fileMetaData = readFileMetaData(metadataStream);
List<SchemaElement> schema = fileMetaData.getSchema();
validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
MessageType messageType = readParquetSchema(schema);
List<BlockMetaData> blocks = new ArrayList<>();
List<RowGroup> rowGroups = fileMetaData.getRow_groups();
if (rowGroups != null) {
for (RowGroup rowGroup : rowGroups) {
BlockMetaData blockMetaData = new BlockMetaData();
blockMetaData.setRowCount(rowGroup.getNum_rows());
blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
List<ColumnChunk> columns = rowGroup.getColumns();
validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
String filePath = columns.get(0).getFile_path();
for (ColumnChunk columnChunk : columns) {
validateParquet(
(filePath == null && columnChunk.getFile_path() == null)
|| (filePath != null && filePath.equals(columnChunk.getFile_path())),
"all column chunks of the same row group must be in the same file");
ColumnMetaData metaData = columnChunk.meta_data;
String[] path = metaData.path_in_schema.stream()
.map(value -> value.toLowerCase(Locale.ENGLISH))
.toArray(String[]::new);
ColumnPath columnPath = ColumnPath.get(path);
PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
ColumnChunkMetaData column = ColumnChunkMetaData.get(
columnPath,
primitiveType,
CompressionCodecName.fromParquet(metaData.codec),
PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
readEncodings(metaData.encodings),
readStats(Optional.ofNullable(fileMetaData.getCreated_by()), Optional.ofNullable(metaData.statistics), primitiveType),
metaData.data_page_offset,
metaData.dictionary_page_offset,
metaData.num_values,
metaData.total_compressed_size,
metaData.total_uncompressed_size);
blockMetaData.addColumn(column);
}
blockMetaData.setPath(filePath);
blocks.add(blockMetaData);
}
}
Map<String, String> keyValueMetaData = new HashMap<>();
List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
if (keyValueList != null) {
for (KeyValue keyValue : keyValueList) {
keyValueMetaData.put(keyValue.key, keyValue.value);
}
}
return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
}
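
// Rebuilds the MessageType from the flattened, depth-first list of SchemaElements;
// the first element is the schema root and carries the count of its children.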
private static MessageType readParquetSchema(List<SchemaElement> schema)
{
Iterator<SchemaElement> schemaIterator = schema.iterator();
SchemaElement rootSchema = schemaIterator.next();
Types.MessageTypeBuilder builder = Types.buildMessage();
readTypeSchema(builder, schemaIterator, rootSchema.getNum_children());
return builder.named(rootSchema.name);
}
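
// Recursively consumes schemaIterator: elements without a physical type become groups, all others
// primitives; names are lower-cased, matching the lower-cased column paths built in readFooter.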
private static void readTypeSchema(Types.GroupBuilder<?> builder, Iterator<SchemaElement> schemaIterator, int typeCount)
{
for (int i = 0; i < typeCount; i++) {
SchemaElement element = schemaIterator.next();
Types.Builder<?, ?> typeBuilder;
if (element.type == null) {
typeBuilder = builder.group(Repetition.valueOf(element.repetition_type.name()));
readTypeSchema((Types.GroupBuilder<?>) typeBuilder, schemaIterator, element.num_children);
}
else {
Types.PrimitiveBuilder<?> primitiveBuilder = builder.primitive(getTypeName(element.type), Repetition.valueOf(element.repetition_type.name()));
if (element.isSetType_length()) {
primitiveBuilder.length(element.type_length);
}
if (element.isSetPrecision()) {
primitiveBuilder.precision(element.precision);
}
if (element.isSetScale()) {
primitiveBuilder.scale(element.scale);
}
typeBuilder = primitiveBuilder;
}
if (element.isSetConverted_type()) {
typeBuilder.as(getOriginalType(element.converted_type));
}
if (element.isSetField_id()) {
typeBuilder.id(element.field_id);
}
typeBuilder.named(element.name.toLowerCase(Locale.ENGLISH));
}
}
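
// Converts Thrift statistics into parquet-mr statistics. For UTF8 columns written before
// PARQUET-1025, only the deprecated min/max fields are present and parquet-mr drops them,
// so tryReadOldUtf8Stats attempts to salvage safe bounds from them.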
public static org.apache.parquet.column.statistics.Statistics<?> readStats(Optional<String> fileCreatedBy, Optional<Statistics> statisticsFromFile, PrimitiveType type)
{
Statistics statistics = statisticsFromFile.orElse(null);
org.apache.parquet.column.statistics.Statistics<?> columnStatistics = new ParquetMetadataConverter().fromParquetStatistics(fileCreatedBy.orElse(null), statistics, type);
if (type.getOriginalType() == OriginalType.UTF8
&& statistics != null
&& !statistics.isSetMin_value() && !statistics.isSetMax_value() // the min,max fields used for UTF8 since Parquet PARQUET-1025
&& statistics.isSetMin() && statistics.isSetMax() // the min,max fields used for UTF8 before Parquet PARQUET-1025
&& columnStatistics.genericGetMin() == null && columnStatistics.genericGetMax() == null
&& !CorruptStatistics.shouldIgnoreStatistics(fileCreatedBy.orElse(null), type.getPrimitiveTypeName())) {
tryReadOldUtf8Stats(statistics, (BinaryStatistics) columnStatistics);
}
return columnStatistics;
}
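
// The deprecated min/max were produced with signed byte comparison, which only agrees with
// lexicographic (unsigned) order for ASCII bytes, so the values are trimmed to ASCII-safe
// prefixes and the last byte of the upper bound is incremented to keep it a valid bound.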
private static void tryReadOldUtf8Stats(Statistics statistics, BinaryStatistics columnStatistics)
{
byte[] min = statistics.getMin();
byte[] max = statistics.getMax();
if (Arrays.equals(min, max)) {
// If min=max, then there is single value only
min = min.clone();
max = min;
}
else {
int commonPrefix = commonPrefix(min, max);
// For min we can retain all-ASCII, because this produces a strictly lower value.
int minGoodLength = commonPrefix;
while (minGoodLength < min.length && isAscii(min[minGoodLength])) {
minGoodLength++;
}
// For max we can be sure only of the part matching the min. When they differ, we can consider only one next, and only if both are ASCII
int maxGoodLength = commonPrefix;
if (maxGoodLength < max.length && maxGoodLength < min.length && isAscii(min[maxGoodLength]) && isAscii(max[maxGoodLength])) {
maxGoodLength++;
}
// Incrementing 127 would overflow. Incrementing within non-ASCII can have side-effects.
while (maxGoodLength > 0 && (max[maxGoodLength - 1] == 127 || !isAscii(max[maxGoodLength - 1]))) {
maxGoodLength--;
}
if (maxGoodLength == 0) {
// We can return just min bound, but code downstream likely expects both are present or both are absent.
return;
}
min = Arrays.copyOf(min, minGoodLength);
max = Arrays.copyOf(max, maxGoodLength);
max[maxGoodLength - 1]++;
}
columnStatistics.setMinMaxFromBytes(min, max);
if (!columnStatistics.isNumNullsSet() && statistics.isSetNull_count()) {
columnStatistics.setNumNulls(statistics.getNull_count());
}
}
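
// True for 0x00-0x7F; in UTF-8 these are exactly the single-byte (ASCII) code points.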
private static boolean isAscii(byte b)
{
return 0 <= b;
}
private static int commonPrefix(byte[] a, byte[] b)
{
int commonPrefixLength = 0;
while (commonPrefixLength < a.length && commonPrefixLength < b.length && a[commonPrefixLength] == b[commonPrefixLength]) {
commonPrefixLength++;
}
return commonPrefixLength;
}
private static Set<org.apache.parquet.column.Encoding> readEncodings(List<Encoding> encodings)
{
Set<org.apache.parquet.column.Encoding> columnEncodings = new HashSet<>();
for (Encoding encoding : encodings) {
columnEncodings.add(org.apache.parquet.column.Encoding.valueOf(encoding.name()));
}
return Collections.unmodifiableSet(columnEncodings);
}
private static PrimitiveTypeName getTypeName(Type type)
{
switch (type) {
case BYTE_ARRAY:
return PrimitiveTypeName.BINARY;
case INT64:
return PrimitiveTypeName.INT64;
case INT32:
return PrimitiveTypeName.INT32;
case BOOLEAN:
return PrimitiveTypeName.BOOLEAN;
case FLOAT:
return PrimitiveTypeName.FLOAT;
case DOUBLE:
return PrimitiveTypeName.DOUBLE;
case INT96:
return PrimitiveTypeName.INT96;
case FIXED_LEN_BYTE_ARRAY:
return PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
default:
throw new IllegalArgumentException("Unknown type " + type);
}
}
private static OriginalType getOriginalType(ConvertedType type)
{
switch (type) {
case UTF8:
return OriginalType.UTF8;
case MAP:
return OriginalType.MAP;
case MAP_KEY_VALUE:
return OriginalType.MAP_KEY_VALUE;
case LIST:
return OriginalType.LIST;
case ENUM:
return OriginalType.ENUM;
case DECIMAL:
return OriginalType.DECIMAL;
case DATE:
return OriginalType.DATE;
case TIME_MILLIS:
return OriginalType.TIME_MILLIS;
case TIMESTAMP_MILLIS:
return OriginalType.TIMESTAMP_MILLIS;
case INTERVAL:
return OriginalType.INTERVAL;
case INT_8:
return OriginalType.INT_8;
case INT_16:
return OriginalType.INT_16;
case INT_32:
return OriginalType.INT_32;
case INT_64:
return OriginalType.INT_64;
case UINT_8:
return OriginalType.UINT_8;
case UINT_16:
return OriginalType.UINT_16;
case UINT_32:
return OriginalType.UINT_32;
case UINT_64:
return OriginalType.UINT_64;
case JSON:
return OriginalType.JSON;
case BSON:
return OriginalType.BSON;
case TIMESTAMP_MICROS:
return OriginalType.TIMESTAMP_MICROS;
case TIME_MICROS:
return OriginalType.TIME_MICROS;
default:
throw new IllegalArgumentException("Unknown converted type " + type);
}
}
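
// Reads a 4-byte little-endian integer, as used for the footer metadata length field.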
private static int readIntLittleEndian(InputStream in)
throws IOException
{
int ch1 = in.read();
int ch2 = in.read();
int ch3 = in.read();
int ch4 = in.read();
if ((ch1 | ch2 | ch3 | ch4) < 0) {
throw new EOFException();
}
return ((ch4 << 24) + (ch3 << 16) + (ch2 << 8) + (ch1));
}
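
// Reads exactly 'length' bytes starting at 'position' and exposes them as an in-memory stream.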
private static InputStream readFully(FSDataInputStream from, long position, int length)
throws IOException
{
byte[] buffer = new byte[length];
from.readFully(position, buffer);
return new ByteArrayInputStream(buffer);
}
}