org.apache.iceberg.parquet.ParquetUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of iceberg-parquet Show documentation
Show all versions of iceberg-parquet Show documentation
A table format for huge analytic datasets
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.parquet;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.iceberg.FieldMetrics;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.MetricsModes;
import org.apache.iceberg.MetricsModes.MetricsMode;
import org.apache.iceberg.MetricsUtil;
import org.apache.iceberg.Schema;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.expressions.Literal;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.BinaryUtil;
import org.apache.iceberg.util.UnicodeUtil;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
public class ParquetUtil {
// not meant to be instantiated
private ParquetUtil() {
}
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) {
return fileMetrics(file, metricsConfig, null);
}
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig, NameMapping nameMapping) {
try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
return footerMetrics(reader.getFooter(), Stream.empty(), metricsConfig, nameMapping);
} catch (IOException e) {
throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
}
}
public static Metrics footerMetrics(ParquetMetadata metadata, Stream> fieldMetrics,
MetricsConfig metricsConfig) {
return footerMetrics(metadata, fieldMetrics, metricsConfig, null);
}
@SuppressWarnings("checkstyle:CyclomaticComplexity")
public static Metrics footerMetrics(ParquetMetadata metadata, Stream> fieldMetrics,
MetricsConfig metricsConfig, NameMapping nameMapping) {
Preconditions.checkNotNull(fieldMetrics, "fieldMetrics should not be null");
long rowCount = 0;
Map columnSizes = Maps.newHashMap();
Map valueCounts = Maps.newHashMap();
Map nullValueCounts = Maps.newHashMap();
Map> lowerBounds = Maps.newHashMap();
Map> upperBounds = Maps.newHashMap();
Set missingStats = Sets.newHashSet();
// ignore metrics for fields we failed to determine reliable IDs
MessageType parquetTypeWithIds = getParquetTypeWithIds(metadata, nameMapping);
Schema fileSchema = ParquetSchemaUtil.convertAndPrune(parquetTypeWithIds);
Map> fieldMetricsMap = fieldMetrics.collect(
Collectors.toMap(FieldMetrics::id, Function.identity()));
List blocks = metadata.getBlocks();
for (BlockMetaData block : blocks) {
rowCount += block.getRowCount();
for (ColumnChunkMetaData column : block.getColumns()) {
Integer fieldId = fileSchema.aliasToId(column.getPath().toDotString());
if (fieldId == null) {
// fileSchema may contain a subset of columns present in the file
// as we prune columns we could not assign ids
continue;
}
increment(columnSizes, fieldId, column.getTotalSize());
MetricsMode metricsMode = MetricsUtil.metricsMode(fileSchema, metricsConfig, fieldId);
if (metricsMode == MetricsModes.None.get()) {
continue;
}
increment(valueCounts, fieldId, column.getValueCount());
Statistics stats = column.getStatistics();
if (stats == null) {
missingStats.add(fieldId);
} else if (!stats.isEmpty()) {
increment(nullValueCounts, fieldId, stats.getNumNulls());
// when there are metrics gathered by Iceberg for a column, we should use those instead
// of the ones from Parquet
if (metricsMode != MetricsModes.Counts.get() && !fieldMetricsMap.containsKey(fieldId)) {
Types.NestedField field = fileSchema.findField(fieldId);
if (field != null && stats.hasNonNullValue() && shouldStoreBounds(column, fileSchema)) {
Literal> min = ParquetConversions.fromParquetPrimitive(
field.type(), column.getPrimitiveType(), stats.genericGetMin());
updateMin(lowerBounds, fieldId, field.type(), min, metricsMode);
Literal> max = ParquetConversions.fromParquetPrimitive(
field.type(), column.getPrimitiveType(), stats.genericGetMax());
updateMax(upperBounds, fieldId, field.type(), max, metricsMode);
}
}
}
}
}
// discard accumulated values if any stats were missing
for (Integer fieldId : missingStats) {
nullValueCounts.remove(fieldId);
lowerBounds.remove(fieldId);
upperBounds.remove(fieldId);
}
updateFromFieldMetrics(fieldMetricsMap, metricsConfig, fileSchema, lowerBounds, upperBounds);
return new Metrics(rowCount, columnSizes, valueCounts, nullValueCounts,
MetricsUtil.createNanValueCounts(fieldMetricsMap.values().stream(), metricsConfig, fileSchema),
toBufferMap(fileSchema, lowerBounds),
toBufferMap(fileSchema, upperBounds));
}
private static void updateFromFieldMetrics(
Map> idToFieldMetricsMap, MetricsConfig metricsConfig, Schema schema,
Map> lowerBounds, Map> upperBounds) {
idToFieldMetricsMap.entrySet().forEach(entry -> {
int fieldId = entry.getKey();
FieldMetrics> metrics = entry.getValue();
MetricsMode metricsMode = MetricsUtil.metricsMode(schema, metricsConfig, fieldId);
// only check for MetricsModes.None, since we don't truncate float/double values.
if (metricsMode != MetricsModes.None.get()) {
if (!metrics.hasBounds()) {
lowerBounds.remove(fieldId);
upperBounds.remove(fieldId);
} else if (metrics.upperBound() instanceof Float) {
lowerBounds.put(fieldId, Literal.of((Float) metrics.lowerBound()));
upperBounds.put(fieldId, Literal.of((Float) metrics.upperBound()));
} else if (metrics.upperBound() instanceof Double) {
lowerBounds.put(fieldId, Literal.of((Double) metrics.lowerBound()));
upperBounds.put(fieldId, Literal.of((Double) metrics.upperBound()));
} else {
throw new UnsupportedOperationException("Expected only float or double column metrics");
}
}
});
}
private static MessageType getParquetTypeWithIds(ParquetMetadata metadata, NameMapping nameMapping) {
MessageType type = metadata.getFileMetaData().getSchema();
if (ParquetSchemaUtil.hasIds(type)) {
return type;
}
if (nameMapping != null) {
return ParquetSchemaUtil.applyNameMapping(type, nameMapping);
}
return ParquetSchemaUtil.addFallbackIds(type);
}
/**
* Returns a list of offsets in ascending order determined by the starting position of the row groups.
*/
public static List getSplitOffsets(ParquetMetadata md) {
List splitOffsets = Lists.newArrayListWithExpectedSize(md.getBlocks().size());
for (BlockMetaData blockMetaData : md.getBlocks()) {
splitOffsets.add(blockMetaData.getStartingPos());
}
Collections.sort(splitOffsets);
return splitOffsets;
}
// we allow struct nesting, but not maps or arrays
private static boolean shouldStoreBounds(ColumnChunkMetaData column, Schema schema) {
if (column.getPrimitiveType().getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) {
// stats for INT96 are not reliable
return false;
}
ColumnPath columnPath = column.getPath();
Iterator pathIterator = columnPath.iterator();
Type currentType = schema.asStruct();
while (pathIterator.hasNext()) {
if (currentType == null || !currentType.isStructType()) {
return false;
}
String fieldName = pathIterator.next();
currentType = currentType.asStructType().fieldType(fieldName);
}
return currentType != null && currentType.isPrimitiveType();
}
private static void increment(Map columns, int fieldId, long amount) {
if (columns != null) {
if (columns.containsKey(fieldId)) {
columns.put(fieldId, columns.get(fieldId) + amount);
} else {
columns.put(fieldId, amount);
}
}
}
@SuppressWarnings("unchecked")
private static void updateMin(Map> lowerBounds, int id, Type type,
Literal min, MetricsMode metricsMode) {
Literal currentMin = (Literal) lowerBounds.get(id);
if (currentMin == null || min.comparator().compare(min.value(), currentMin.value()) < 0) {
if (metricsMode == MetricsModes.Full.get()) {
lowerBounds.put(id, min);
} else {
MetricsModes.Truncate truncateMode = (MetricsModes.Truncate) metricsMode;
int truncateLength = truncateMode.length();
switch (type.typeId()) {
case STRING:
lowerBounds.put(id, UnicodeUtil.truncateStringMin((Literal) min, truncateLength));
break;
case FIXED:
case BINARY:
lowerBounds.put(id, BinaryUtil.truncateBinaryMin((Literal) min, truncateLength));
break;
default:
lowerBounds.put(id, min);
}
}
}
}
@SuppressWarnings("unchecked")
private static void updateMax(Map> upperBounds, int id, Type type,
Literal max, MetricsMode metricsMode) {
Literal currentMax = (Literal) upperBounds.get(id);
if (currentMax == null || max.comparator().compare(max.value(), currentMax.value()) > 0) {
if (metricsMode == MetricsModes.Full.get()) {
upperBounds.put(id, max);
} else {
MetricsModes.Truncate truncateMode = (MetricsModes.Truncate) metricsMode;
int truncateLength = truncateMode.length();
switch (type.typeId()) {
case STRING:
Literal truncatedMaxString = UnicodeUtil.truncateStringMax((Literal) max,
truncateLength);
if (truncatedMaxString != null) {
upperBounds.put(id, truncatedMaxString);
}
break;
case FIXED:
case BINARY:
Literal truncatedMaxBinary = BinaryUtil.truncateBinaryMax((Literal) max,
truncateLength);
if (truncatedMaxBinary != null) {
upperBounds.put(id, truncatedMaxBinary);
}
break;
default:
upperBounds.put(id, max);
}
}
}
}
private static Map toBufferMap(Schema schema, Map> map) {
Map bufferMap = Maps.newHashMap();
for (Map.Entry> entry : map.entrySet()) {
bufferMap.put(entry.getKey(),
Conversions.toByteBuffer(schema.findType(entry.getKey()), entry.getValue().value()));
}
return bufferMap;
}
@SuppressWarnings("deprecation")
public static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
EncodingStats stats = meta.getEncodingStats();
if (stats != null) {
return stats.hasNonDictionaryEncodedPages();
}
// without EncodingStats, fall back to testing the encoding list
Set encodings = Sets.newHashSet(meta.getEncodings());
if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {
// if remove returned true, PLAIN_DICTIONARY was present, which means at
// least one page was dictionary encoded and 1.0 encodings are used
// RLE and BIT_PACKED are only used for repetition or definition levels
encodings.remove(Encoding.RLE);
encodings.remove(Encoding.BIT_PACKED);
// when empty, no encodings other than dictionary or rep/def levels
return !encodings.isEmpty();
} else {
// if PLAIN_DICTIONARY wasn't present, then either the column is not
// dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used.
// for 2.0, this cannot determine whether a page fell back without
// page encoding stats
return true;
}
}
public static boolean hasNoBloomFilterPages(ColumnChunkMetaData meta) {
return meta.getBloomFilterOffset() <= 0;
}
public static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
DictionaryPage dictionaryPage = pageSource.readDictionaryPage();
if (dictionaryPage != null) {
try {
return dictionaryPage.getEncoding().initDictionary(desc, dictionaryPage);
} catch (IOException e) {
throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
}
}
return null;
}
public static boolean isIntType(PrimitiveType primitiveType) {
if (primitiveType.getOriginalType() != null) {
switch (primitiveType.getOriginalType()) {
case INT_8:
case INT_16:
case INT_32:
case DATE:
return true;
default:
return false;
}
}
return primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT32;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy