io.trino.plugin.deltalake.transactionlog.DeltaLakeParquetStatisticsUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of trino-delta-lake Show documentation
Show all versions of trino-delta-lake Show documentation
Trino - Delta Lake connector
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.deltalake.transactionlog;
import com.google.common.collect.ImmutableMap;
import io.airlift.log.Logger;
import io.airlift.slice.Slice;
import io.trino.plugin.base.type.DecodedTimestamp;
import io.trino.spi.block.Block;
import io.trino.spi.block.SqlRow;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.DateType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.Decimals;
import io.trino.spi.type.Int128;
import io.trino.spi.type.MapType;
import io.trino.spi.type.RowType;
import io.trino.spi.type.TimestampWithTimeZoneType;
import io.trino.spi.type.Type;
import io.trino.spi.type.VarcharType;
import jakarta.annotation.Nullable;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.column.statistics.DoubleStatistics;
import org.apache.parquet.column.statistics.FloatStatistics;
import org.apache.parquet.column.statistics.IntStatistics;
import org.apache.parquet.column.statistics.LongStatistics;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.time.Instant;
import java.time.LocalDate;
import java.time.ZonedDateTime;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.BiFunction;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static io.airlift.slice.Slices.utf8Slice;
import static io.trino.parquet.ParquetTimestampUtils.decodeInt96Timestamp;
import static io.trino.spi.block.RowValueBuilder.buildRowValue;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.BooleanType.BOOLEAN;
import static io.trino.spi.type.DateTimeEncoding.unpackMillisUtc;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS;
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS;
import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND;
import static io.trino.spi.type.TinyintType.TINYINT;
import static io.trino.spi.type.TypeUtils.readNativeValue;
import static io.trino.spi.type.TypeUtils.writeNativeValue;
import static java.lang.Float.floatToRawIntBits;
import static java.lang.Float.intBitsToFloat;
import static java.lang.Math.toIntExact;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.time.ZoneOffset.UTC;
import static java.time.format.DateTimeFormatter.ISO_INSTANT;
import static java.time.format.DateTimeFormatter.ISO_LOCAL_DATE;
import static java.time.temporal.ChronoUnit.MILLIS;
import static java.util.Objects.requireNonNull;
public class DeltaLakeParquetStatisticsUtils
{
private static final Logger LOG = Logger.get(DeltaLakeParquetStatisticsUtils.class);
private DeltaLakeParquetStatisticsUtils() {}
public static boolean hasInvalidStatistics(Collection metadataList)
{
return metadataList.stream()
.anyMatch(metadata ->
// If any row group does not have stats collected, stats for the file will not be valid
!metadata.getStatistics().isNumNullsSet() || metadata.getStatistics().isEmpty() ||
// Columns with NaN values are marked by `hasNonNullValue` = false by the Parquet reader. See issue: https://issues.apache.org/jira/browse/PARQUET-1246
(!metadata.getStatistics().hasNonNullValue() && metadata.getStatistics().getNumNulls() != metadata.getValueCount()));
}
@Nullable
public static Object jsonValueToTrinoValue(Type type, @Nullable Object jsonValue)
{
if (jsonValue == null) {
return null;
}
if (type == SMALLINT || type == TINYINT || type == INTEGER) {
return (long) (int) jsonValue;
}
if (type == BIGINT) {
if (jsonValue instanceof Long) {
//noinspection RedundantCast
return (long) jsonValue;
}
if (jsonValue instanceof Integer) {
return (long) (int) jsonValue;
}
throw new IllegalArgumentException("Unexpected value for bigint type: " + jsonValue);
}
if (type == REAL) {
return (long) floatToRawIntBits((float) (double) jsonValue);
}
if (type == DOUBLE) {
//noinspection RedundantCast
return (double) jsonValue;
}
if (type instanceof DecimalType decimalType) {
BigDecimal decimal = new BigDecimal((String) jsonValue);
if (decimalType.isShort()) {
return Decimals.encodeShortScaledValue(decimal, decimalType.getScale());
}
return Decimals.encodeScaledValue(decimal, decimalType.getScale());
}
if (type instanceof VarcharType) {
return utf8Slice((String) jsonValue);
}
if (type == DateType.DATE) {
return LocalDate.parse((String) jsonValue).toEpochDay();
}
if (type == TIMESTAMP_MILLIS) {
return Instant.parse((String) jsonValue).toEpochMilli() * MICROSECONDS_PER_MILLISECOND;
}
if (type instanceof RowType rowType) {
Map, ?> values = (Map, ?>) jsonValue;
List fieldTypes = rowType.getTypeParameters();
return buildRowValue(rowType, fields -> {
for (int i = 0; i < values.size(); ++i) {
Type fieldType = fieldTypes.get(i);
String fieldName = rowType.getFields().get(i).getName().orElseThrow(() -> new IllegalArgumentException("Field name must exist"));
Object fieldValue = jsonValueToTrinoValue(fieldType, values.remove(fieldName));
writeNativeValue(fieldType, fields.get(i), fieldValue);
}
checkState(values.isEmpty(), "All fields must be converted into Trino value: %s", values);
});
}
throw new UnsupportedOperationException("Unsupported type: " + type);
}
public static Map toJsonValues(Map columnTypeMapping, Map values)
{
Map jsonValues = new HashMap<>();
for (Map.Entry value : values.entrySet()) {
Type type = columnTypeMapping.get(value.getKey());
if (type instanceof ArrayType || type instanceof MapType) {
continue;
}
jsonValues.put(value.getKey(), toJsonValue(columnTypeMapping.get(value.getKey()), value.getValue()));
}
return jsonValues;
}
@Nullable
public static Object toJsonValue(Type type, @Nullable Object value)
{
if (value == null) {
return null;
}
if (type == SMALLINT || type == TINYINT || type == INTEGER || type == BIGINT) {
return value;
}
if (type == REAL) {
return intBitsToFloat(toIntExact((long) value));
}
if (type == DOUBLE) {
return value;
}
if (type instanceof DecimalType decimalType) {
if (decimalType.isShort()) {
return Decimals.toString((long) value, decimalType.getScale());
}
return Decimals.toString((Int128) value, decimalType.getScale());
}
if (type instanceof VarcharType) {
return ((Slice) value).toStringUtf8();
}
if (type == DateType.DATE) {
return LocalDate.ofEpochDay((long) value).format(ISO_LOCAL_DATE);
}
if (type == TIMESTAMP_TZ_MILLIS) {
Instant ts = Instant.ofEpochMilli(unpackMillisUtc((long) value));
return ISO_INSTANT.format(ZonedDateTime.ofInstant(ts, UTC));
}
if (type instanceof RowType rowType) {
SqlRow row = (SqlRow) value;
int rawIndex = row.getRawIndex();
ImmutableMap.Builder fieldValues = ImmutableMap.builder();
for (int i = 0; i < row.getFieldCount(); i++) {
RowType.Field field = rowType.getFields().get(i);
Object fieldValue = readNativeValue(field.getType(), row.getRawFieldBlock(i), rawIndex);
Object jsonValue = toJsonValue(field.getType(), fieldValue);
if (jsonValue != null) {
fieldValues.put(field.getName().orElseThrow(), jsonValue);
}
}
return fieldValues.buildOrThrow();
}
throw new UnsupportedOperationException("Unsupported type: " + type);
}
public static Map jsonEncodeMin(Map>> stats, Map typeForColumn)
{
return jsonEncode(stats, typeForColumn, DeltaLakeParquetStatisticsUtils::getMin);
}
public static Map jsonEncodeMax(Map>> stats, Map typeForColumn)
{
return jsonEncode(stats, typeForColumn, DeltaLakeParquetStatisticsUtils::getMax);
}
private static Map jsonEncode(Map>> stats, Map typeForColumn, BiFunction, Optional