io.trino.parquet.predicate.TupleDomainParquetPredicate Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of trino-parquet Show documentation
Show all versions of trino-parquet Show documentation
Trino - Parquet file format support
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.parquet.predicate;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.VerifyException;
import com.google.common.collect.ImmutableList;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import io.trino.parquet.BloomFilterStore;
import io.trino.parquet.DictionaryPage;
import io.trino.parquet.ParquetCorruptionException;
import io.trino.parquet.ParquetDataSourceId;
import io.trino.parquet.dictionary.Dictionary;
import io.trino.plugin.base.type.TrinoTimestampEncoder;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.SortedRangeSet;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.predicate.ValueSet;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.Int128;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.Type;
import io.trino.spi.type.UuidType;
import io.trino.spi.type.VarbinaryType;
import io.trino.spi.type.VarcharType;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators;
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveType;
import org.joda.time.DateTimeZone;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.parquet.ParquetMetadataConverter.isMinMaxStatsSupported;
import static io.trino.parquet.ParquetTimestampUtils.decodeInt64Timestamp;
import static io.trino.parquet.ParquetTimestampUtils.decodeInt96Timestamp;
import static io.trino.parquet.ParquetTypeUtils.getShortDecimalValue;
import static io.trino.parquet.predicate.PredicateUtils.isStatisticsOverflow;
import static io.trino.plugin.base.type.TrinoTimestampEncoderFactory.createTimestampEncoder;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.BooleanType.BOOLEAN;
import static io.trino.spi.type.DateType.DATE;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TinyintType.TINYINT;
import static java.lang.Float.floatToRawIntBits;
import static java.lang.Float.intBitsToFloat;
import static java.lang.Math.toIntExact;
import static java.lang.String.format;
import static java.nio.ByteOrder.LITTLE_ENDIAN;
import static java.util.Objects.requireNonNull;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
public class TupleDomainParquetPredicate
{
// Predicate to evaluate; its domains are looked up by Parquet column descriptor.
private final TupleDomain<ColumnDescriptor> effectivePredicate;
// Columns iterated by the statistics, column-index and bloom-filter checks (immutable copy).
private final List<ColumnDescriptor> columns;
// Time zone handed to the domain-building helpers.
private final DateTimeZone timeZone;

/**
 * Creates a predicate over the given Parquet columns.
 *
 * @param effectivePredicate per-column domains that file sections are tested against
 * @param columns columns to examine; copied into an immutable list
 * @param timeZone time zone passed on when domains are built from statistics or dictionaries
 * @throws NullPointerException if any argument is null
 */
public TupleDomainParquetPredicate(TupleDomain<ColumnDescriptor> effectivePredicate, List<ColumnDescriptor> columns, DateTimeZone timeZone)
{
    this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null");
    this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null"));
    this.timeZone = requireNonNull(timeZone, "timeZone is null");
}
/**
* Should the Parquet Reader process a file section with the specified statistics,
* and if it should, then return the columns are candidates for further inspection of more
* granular statistics from column index and dictionary.
*
* @param valueCounts the number of values for a column in the segment; this can be used with
* Statistics to determine if a column is only null
* @param statistics column statistics
* @param id Parquet file name
*
* @return Optional.empty() if statistics were sufficient to eliminate the file section.
* Otherwise, a list of columns for which page-level indices and dictionary could be consulted
* to potentially eliminate the file section. An optional with empty list is returned if there is
* going to be no benefit in looking at column index or dictionary for any column.
*/
public Optional> getIndexLookupCandidates(
Map valueCounts,
Map> statistics,
ParquetDataSourceId id)
throws ParquetCorruptionException
{
if (effectivePredicate.isNone()) {
return Optional.empty();
}
Map effectivePredicateDomains = effectivePredicate.getDomains()
.orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));
ImmutableList.Builder candidateColumns = ImmutableList.builder();
for (ColumnDescriptor column : columns) {
Domain effectivePredicateDomain = effectivePredicateDomains.get(column);
if (effectivePredicateDomain == null) {
continue;
}
Statistics> columnStatistics = statistics.get(column);
if (columnStatistics == null || columnStatistics.isEmpty()) {
// no stats for column
candidateColumns.add(column);
continue;
}
Long columnValueCount = valueCounts.get(column);
if (columnValueCount == null) {
throw new IllegalArgumentException(format("Missing columnValueCount for column %s in %s", column, id));
}
Domain domain = getDomain(
column,
effectivePredicateDomain.getType(),
columnValueCount,
columnStatistics,
id,
timeZone);
if (!effectivePredicateDomain.overlaps(domain)) {
return Optional.empty();
}
// If the predicate domain on a column includes the entire domain from column row-group statistics,
// then more granular statistics from page stats or dictionary for this column will not help to eliminate the row-group.
if (!effectivePredicateDomain.contains(domain)) {
candidateColumns.add(column);
}
}
return Optional.of(candidateColumns.build());
}
/**
 * Should the Parquet Reader process a file section with the specified dictionary based on that
 * single dictionary. This is safe to check repeatedly to avoid loading more parquet dictionaries
 * if the section can already be eliminated.
 *
 * @param dictionary The single column dictionary
 * @return {@code false} if the predicate is none or the dictionary's domain does not overlap the
 * predicate domain of its column; {@code true} otherwise, including when the predicate has no
 * domain for that column
 * @throws NullPointerException if {@code dictionary} is null
 */
public boolean matches(DictionaryDescriptor dictionary)
{
    requireNonNull(dictionary, "dictionary is null");
    if (effectivePredicate.isNone()) {
        return false;
    }
    // Look the column's domain up directly on the unwrapped domain map
    Domain effectivePredicateDomain = effectivePredicate.getDomains()
            .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"))
            .get(dictionary.getColumnDescriptor());
    // An unconstrained column can never eliminate the section
    return effectivePredicateDomain == null || effectivePredicateMatches(effectivePredicateDomain, dictionary);
}
/**
 * Should the Parquet Reader process a file section with the specified statistics.
 *
 * @param valueCounts the number of values for a column in the segment; this can be used with
 * Statistics to determine if a column is only null
 * @param columnIndexStore column index (statistics) store
 * @param id Parquet file name
 * @return {@code false} if the predicate is none or some column's column-index domain does not
 * overlap its predicate domain; {@code true} otherwise
 * @throws ParquetCorruptionException declared by the domain-building helper this method calls
 * @throws IllegalArgumentException if a column that has a column index has no entry in {@code valueCounts}
 * @throws NullPointerException if {@code columnIndexStore} is null
 */
public boolean matches(Map<ColumnDescriptor, Long> valueCounts, ColumnIndexStore columnIndexStore, ParquetDataSourceId id)
        throws ParquetCorruptionException
{
    requireNonNull(columnIndexStore, "columnIndexStore is null");
    if (effectivePredicate.isNone()) {
        return false;
    }
    Map<ColumnDescriptor, Domain> effectivePredicateDomains = effectivePredicate.getDomains()
            .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));

    for (ColumnDescriptor column : columns) {
        Domain effectivePredicateDomain = effectivePredicateDomains.get(column);
        if (effectivePredicateDomain == null) {
            // the predicate does not constrain this column
            continue;
        }

        // ParquetMetadataConverter#fromParquetColumnIndex returns null if the parquet primitive type does not support min/max stats
        if (!isMinMaxStatsSupported(column.getPrimitiveType())) {
            continue;
        }

        ColumnIndex columnIndex = columnIndexStore.getColumnIndex(ColumnPath.get(column.getPath()));
        if (columnIndex == null) {
            // no column index available, nothing to test against
            continue;
        }

        Long columnValueCount = valueCounts.get(column);
        if (columnValueCount == null) {
            throw new IllegalArgumentException(format("Missing columnValueCount for column %s in %s", column, id));
        }
        Domain domain = getDomain(effectivePredicateDomain.getType(), columnValueCount, columnIndex, id, column, timeZone);
        if (!effectivePredicateDomain.overlaps(domain)) {
            // one non-overlapping column is enough to skip the section
            return false;
        }
    }
    return true;
}
/**
 * Should the Parquet Reader process a file section with the specified bloomfilter Store
 *
 * @param bloomFilterStore bloomfilter Store
 * @param domainCompactionThreshold threshold forwarded to the discrete-value extraction; a column
 * whose predicate values cannot be extracted as discrete values is not checked
 * @return {@code false} if the predicate is none or, for some column, none of the discrete
 * predicate values is found in that column's bloom filter; {@code true} otherwise
 * @throws NullPointerException if {@code bloomFilterStore} is null
 */
public boolean matches(BloomFilterStore bloomFilterStore, int domainCompactionThreshold)
{
    requireNonNull(bloomFilterStore, "bloomFilterStore is null");
    if (effectivePredicate.isNone()) {
        return false;
    }
    Map<ColumnDescriptor, Domain> effectivePredicateDomains = effectivePredicate.getDomains()
            .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));

    for (ColumnDescriptor column : columns) {
        Domain effectivePredicateDomain = effectivePredicateDomains.get(column);
        // the bloom filter bitset contains only non-null values so isn't helpful
        if (effectivePredicateDomain == null || effectivePredicateDomain.isNullAllowed()) {
            continue;
        }

        Optional<Collection<Object>> discreteValues = extractDiscreteValues(domainCompactionThreshold, effectivePredicateDomain.getValues());
        // values are not discrete, so bloom filter isn't helpful
        if (discreteValues.isEmpty()) {
            continue;
        }

        Optional<BloomFilter> bloomFilterOptional = bloomFilterStore.getBloomFilter(ColumnPath.get(column.getPath()));
        if (bloomFilterOptional.isEmpty()) {
            // no bloom filter stored for this column
            continue;
        }
        BloomFilter bloomFilter = bloomFilterOptional.get();
        // the section can be skipped only when none of the wanted values may be present
        if (discreteValues.get().stream().noneMatch(value -> checkInBloomFilter(bloomFilter, value, effectivePredicateDomain.getType()))) {
            return false;
        }
    }
    return true;
}
/**
 * Convert Predicate to Parquet filter if possible.
 *
 * @param timeZone current Parquet timezone; passed to the conversion (this parameter shadows the
 * instance's own {@code timeZone} field)
 * @return the converted Parquet filter, or {@code Optional.empty()} when the conversion yields
 * null because conversion is not possible
 */
public Optional<FilterPredicate> toParquetFilter(DateTimeZone timeZone)
{
    return Optional.ofNullable(convertToParquetFilter(timeZone));
}
/**
 * Tests the predicate domain of a column against the domain that {@code getDomain} derives
 * from the column's dictionary.
 *
 * @param effectivePredicateDomain predicate domain of the dictionary's column
 * @param dictionary dictionary to derive the comparison domain from
 * @return whether the two domains overlap
 */
private boolean effectivePredicateMatches(Domain effectivePredicateDomain, DictionaryDescriptor dictionary)
{
    Domain dictionaryDomain = getDomain(effectivePredicateDomain.getType(), dictionary, timeZone);
    return effectivePredicateDomain.overlaps(dictionaryDomain);
}
@VisibleForTesting
public static Domain getDomain(
ColumnDescriptor column,
Type type,
long columnValuesCount,
Statistics> statistics,
ParquetDataSourceId id,
DateTimeZone timeZone)
throws ParquetCorruptionException
{
if (statistics == null || statistics.isEmpty()) {
return Domain.all(type);
}
if (statistics.isNumNullsSet() && statistics.getNumNulls() == columnValuesCount) {
return Domain.onlyNull(type);
}
boolean hasNullValue = !statistics.isNumNullsSet() || statistics.getNumNulls() != 0L;
if (!statistics.hasNonNullValue() || statistics.genericGetMin() == null || statistics.genericGetMax() == null) {
return Domain.create(ValueSet.all(type), hasNullValue);
}
try {
Object min = statistics.genericGetMin();
Object max = statistics.genericGetMax();
return getDomain(
column,
type,
ImmutableList.of(min instanceof Binary ? Slices.wrappedBuffer(((Binary) min).getBytes()) : min),
ImmutableList.of(max instanceof Binary ? Slices.wrappedBuffer(((Binary) max).getBytes()) : max),
hasNullValue,
timeZone);
}
catch (Exception e) {
throw corruptionException(column.toString(), id, statistics, e);
}
}
/**
* Get a domain for the ranges defined by each pair of elements from {@code minimums} and {@code maximums}.
* Both arrays must have the same length.
*/
private static Domain getDomain(
ColumnDescriptor column,
Type type,
List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy