io.trino.parquet.predicate.TupleDomainParquetPredicate Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of trino-parquet Show documentation
Trino - Parquet file format support
There is a newer version: 464
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.parquet.predicate;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.VerifyException;
import com.google.common.collect.ImmutableList;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import io.trino.parquet.BloomFilterStore;
import io.trino.parquet.DictionaryPage;
import io.trino.parquet.ParquetCorruptionException;
import io.trino.parquet.ParquetDataSourceId;
import io.trino.parquet.dictionary.Dictionary;
import io.trino.plugin.base.type.TrinoTimestampEncoder;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.SortedRangeSet;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.predicate.ValueSet;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.Int128;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.Type;
import io.trino.spi.type.UuidType;
import io.trino.spi.type.VarbinaryType;
import io.trino.spi.type.VarcharType;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators;
import org.apache.parquet.filter2.predicate.UserDefinedPredicate;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveType;
import org.joda.time.DateTimeZone;

import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;

import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.parquet.ParquetMetadataConverter.isMinMaxStatsSupported;
import static io.trino.parquet.ParquetTimestampUtils.decodeInt64Timestamp;
import static io.trino.parquet.ParquetTimestampUtils.decodeInt96Timestamp;
import static io.trino.parquet.ParquetTypeUtils.getShortDecimalValue;
import static io.trino.parquet.predicate.PredicateUtils.isStatisticsOverflow;
import static io.trino.plugin.base.type.TrinoTimestampEncoderFactory.createTimestampEncoder;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.BooleanType.BOOLEAN;
import static io.trino.spi.type.DateType.DATE;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TinyintType.TINYINT;
import static java.lang.Float.floatToRawIntBits;
import static java.lang.Float.intBitsToFloat;
import static java.lang.Math.toIntExact;
import static java.lang.String.format;
import static java.nio.ByteOrder.LITTLE_ENDIAN;
import static java.util.Objects.requireNonNull;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;

public class TupleDomainParquetPredicate
{
    private final TupleDomain effectivePredicate;
    private final List columns;
    private final DateTimeZone timeZone;

    public TupleDomainParquetPredicate(TupleDomain effectivePredicate, List columns, DateTimeZone timeZone)
    {
        this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null");
        this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null"));
        this.timeZone = requireNonNull(timeZone, "timeZone is null");
    }

    /**
     * Should the Parquet Reader process a file section with the specified statistics,
     * and if it should, then return the columns are candidates for further inspection of more
     * granular statistics from column index and dictionary.
     *
     * @param valueCounts the number of values for a column in the segment; this can be used with
     * Statistics to determine if a column is only null
     * @param statistics column statistics
     * @param id Parquet file name
     *
     * @return Optional.empty() if statistics were sufficient to eliminate the file section.
     * Otherwise, a list of columns for which page-level indices and dictionary could be consulted
     * to potentially eliminate the file section. An optional with empty list is returned if there is
     * going to be no benefit in looking at column index or dictionary for any column.
     */
    public Optional> getIndexLookupCandidates(
            Map valueCounts,
            Map> statistics,
            ParquetDataSourceId id)
            throws ParquetCorruptionException
    {
        if (effectivePredicate.isNone()) {
            return Optional.empty();
        }
        Map effectivePredicateDomains = effectivePredicate.getDomains()
                .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));

        ImmutableList.Builder candidateColumns = ImmutableList.builder();
        for (ColumnDescriptor column : columns) {
            Domain effectivePredicateDomain = effectivePredicateDomains.get(column);
            if (effectivePredicateDomain == null) {
                continue;
            }

            Statistics columnStatistics = statistics.get(column);
            if (columnStatistics == null || columnStatistics.isEmpty()) {
                // no stats for column
                candidateColumns.add(column);
                continue;
            }

            Long columnValueCount = valueCounts.get(column);
            if (columnValueCount == null) {
                throw new IllegalArgumentException(format("Missing columnValueCount for column %s in %s", column, id));
            }
            Domain domain = getDomain(
                    column,
                    effectivePredicateDomain.getType(),
                    columnValueCount,
                    columnStatistics,
                    id,
                    timeZone);
            if (!effectivePredicateDomain.overlaps(domain)) {
                return Optional.empty();
            }
            // If the predicate domain on a column includes the entire domain from column row-group statistics,
            // then more granular statistics from page stats or dictionary for this column will not help to eliminate the row-group.
            if (!effectivePredicateDomain.contains(domain)) {
                candidateColumns.add(column);
            }
        }
        return Optional.of(candidateColumns.build());
    }

    /**
     * Should the Parquet Reader process a file section with the specified dictionary based on that
     * single dictionary. This is safe to check repeatedly to avoid loading more parquet dictionaries
     * if the section can already be eliminated.
     *
     * @param dictionary The single column dictionary
     */
    public boolean matches(DictionaryDescriptor dictionary)
    {
        requireNonNull(dictionary, "dictionary is null");
        if (effectivePredicate.isNone()) {
            return false;
        }
        Map effectivePredicateDomains = effectivePredicate.getDomains()
                .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));

        Domain effectivePredicateDomain = effectivePredicateDomains.get(dictionary.getColumnDescriptor());

        return effectivePredicateDomain == null || effectivePredicateMatches(effectivePredicateDomain, dictionary);
    }

    /**
     * Should the Parquet Reader process a file section with the specified statistics.
     *
     * @param valueCounts the number of values for a column in the segment; this can be used with
     * Statistics to determine if a column is only null
     * @param columnIndexStore column index (statistics) store
     * @param id Parquet file name
     */
    public boolean matches(Map valueCounts, ColumnIndexStore columnIndexStore, ParquetDataSourceId id)
            throws ParquetCorruptionException
    {
        requireNonNull(columnIndexStore, "columnIndexStore is null");
        if (effectivePredicate.isNone()) {
            return false;
        }

        Map effectivePredicateDomains = effectivePredicate.getDomains()
                .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));

        for (ColumnDescriptor column : columns) {
            Domain effectivePredicateDomain = effectivePredicateDomains.get(column);
            if (effectivePredicateDomain == null) {
                continue;
            }

            // ParquetMetadataConverter#fromParquetColumnIndex returns null if the parquet primitive type does not support min/max stats
            if (!isMinMaxStatsSupported(column.getPrimitiveType())) {
                continue;
            }
            ColumnIndex columnIndex = columnIndexStore.getColumnIndex(ColumnPath.get(column.getPath()));
            if (columnIndex == null) {
                continue;
            }

            Long columnValueCount = valueCounts.get(column);
            if (columnValueCount == null) {
                throw new IllegalArgumentException(format("Missing columnValueCount for column %s in %s", column, id));
            }
            Domain domain = getDomain(effectivePredicateDomain.getType(), columnValueCount, columnIndex, id, column, timeZone);
            if (!effectivePredicateDomain.overlaps(domain)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Should the Parquet Reader process a file section with the specified bloomfilter Store
     *
     * @param bloomFilterStore bloomfilter Store
     */
    public boolean matches(BloomFilterStore bloomFilterStore, int domainCompactionThreshold)
    {
        requireNonNull(bloomFilterStore, "bloomFilterStore is null");

        if (effectivePredicate.isNone()) {
            return false;
        }
        Map effectivePredicateDomains = effectivePredicate.getDomains()
                .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains"));

        for (ColumnDescriptor column : columns) {
            Domain effectivePredicateDomain = effectivePredicateDomains.get(column);

            // the bloom filter bitset contains only non-null values so isn't helpful
            if (effectivePredicateDomain == null || effectivePredicateDomain.isNullAllowed()) {
                continue;
            }

            Optional> discreteValues = extractDiscreteValues(domainCompactionThreshold, effectivePredicateDomain.getValues());
            // values are not discrete, so bloom filter isn't helpful
            if (discreteValues.isEmpty()) {
                continue;
            }

            Optional bloomFilterOptional = bloomFilterStore.getBloomFilter(ColumnPath.get(column.getPath()));
            if (bloomFilterOptional.isEmpty()) {
                continue;
            }
            BloomFilter bloomFilter = bloomFilterOptional.get();
            if (discreteValues.get().stream().noneMatch(value -> checkInBloomFilter(bloomFilter, value, effectivePredicateDomain.getType()))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Convert Predicate to Parquet filter if possible.
     *
     * @param timeZone current Parquet timezone
     * @return Converted Parquet filter or null if conversion not possible
     */
    public Optional toParquetFilter(DateTimeZone timeZone)
    {
        return Optional.ofNullable(convertToParquetFilter(timeZone));
    }

    private boolean effectivePredicateMatches(Domain effectivePredicateDomain, DictionaryDescriptor dictionary)
    {
        return effectivePredicateDomain.overlaps(getDomain(effectivePredicateDomain.getType(), dictionary, timeZone));
    }

    @VisibleForTesting
    public static Domain getDomain(
            ColumnDescriptor column,
            Type type,
            long columnValuesCount,
            Statistics statistics,
            ParquetDataSourceId id,
            DateTimeZone timeZone)
            throws ParquetCorruptionException
    {
        if (statistics == null || statistics.isEmpty()) {
            return Domain.all(type);
        }

        if (statistics.isNumNullsSet() && statistics.getNumNulls() == columnValuesCount) {
            return Domain.onlyNull(type);
        }

        boolean hasNullValue = !statistics.isNumNullsSet() || statistics.getNumNulls() != 0L;

        if (!statistics.hasNonNullValue() || statistics.genericGetMin() == null || statistics.genericGetMax() == null) {
            return Domain.create(ValueSet.all(type), hasNullValue);
        }

        try {
            Object min = statistics.genericGetMin();
            Object max = statistics.genericGetMax();
            return getDomain(
                    column,
                    type,
                    ImmutableList.of(min instanceof Binary ? Slices.wrappedBuffer(((Binary) min).getBytes()) : min),
                    ImmutableList.of(max instanceof Binary ? Slices.wrappedBuffer(((Binary) max).getBytes()) : max),
                    hasNullValue,
                    timeZone);
        }
        catch (Exception e) {
            throw corruptionException(column.toString(), id, statistics, e);
        }
    }

    /**
     * Get a domain for the ranges defined by each pair of elements from {@code minimums} and {@code maximums}.
     * Both arrays must have the same length.
     */
    private static Domain getDomain(
            ColumnDescriptor column,
            Type type,
            List