All downloads are free. Search and download functionality uses the official Maven repository.

io.trino.orc.metadata.statistics.ColumnStatistics Maven / Gradle / Ivy

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.orc.metadata.statistics;

import io.trino.orc.metadata.statistics.StatisticsHasher.Hashable;

import java.util.List;
import java.util.Objects;

import static com.google.common.base.MoreObjects.toStringHelper;
import static io.airlift.slice.SizeOf.instanceSize;
import static io.trino.orc.metadata.statistics.BinaryStatisticsBuilder.mergeBinaryStatistics;
import static io.trino.orc.metadata.statistics.BooleanStatisticsBuilder.mergeBooleanStatistics;
import static io.trino.orc.metadata.statistics.DateStatisticsBuilder.mergeDateStatistics;
import static io.trino.orc.metadata.statistics.DoubleStatisticsBuilder.mergeDoubleStatistics;
import static io.trino.orc.metadata.statistics.IntegerStatisticsBuilder.mergeIntegerStatistics;
import static io.trino.orc.metadata.statistics.LongDecimalStatisticsBuilder.mergeDecimalStatistics;
import static io.trino.orc.metadata.statistics.StringStatisticsBuilder.mergeStringStatistics;
import static io.trino.orc.metadata.statistics.TimestampStatisticsBuilder.mergeTimestampStatistics;

/**
 * Immutable holder for the statistics recorded for a single column: an optional
 * value count, a lower bound on the average value size, a NaN count, one optional
 * statistics object per value kind, and an optional bloom filter.
 * <p>
 * Every statistics reference and the bloom filter may be {@code null}, meaning
 * that the corresponding statistic is not present.
 * <p>
 * NOTE(review): {@link #equals(Object)}, {@link #hashCode()} and
 * {@link #addHash(StatisticsHasher)} do not take {@code minAverageValueSizeInBytes}
 * or {@code numberOfNanValues} into account. That behavior is preserved here;
 * confirm with the owners that the omission is intentional.
 */
public class ColumnStatistics
        implements Hashable
{
    private static final int INSTANCE_SIZE = instanceSize(ColumnStatistics.class);

    private final boolean hasNumberOfValues;
    private final long numberOfValues;
    private final long minAverageValueSizeInBytes;
    private final BooleanStatistics booleanStatistics;
    private final IntegerStatistics integerStatistics;
    private final DoubleStatistics doubleStatistics;
    private final long numberOfNanValues;
    private final StringStatistics stringStatistics;
    private final DateStatistics dateStatistics;
    private final TimestampStatistics timestampStatistics;
    private final DecimalStatistics decimalStatistics;
    private final BinaryStatistics binaryStatistics;
    private final BloomFilter bloomFilter;

    /**
     * Creates a new statistics holder.
     *
     * @param numberOfValues the value count, or {@code null} when the count is not available
     * @param minAverageValueSizeInBytes lower bound of the average value size in bytes
     * @param booleanStatistics boolean statistics, or {@code null} if absent
     * @param integerStatistics integer statistics, or {@code null} if absent
     * @param doubleStatistics double statistics, or {@code null} if absent
     * @param numberOfNanValues the NaN count; {@code null} is stored as {@code 0}
     * @param stringStatistics string statistics, or {@code null} if absent
     * @param dateStatistics date statistics, or {@code null} if absent
     * @param timestampStatistics timestamp statistics, or {@code null} if absent
     * @param decimalStatistics decimal statistics, or {@code null} if absent
     * @param binaryStatistics binary statistics, or {@code null} if absent
     * @param bloomFilter bloom filter, or {@code null} if absent
     */
    public ColumnStatistics(
            Long numberOfValues,
            long minAverageValueSizeInBytes,
            BooleanStatistics booleanStatistics,
            IntegerStatistics integerStatistics,
            DoubleStatistics doubleStatistics,
            Long numberOfNanValues,
            StringStatistics stringStatistics,
            DateStatistics dateStatistics,
            TimestampStatistics timestampStatistics,
            DecimalStatistics decimalStatistics,
            BinaryStatistics binaryStatistics,
            BloomFilter bloomFilter)
    {
        // A null count means "unknown"; presence is tracked separately so the
        // count itself can be stored as a primitive.
        this.hasNumberOfValues = numberOfValues != null;
        this.numberOfValues = hasNumberOfValues ? numberOfValues : 0;
        this.minAverageValueSizeInBytes = minAverageValueSizeInBytes;
        this.booleanStatistics = booleanStatistics;
        this.integerStatistics = integerStatistics;
        this.doubleStatistics = doubleStatistics;
        this.numberOfNanValues = numberOfNanValues != null ? numberOfNanValues : 0;
        this.stringStatistics = stringStatistics;
        this.dateStatistics = dateStatistics;
        this.timestampStatistics = timestampStatistics;
        this.decimalStatistics = decimalStatistics;
        this.binaryStatistics = binaryStatistics;
        this.bloomFilter = bloomFilter;
    }

    /**
     * @return {@code true} if a value count was supplied at construction time
     */
    public boolean hasNumberOfValues()
    {
        return hasNumberOfValues;
    }

    /**
     * @return the value count, or {@code 0} when no count is available;
     * use {@link #hasNumberOfValues()} to tell the two cases apart
     */
    public long getNumberOfValues()
    {
        return hasNumberOfValues ? numberOfValues : 0;
    }

    /**
     * @return {@code true} if a value count is available and it is greater than zero
     */
    public boolean hasMinAverageValueSizeInBytes()
    {
        return hasNumberOfValues() && numberOfValues > 0;
    }

    /**
     * The minimum average value sizes.
     * The actual average value size is no less than the return value.
     * It provides a lower bound of the size of data to be loaded
     */
    public long getMinAverageValueSizeInBytes()
    {
        // it is ok to return 0 if the size does not exist given it is a lower bound
        return minAverageValueSizeInBytes;
    }

    /**
     * @return the boolean statistics, or {@code null} if absent
     */
    public BooleanStatistics getBooleanStatistics()
    {
        return booleanStatistics;
    }

    /**
     * @return the date statistics, or {@code null} if absent
     */
    public DateStatistics getDateStatistics()
    {
        return dateStatistics;
    }

    /**
     * @return the double statistics, or {@code null} if absent
     */
    public DoubleStatistics getDoubleStatistics()
    {
        return doubleStatistics;
    }

    /**
     * @return the NaN count; {@code 0} if none was supplied at construction time
     */
    public long getNumberOfNanValues()
    {
        return numberOfNanValues;
    }

    /**
     * @return the integer statistics, or {@code null} if absent
     */
    public IntegerStatistics getIntegerStatistics()
    {
        return integerStatistics;
    }

    /**
     * @return the string statistics, or {@code null} if absent
     */
    public StringStatistics getStringStatistics()
    {
        return stringStatistics;
    }

    /**
     * @return the decimal statistics, or {@code null} if absent
     */
    public DecimalStatistics getDecimalStatistics()
    {
        return decimalStatistics;
    }

    /**
     * @return the binary statistics, or {@code null} if absent
     */
    public BinaryStatistics getBinaryStatistics()
    {
        return binaryStatistics;
    }

    /**
     * @return the timestamp statistics, or {@code null} if absent
     */
    public TimestampStatistics getTimestampStatistics()
    {
        return timestampStatistics;
    }

    /**
     * @return the bloom filter, or {@code null} if absent
     */
    public BloomFilter getBloomFilter()
    {
        return bloomFilter;
    }

    /**
     * Returns a copy of these statistics with the bloom filter replaced.
     *
     * @param bloomFilter the bloom filter for the copy; may be {@code null}
     * @return a new instance that differs from this one only in its bloom filter
     */
    public ColumnStatistics withBloomFilter(BloomFilter bloomFilter)
    {
        // Forward null (not a boxed 0) when the count is absent; otherwise the
        // copy would report hasNumberOfValues() == true and no longer be equal
        // to this instance in that respect.
        Long valueCount = hasNumberOfValues ? Long.valueOf(numberOfValues) : null;
        return new ColumnStatistics(
                valueCount,
                minAverageValueSizeInBytes,
                booleanStatistics,
                integerStatistics,
                doubleStatistics,
                numberOfNanValues,
                stringStatistics,
                dateStatistics,
                timestampStatistics,
                decimalStatistics,
                binaryStatistics,
                bloomFilter);
    }

    /**
     * @return the instance size of this object plus the retained sizes reported by
     * each non-null statistics object and the bloom filter
     */
    public long getRetainedSizeInBytes()
    {
        long retainedSizeInBytes = INSTANCE_SIZE;
        if (booleanStatistics != null) {
            retainedSizeInBytes += booleanStatistics.getRetainedSizeInBytes();
        }
        if (integerStatistics != null) {
            retainedSizeInBytes += integerStatistics.getRetainedSizeInBytes();
        }
        if (doubleStatistics != null) {
            retainedSizeInBytes += doubleStatistics.getRetainedSizeInBytes();
        }
        if (stringStatistics != null) {
            retainedSizeInBytes += stringStatistics.getRetainedSizeInBytes();
        }
        if (dateStatistics != null) {
            retainedSizeInBytes += dateStatistics.getRetainedSizeInBytes();
        }
        if (timestampStatistics != null) {
            retainedSizeInBytes += timestampStatistics.getRetainedSizeInBytes();
        }
        if (decimalStatistics != null) {
            retainedSizeInBytes += decimalStatistics.getRetainedSizeInBytes();
        }
        if (binaryStatistics != null) {
            retainedSizeInBytes += binaryStatistics.getRetainedSizeInBytes();
        }
        if (bloomFilter != null) {
            retainedSizeInBytes += bloomFilter.getRetainedSizeInBytes();
        }
        return retainedSizeInBytes;
    }

    /**
     * Compares the value-count presence flag, the value count, every statistics
     * object and the bloom filter. {@code minAverageValueSizeInBytes} and
     * {@code numberOfNanValues} are not compared.
     */
    @Override
    public boolean equals(Object o)
    {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        ColumnStatistics that = (ColumnStatistics) o;
        return hasNumberOfValues == that.hasNumberOfValues &&
                getNumberOfValues() == that.getNumberOfValues() &&
                Objects.equals(booleanStatistics, that.booleanStatistics) &&
                Objects.equals(integerStatistics, that.integerStatistics) &&
                Objects.equals(doubleStatistics, that.doubleStatistics) &&
                Objects.equals(stringStatistics, that.stringStatistics) &&
                Objects.equals(dateStatistics, that.dateStatistics) &&
                Objects.equals(timestampStatistics, that.timestampStatistics) &&
                Objects.equals(decimalStatistics, that.decimalStatistics) &&
                Objects.equals(binaryStatistics, that.binaryStatistics) &&
                Objects.equals(bloomFilter, that.bloomFilter);
    }

    /**
     * Hashes exactly the fields compared by {@link #equals(Object)}.
     */
    @Override
    public int hashCode()
    {
        return Objects.hash(
                hasNumberOfValues,
                getNumberOfValues(),
                booleanStatistics,
                integerStatistics,
                doubleStatistics,
                stringStatistics,
                dateStatistics,
                timestampStatistics,
                decimalStatistics,
                binaryStatistics,
                bloomFilter);
    }

    /**
     * Renders the value count followed by every non-null statistics object and
     * the bloom filter; null entries are omitted.
     */
    @Override
    public String toString()
    {
        return toStringHelper(this)
                .omitNullValues()
                .add("numberOfValues", getNumberOfValues())
                .add("booleanStatistics", booleanStatistics)
                .add("integerStatistics", integerStatistics)
                .add("doubleStatistics", doubleStatistics)
                .add("stringStatistics", stringStatistics)
                .add("dateStatistics", dateStatistics)
                .add("timestampStatistics", timestampStatistics)
                .add("decimalStatistics", decimalStatistics)
                .add("binaryStatistics", binaryStatistics)
                .add("bloomFilter", bloomFilter)
                .toString();
    }

    /**
     * Feeds the optional value count, each statistics object and the bloom
     * filter into the given hasher, in that order.
     *
     * @param hasher the hasher receiving this object's contents
     */
    @Override
    public void addHash(StatisticsHasher hasher)
    {
        hasher.putOptionalLong(hasNumberOfValues, numberOfValues)
                .putOptionalHashable(booleanStatistics)
                .putOptionalHashable(integerStatistics)
                .putOptionalHashable(doubleStatistics)
                .putOptionalHashable(stringStatistics)
                .putOptionalHashable(dateStatistics)
                .putOptionalHashable(timestampStatistics)
                .putOptionalHashable(decimalStatistics)
                .putOptionalHashable(binaryStatistics)
                .putOptionalHashable(bloomFilter);
    }

    /**
     * Merges a list of column statistics into a single instance.
     * <p>
     * The merged value count and NaN count are the sums of the inputs. The merged
     * minimum average value size is the average of the inputs' sizes weighted by
     * their value counts (integer division), or {@code 0} when the total count is
     * not positive. The merged instance never carries a bloom filter.
     *
     * @param stats the statistics to merge
     * @return the merged statistics; its value count is always present
     */
    public static ColumnStatistics mergeColumnStatistics(List<ColumnStatistics> stats)
    {
        long numberOfRows = stats.stream()
                .mapToLong(ColumnStatistics::getNumberOfValues)
                .sum();

        long minAverageValueBytes = 0;
        // Guard the division: with no rows there is nothing to average.
        if (numberOfRows > 0) {
            minAverageValueBytes = stats.stream()
                    .mapToLong(s -> s.getMinAverageValueSizeInBytes() * s.getNumberOfValues())
                    .sum() / numberOfRows;
        }

        long numberOfNanValues = stats.stream()
                .mapToLong(ColumnStatistics::getNumberOfNanValues)
                .sum();

        return new ColumnStatistics(
                numberOfRows,
                minAverageValueBytes,
                mergeBooleanStatistics(stats).orElse(null),
                mergeIntegerStatistics(stats).orElse(null),
                mergeDoubleStatistics(stats).orElse(null),
                numberOfNanValues,
                mergeStringStatistics(stats).orElse(null),
                mergeDateStatistics(stats).orElse(null),
                mergeTimestampStatistics(stats).orElse(null),
                mergeDecimalStatistics(stats).orElse(null),
                mergeBinaryStatistics(stats).orElse(null),
                null);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy