All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.trino.orc.metadata.statistics.StringStatisticsBuilder Maven / Gradle / Ivy

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.orc.metadata.statistics;

import io.airlift.slice.Slice;
import io.airlift.slice.Slices;

import java.util.List;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Verify.verify;
import static io.airlift.slice.SliceUtf8.codePointToUtf8;
import static io.airlift.slice.SliceUtf8.getCodePointAt;
import static io.airlift.slice.Slices.EMPTY_SLICE;
import static io.trino.orc.metadata.statistics.StringStatistics.STRING_VALUE_BYTES_OVERHEAD;
import static java.lang.Character.MAX_CODE_POINT;
import static java.lang.Math.addExact;
import static java.lang.System.arraycopy;
import static java.util.Objects.requireNonNull;

public class StringStatisticsBuilder
        implements SliceColumnStatisticsBuilder
{
    private final int stringStatisticsLimitInBytes;

    private long nonNullValueCount;
    private Slice minimum;
    private Slice maximum;
    private long sum;
    private final BloomFilterBuilder bloomFilterBuilder;
    private final boolean shouldCompactMinMax;

    public StringStatisticsBuilder(int stringStatisticsLimitInBytes, BloomFilterBuilder bloomFilterBuilder)
    {
        this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder, true);
    }

    public StringStatisticsBuilder(int stringStatisticsLimitInBytes, BloomFilterBuilder bloomFilterBuilder, boolean shouldCompactMinMax)
    {
        this(stringStatisticsLimitInBytes, 0, null, null, 0, bloomFilterBuilder, shouldCompactMinMax);
    }

    private StringStatisticsBuilder(
            int stringStatisticsLimitInBytes,
            long nonNullValueCount,
            Slice minimum,
            Slice maximum,
            long sum,
            BloomFilterBuilder bloomFilterBuilder,
            boolean shouldCompactMinMax)
    {
        this.stringStatisticsLimitInBytes = stringStatisticsLimitInBytes;
        this.nonNullValueCount = nonNullValueCount;
        this.minimum = minimum;
        this.maximum = maximum;
        this.sum = sum;
        this.bloomFilterBuilder = requireNonNull(bloomFilterBuilder, "bloomFilterBuilder");
        this.shouldCompactMinMax = shouldCompactMinMax;
    }

    public long getNonNullValueCount()
    {
        return nonNullValueCount;
    }

    @Override
    public void addValue(Slice value)
    {
        requireNonNull(value, "value is null");

        if (nonNullValueCount == 0) {
            checkState(minimum == null && maximum == null);
            minimum = value;
            maximum = value;
        }
        else if (minimum != null && value.compareTo(minimum) <= 0) {
            minimum = value;
        }
        else if (maximum != null && value.compareTo(maximum) >= 0) {
            maximum = value;
        }
        bloomFilterBuilder.addString(value);

        nonNullValueCount++;
        sum = addExact(sum, value.length());
    }

    /**
     * This method can only be used in merging stats.
     * It assumes min or max could be nulls.
     */
    private void addStringStatistics(long valueCount, StringStatistics value)
    {
        requireNonNull(value, "value is null");
        checkArgument(valueCount > 0, "valueCount is 0");
        checkArgument(value.getMin() != null || value.getMax() != null, "min and max cannot both be null");

        if (nonNullValueCount == 0) {
            checkState(minimum == null && maximum == null);
            minimum = value.getMin();
            maximum = value.getMax();
        }
        else {
            if (minimum != null && (value.getMin() == null || minimum.compareTo(value.getMin()) > 0)) {
                minimum = value.getMin();
            }
            if (maximum != null && (value.getMax() == null || maximum.compareTo(value.getMax()) < 0)) {
                maximum = value.getMax();
            }
        }

        nonNullValueCount += valueCount;
        sum = addExact(sum, value.getSum());
    }

    private Optional buildStringStatistics()
    {
        if (nonNullValueCount == 0) {
            return Optional.empty();
        }
        minimum = computeStringMinMax(minimum, true);
        maximum = computeStringMinMax(maximum, false);
        if (minimum == null && maximum == null) {
            // Create string stats only when min or max is not null.
            // This corresponds to the behavior of metadata reader.
            return Optional.empty();
        }
        return Optional.of(new StringStatistics(minimum, maximum, sum));
    }

    @Override
    public ColumnStatistics buildColumnStatistics()
    {
        Optional stringStatistics = buildStringStatistics();
        stringStatistics.ifPresent(s -> verify(nonNullValueCount > 0));
        return new ColumnStatistics(
                nonNullValueCount,
                stringStatistics.map(s -> STRING_VALUE_BYTES_OVERHEAD + sum / nonNullValueCount).orElse(0L),
                null,
                null,
                null,
                null,
                stringStatistics.orElse(null),
                null,
                null,
                null,
                null,
                bloomFilterBuilder.buildBloomFilter());
    }

    public static Optional mergeStringStatistics(List stats)
    {
        // no need to set the stats limit for the builder given we assume the given stats are within the same limit
        StringStatisticsBuilder stringStatisticsBuilder = new StringStatisticsBuilder(Integer.MAX_VALUE, new NoOpBloomFilterBuilder());
        for (ColumnStatistics columnStatistics : stats) {
            StringStatistics partialStatistics = columnStatistics.getStringStatistics();
            if (columnStatistics.getNumberOfValues() > 0) {
                if (partialStatistics == null || (partialStatistics.getMin() == null && partialStatistics.getMax() == null)) {
                    // there are non null values but no statistics, so we cannot say anything about the data
                    return Optional.empty();
                }
                stringStatisticsBuilder.addStringStatistics(columnStatistics.getNumberOfValues(), partialStatistics);
            }
        }
        return stringStatisticsBuilder.buildStringStatistics();
    }

    private Slice computeStringMinMax(Slice minOrMax, boolean isMin)
    {
        if (minOrMax == null || (!shouldCompactMinMax && minOrMax.length() > stringStatisticsLimitInBytes)) {
            return null;
        }
        if (minOrMax.length() > stringStatisticsLimitInBytes) {
            if (isMin) {
                return StringCompactor.truncateMin(minOrMax, stringStatisticsLimitInBytes);
            }
            return StringCompactor.truncateMax(minOrMax, stringStatisticsLimitInBytes);
        }

        // Do not hold the entire slice where the actual stats could be small
        if (minOrMax.isCompact()) {
            return minOrMax;
        }
        return minOrMax.copy();
    }

    static final class StringCompactor
    {
        private static final int INDEX_NOT_FOUND = -1;

        private StringCompactor() {}

        public static Slice truncateMin(Slice slice, int maxBytes)
        {
            checkArgument(slice.length() > maxBytes);
            int lastIndex = findLastCharacterInRange(slice, maxBytes);
            if (lastIndex == INDEX_NOT_FOUND) {
                return EMPTY_SLICE;
            }
            return slice.slice(0, lastIndex);
        }

        public static Slice truncateMax(Slice slice, int maxBytes)
        {
            int firstRemovedCharacterIndex = findLastCharacterInRange(slice, maxBytes);
            int lastRetainedCharacterIndex = findLastCharacterInRange(slice, firstRemovedCharacterIndex - 1);
            if (firstRemovedCharacterIndex == INDEX_NOT_FOUND || lastRetainedCharacterIndex == INDEX_NOT_FOUND) {
                return EMPTY_SLICE;
            }
            int lastRetainedCharacter = getCodePointAt(slice, lastRetainedCharacterIndex);
            while (lastRetainedCharacter == MAX_CODE_POINT && lastRetainedCharacterIndex > 0) {
                lastRetainedCharacterIndex = findLastCharacterInRange(slice, lastRetainedCharacterIndex - 1);
                if (lastRetainedCharacterIndex == INDEX_NOT_FOUND) {
                    return EMPTY_SLICE;
                }
                lastRetainedCharacter = getCodePointAt(slice, lastRetainedCharacterIndex);
            }

            if (lastRetainedCharacterIndex == 0 && lastRetainedCharacter == MAX_CODE_POINT) {
                // whole string is made of MAX_CODE_POINT characters, we cannot provide upper bound that is shorter than maxBytes
                return EMPTY_SLICE;
            }

            lastRetainedCharacter++;
            Slice sliceToAppend = codePointToUtf8(lastRetainedCharacter);
            byte[] result = new byte[lastRetainedCharacterIndex + sliceToAppend.length()];
            arraycopy(slice.byteArray(), slice.byteArrayOffset(), result, 0, lastRetainedCharacterIndex);
            arraycopy(sliceToAppend.byteArray(), 0, result, lastRetainedCharacterIndex, sliceToAppend.length());
            return Slices.wrappedBuffer(result);
        }

        private static int findLastCharacterInRange(Slice slice, int toInclusive)
        {
            int pos = toInclusive;
            while (pos >= 0) {
                if (isUtfBlockStartChar(slice.getByte(pos))) {
                    return pos;
                }
                pos--;
            }
            return INDEX_NOT_FOUND;
        }

        private static boolean isUtfBlockStartChar(byte b)
        {
            return (b & 0xC0) != 0x80;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy