All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.facebook.presto.orc.DictionaryCompressionOptimizer Maven / Gradle / Ivy

There is a newer version: 0.291
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Longs;
import io.airlift.units.DataSize;
import io.airlift.units.DataSize.Unit;

import java.util.HashSet;
import java.util.OptionalInt;
import java.util.Set;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toSet;

public class DictionaryCompressionOptimizer
{
    private static final double DICTIONARY_MIN_COMPRESSION_RATIO = 1.25;

    // Instead of waiting for the dictionary to fill completely, which would force a column into
    // direct mode, close the stripe early assuming it has hit the minimum row count.
    static final DataSize DICTIONARY_MEMORY_MAX_RANGE = new DataSize(4, Unit.MEGABYTE);

    static final DataSize DIRECT_COLUMN_SIZE_RANGE = new DataSize(4, Unit.MEGABYTE);

    private final Set allWriters;
    private final Set directConversionCandidates = new HashSet<>();

    private final int stripeMinBytes;
    private final int stripeMaxBytes;
    private final int stripeMaxRowCount;
    private final int dictionaryMemoryMaxBytesLow;
    private final int dictionaryMemoryMaxBytesHigh;

    private int dictionaryMemoryBytes;

    public DictionaryCompressionOptimizer(
            Set writers,
            int stripeMinBytes,
            int stripeMaxBytes,
            int stripeMaxRowCount,
            int dictionaryMemoryMaxBytes)
    {
        requireNonNull(writers, "writers is null");
        this.allWriters = ImmutableSet.copyOf(writers.stream()
                .map(DictionaryColumnManager::new)
                .collect(toSet()));

        checkArgument(stripeMinBytes >= 0, "stripeMinBytes is negative");
        this.stripeMinBytes = stripeMinBytes;

        checkArgument(stripeMaxBytes >= stripeMinBytes, "stripeMaxBytes is less than stripeMinBytes");
        this.stripeMaxBytes = stripeMaxBytes;

        checkArgument(stripeMaxRowCount >= 0, "stripeMaxRowCount is negative");
        this.stripeMaxRowCount = stripeMaxRowCount;

        checkArgument(dictionaryMemoryMaxBytes >= 0, "dictionaryMemoryMaxBytes is negative");
        this.dictionaryMemoryMaxBytesHigh = dictionaryMemoryMaxBytes;
        this.dictionaryMemoryMaxBytesLow = (int) Math.max(dictionaryMemoryMaxBytes - DICTIONARY_MEMORY_MAX_RANGE.toBytes(), 0);

        directConversionCandidates.addAll(allWriters);
    }

    public int getDictionaryMemoryBytes()
    {
        return dictionaryMemoryBytes;
    }

    public boolean isFull(long bufferedBytes)
    {
        // if the strip is big enough to flush, stop before we hit the absolute max, so we are
        // not forced to convert a dictionary to direct to fit in memory
        if (bufferedBytes > stripeMinBytes) {
            return dictionaryMemoryBytes > dictionaryMemoryMaxBytesLow;
        }
        // strip is small, grow to the high water mark (so at the very least we have more information)
        return dictionaryMemoryBytes > dictionaryMemoryMaxBytesHigh;
    }

    public void reset()
    {
        directConversionCandidates.clear();
        directConversionCandidates.addAll(allWriters);
        dictionaryMemoryBytes = 0;
        allWriters.forEach(DictionaryColumnManager::reset);
    }

    public void finalOptimize(int bufferedBytes)
    {
        convertLowCompressionStreams(bufferedBytes);
    }

    public void optimize(int bufferedBytes, int stripeRowCount)
    {
        // recompute the dictionary memory usage
        dictionaryMemoryBytes = allWriters.stream()
                .filter(writer -> !writer.isDirectEncoded())
                .mapToInt(DictionaryColumnManager::getDictionaryBytes)
                .sum();

        // update the dictionary growth history
        allWriters.stream()
                .filter(writer -> !writer.isDirectEncoded())
                .forEach(column -> column.updateHistory(stripeRowCount));

        if (dictionaryMemoryBytes <= dictionaryMemoryMaxBytesLow) {
            return;
        }

        // before any further checks, convert all low compression streams
        bufferedBytes = convertLowCompressionStreams(bufferedBytes);

        if (dictionaryMemoryBytes <= dictionaryMemoryMaxBytesLow || bufferedBytes >= stripeMaxBytes) {
            return;
        }

        // calculate size of non-dictionary columns by removing the buffered size of dictionary columns
        int nonDictionaryBufferedBytes = bufferedBytes;
        for (DictionaryColumnManager dictionaryWriter : allWriters) {
            if (!dictionaryWriter.isDirectEncoded()) {
                nonDictionaryBufferedBytes -= dictionaryWriter.getBufferedBytes();
            }
        }

        // convert dictionary columns to direct until we are below the high memory limit
        while (!directConversionCandidates.isEmpty() && dictionaryMemoryBytes > dictionaryMemoryMaxBytesHigh && bufferedBytes < stripeMaxBytes) {
            DictionaryCompressionProjection projection = selectDictionaryColumnToConvert(nonDictionaryBufferedBytes, stripeRowCount);
            int selectDictionaryColumnBufferedBytes = toIntExact(projection.getColumnToConvert().getBufferedBytes());

            OptionalInt directBytes = tryConvertToDirect(projection.getColumnToConvert(), getMaxDirectBytes(bufferedBytes));
            if (directBytes.isPresent()) {
                bufferedBytes = bufferedBytes + directBytes.getAsInt() - selectDictionaryColumnBufferedBytes;
                nonDictionaryBufferedBytes += directBytes.getAsInt();
            }
        }

        if (bufferedBytes >= stripeMaxBytes) {
            return;
        }

        // if the stripe is larger then the minimum stripe size, we are not required to convert any more dictionary columns to direct
        if (bufferedBytes >= stripeMinBytes) {
            // check if we can get better compression by converting a dictionary column to direct.  This can happen when then there are multiple
            // dictionary columns and one does not compress well, so if we convert it to direct we can continue to use the existing dictionaries
            // for the other columns.
            double currentCompressionRatio = currentCompressionRatio(nonDictionaryBufferedBytes);
            while (!directConversionCandidates.isEmpty() && bufferedBytes < stripeMaxBytes) {
                DictionaryCompressionProjection projection = selectDictionaryColumnToConvert(nonDictionaryBufferedBytes, stripeRowCount);
                if (projection.getPredictedFileCompressionRatio() < currentCompressionRatio) {
                    return;
                }

                int selectDictionaryColumnBufferedBytes = toIntExact(projection.getColumnToConvert().getBufferedBytes());
                OptionalInt directBytes = tryConvertToDirect(projection.getColumnToConvert(), getMaxDirectBytes(bufferedBytes));
                if (directBytes.isPresent()) {
                    bufferedBytes = bufferedBytes + directBytes.getAsInt() - selectDictionaryColumnBufferedBytes;
                    nonDictionaryBufferedBytes += directBytes.getAsInt();
                }
            }
        }
    }

    private int convertLowCompressionStreams(int bufferedBytes)
    {
        // convert all low compression column to direct
        for (DictionaryColumnManager dictionaryWriter : ImmutableList.copyOf(directConversionCandidates)) {
            if (dictionaryWriter.getCompressionRatio() < DICTIONARY_MIN_COMPRESSION_RATIO) {
                int columnBufferedBytes = toIntExact(dictionaryWriter.getBufferedBytes());
                OptionalInt directBytes = tryConvertToDirect(dictionaryWriter, getMaxDirectBytes(bufferedBytes));
                if (directBytes.isPresent()) {
                    bufferedBytes = bufferedBytes + directBytes.getAsInt() - columnBufferedBytes;
                    if (bufferedBytes >= stripeMaxBytes) {
                        return bufferedBytes;
                    }
                }
            }
        }
        return bufferedBytes;
    }

    private OptionalInt tryConvertToDirect(DictionaryColumnManager dictionaryWriter, int maxDirectBytes)
    {
        int dictionaryBytes = dictionaryWriter.getDictionaryBytes();
        OptionalInt directBytes = dictionaryWriter.tryConvertToDirect(maxDirectBytes);
        if (directBytes.isPresent()) {
            dictionaryMemoryBytes -= dictionaryBytes;
        }
        directConversionCandidates.remove(dictionaryWriter);
        return directBytes;
    }

    private double currentCompressionRatio(int totalNonDictionaryBytes)
    {
        long uncompressedBytes = totalNonDictionaryBytes;
        long compressedBytes = totalNonDictionaryBytes;

        for (DictionaryColumnManager column : allWriters) {
            if (!column.isDirectEncoded()) {
                uncompressedBytes += column.getRawBytes();
                compressedBytes += column.getDictionaryBytes();
            }
        }
        return 1.0 * uncompressedBytes / compressedBytes;
    }

    /**
     * Choose a dictionary column to convert to direct encoding.  We do this by predicting the compression ration
     * of the stripe if a singe column is flipped to direct.  So for each column, we try to predict the row count
     * when we will hit a stripe flush limit if that column were converted to direct.  Once we know the row count, we
     * calculate the predicted compression ratio.
     *
     * @param totalNonDictionaryBytes current size of the stripe without non-dictionary columns
     * @param stripeRowCount current number of rows in the stripe
     * @return the column that would produce the best stripe compression ration if converted to direct
     */
    private DictionaryCompressionProjection selectDictionaryColumnToConvert(int totalNonDictionaryBytes, int stripeRowCount)
    {
        checkState(!directConversionCandidates.isEmpty());

        int totalNonDictionaryBytesPerRow = totalNonDictionaryBytes / stripeRowCount;

        // rawBytes = sum of the length of every row value (without dictionary encoding)
        // dictionaryBytes = sum of the length of every entry in the dictionary
        // indexBytes = bytes used encode the dictionary index (e.g., 2 byte for dictionary less than 65536 entries)

        long totalDictionaryRawBytes = 0;
        long totalDictionaryBytes = 0;
        long totalDictionaryIndexBytes = 0;

        long totalDictionaryRawBytesPerRow = 0;
        long totalDictionaryBytesPerNewRow = 0;
        long totalDictionaryIndexBytesPerRow = 0;

        for (DictionaryColumnManager column : allWriters) {
            if (!column.isDirectEncoded()) {
                totalDictionaryRawBytes += column.getRawBytes();
                totalDictionaryBytes += column.getDictionaryBytes();
                totalDictionaryIndexBytes += column.getIndexBytes();

                totalDictionaryRawBytesPerRow += column.getRawBytesPerRow();
                totalDictionaryBytesPerNewRow += column.getDictionaryBytesPerFutureRow();
                totalDictionaryIndexBytesPerRow += column.getIndexBytesPerRow();
            }
        }

        long totalUncompressedBytesPerRow = totalNonDictionaryBytesPerRow + totalDictionaryRawBytesPerRow;

        DictionaryCompressionProjection maxProjectedCompression = null;
        for (DictionaryColumnManager column : directConversionCandidates) {
            // determine the size of the currently written stripe if we were convert this column to direct
            long currentRawBytes = totalNonDictionaryBytes + column.getRawBytes();
            long currentDictionaryBytes = totalDictionaryBytes - column.getDictionaryBytes();
            long currentIndexBytes = totalDictionaryIndexBytes - column.getIndexBytes();
            long currentTotalBytes = currentRawBytes + currentDictionaryBytes + currentIndexBytes;

            // estimate the size of each new row if we were convert this column to direct
            double rawBytesPerFutureRow = totalNonDictionaryBytesPerRow + column.getRawBytesPerRow();
            double dictionaryBytesPerFutureRow = totalDictionaryBytesPerNewRow - column.getDictionaryBytesPerFutureRow();
            double indexBytesPerFutureRow = totalDictionaryIndexBytesPerRow - column.getIndexBytesPerRow();
            double totalBytesPerFutureRow = rawBytesPerFutureRow + dictionaryBytesPerFutureRow + indexBytesPerFutureRow;

            // estimate how many rows until we hit a limit and flush the stripe if we convert this column to direct
            long rowsToDictionaryMemoryLimit = (long) ((dictionaryMemoryMaxBytesLow - currentDictionaryBytes) / dictionaryBytesPerFutureRow);
            long rowsToStripeMemoryLimit = (long) ((stripeMaxBytes - currentTotalBytes) / totalBytesPerFutureRow);
            long rowsToStripeRowLimit = stripeMaxRowCount - stripeRowCount;
            long rowsToLimit = Longs.min(rowsToDictionaryMemoryLimit, rowsToStripeMemoryLimit, rowsToStripeRowLimit);

            // predict the compression ratio at that limit if we were convert this column to direct
            long predictedUncompressedSizeAtLimit = totalNonDictionaryBytes + totalDictionaryRawBytes + (totalUncompressedBytesPerRow * rowsToLimit);
            long predictedCompressedSizeAtLimit = (long) (currentTotalBytes + (totalBytesPerFutureRow * rowsToLimit));
            double predictedCompressionRatioAtLimit = 1.0 * predictedUncompressedSizeAtLimit / predictedCompressedSizeAtLimit;

            // convert the column that creates the best compression ratio
            if (maxProjectedCompression == null || maxProjectedCompression.getPredictedFileCompressionRatio() < predictedCompressionRatioAtLimit) {
                maxProjectedCompression = new DictionaryCompressionProjection(column, predictedCompressionRatioAtLimit);
            }
        }
        return maxProjectedCompression;
    }

    private int getMaxDirectBytes(int bufferedBytes)
    {
        return toIntExact(Math.min(stripeMaxBytes, stripeMaxBytes - bufferedBytes + DIRECT_COLUMN_SIZE_RANGE.toBytes()));
    }

    public static int estimateIndexBytesPerValue(int dictionaryEntries)
    {
        // assume basic byte packing
        if (dictionaryEntries <= 256) {
            return 1;
        }
        if (dictionaryEntries <= 65_536) {
            return 2;
        }
        if (dictionaryEntries <= 16_777_216) {
            return 3;
        }
        return 4;
    }

    public interface DictionaryColumn
    {
        long getValueCount();

        long getNonNullValueCount();

        long getRawBytes();

        int getDictionaryEntries();

        int getDictionaryBytes();

        int getIndexBytes();

        OptionalInt tryConvertToDirect(int maxDirectBytes);

        long getBufferedBytes();
    }

    private static class DictionaryColumnManager
    {
        private final DictionaryColumn dictionaryColumn;
        private boolean directEncoded;

        private int rowCount;

        private long pastValueCount;
        private int pastDictionaryEntries;

        private long pendingPastValueCount;
        private int pendingPastDictionaryEntries;

        public DictionaryColumnManager(DictionaryColumn dictionaryColumn)
        {
            this.dictionaryColumn = dictionaryColumn;
        }

        OptionalInt tryConvertToDirect(int maxDirectBytes)
        {
            OptionalInt directBytes = dictionaryColumn.tryConvertToDirect(maxDirectBytes);
            if (directBytes.isPresent()) {
                directEncoded = true;
            }
            return directBytes;
        }

        void reset()
        {
            directEncoded = false;

            pastValueCount = 0;
            pastDictionaryEntries = 0;

            pendingPastValueCount = 0;
            pendingPastDictionaryEntries = 0;
        }

        public void updateHistory(int rowCount)
        {
            this.rowCount = rowCount;
            long currentValueCount = dictionaryColumn.getValueCount();
            if (currentValueCount - pendingPastValueCount >= 1024) {
                pastValueCount = pendingPastValueCount;
                pastDictionaryEntries = pendingPastDictionaryEntries;

                pendingPastValueCount = currentValueCount;
                pendingPastDictionaryEntries = dictionaryColumn.getDictionaryEntries();
            }
        }

        public long getRawBytes()
        {
            checkState(!directEncoded);
            return dictionaryColumn.getRawBytes();
        }

        public double getRawBytesPerRow()
        {
            checkState(!directEncoded);
            return 1.0 * getRawBytes() / rowCount;
        }

        public int getDictionaryBytes()
        {
            checkState(!directEncoded);
            return dictionaryColumn.getDictionaryBytes();
        }

        public double getDictionaryBytesPerFutureRow()
        {
            checkState(!directEncoded);

            int currentDictionaryEntries = dictionaryColumn.getDictionaryEntries();
            long currentValueCount = dictionaryColumn.getValueCount();

            // average size of a dictionary entry
            double dictionaryBytesPerEntry = 1.0 * dictionaryColumn.getDictionaryBytes() / currentDictionaryEntries;

            // average number of entries added per non-value over the "past" period
            double dictionaryEntriesPerFutureValue = 1.0 * (currentDictionaryEntries - pastDictionaryEntries) / (currentValueCount - pastValueCount);

            // Expected size comes down to: chance for a non-null row * chance for a unique dictionary value * average bytes per dictionary entry
            return dictionaryBytesPerEntry * dictionaryEntriesPerFutureValue;
        }

        public int getIndexBytes()
        {
            checkState(!directEncoded);
            return dictionaryColumn.getIndexBytes();
        }

        public double getIndexBytesPerRow()
        {
            checkState(!directEncoded);
            return 1.0 * getIndexBytes() / rowCount;
        }

        public double getCompressionRatio()
        {
            checkState(!directEncoded);
            return 1.0 * getRawBytes() / getBufferedBytes();
        }

        public long getBufferedBytes()
        {
            return dictionaryColumn.getBufferedBytes();
        }

        public boolean isDirectEncoded()
        {
            return directEncoded;
        }
    }

    private static class DictionaryCompressionProjection
    {
        private final DictionaryColumnManager columnToConvert;
        private final double predictedFileCompressionRatio;

        public DictionaryCompressionProjection(DictionaryColumnManager columnToConvert, double predictedFileCompressionRatio)
        {
            this.columnToConvert = requireNonNull(columnToConvert, "columnToConvert is null");
            this.predictedFileCompressionRatio = predictedFileCompressionRatio;
        }

        public DictionaryColumnManager getColumnToConvert()
        {
            return columnToConvert;
        }

        public double getPredictedFileCompressionRatio()
        {
            return predictedFileCompressionRatio;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy