All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.trino.orc.writer.DictionaryBuilder Maven / Gradle / Ivy

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.orc.writer;

import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceOutput;
import io.airlift.slice.XxHash64;
import io.trino.array.IntBigArray;
import io.trino.spi.block.VariableWidthBlock;

import java.util.Arrays;
import java.util.Optional;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Verify.verify;
import static io.airlift.slice.SizeOf.instanceSize;
import static io.airlift.slice.SizeOf.sizeOf;
import static io.trino.spi.block.PageBuilderStatus.DEFAULT_MAX_PAGE_SIZE_IN_BYTES;
import static it.unimi.dsi.fastutil.HashCommon.arraySize;
import static java.lang.Math.min;
import static java.util.Objects.requireNonNull;

/**
 * Builds a dictionary of distinct variable-width values taken from
 * {@link VariableWidthBlock}s.
 * <p>
 * Value bytes are appended to {@code sliceOutput}; {@code offsets[i]} and
 * {@code offsets[i + 1]} delimit the bytes of dictionary entry {@code i}.
 * Entry {@link #NULL_POSITION} (0) is reserved for null, so the first real
 * value is stored at position 1 and {@code entryCount} starts at 1.
 * <p>
 * Lookups go through {@code blockPositionByHash}, an open-addressing hash
 * table (power-of-two size, probing by {@code +1} under {@code hashMask})
 * that maps a hash slot to a dictionary position, or {@link #EMPTY_SLOT}.
 */
public class DictionaryBuilder
{
    private static final int INSTANCE_SIZE = instanceSize(DictionaryBuilder.class);

    // See jdk.internal.util.ArraysSupport.SOFT_MAX_ARRAY_LENGTH for an explanation
    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;

    private static final float FILL_RATIO = 0.75f;
    private static final int EMPTY_SLOT = -1;
    private static final int NULL_POSITION = 0;
    private static final int EXPECTED_BYTES_PER_ENTRY = 32;

    // Smallest size used for the hash table calculation. The reserved null entry
    // means the first insert writes offsets[2], which requires maxFill >= 2;
    // arraySize(2, 0.75f) yields a table of 4 slots and a maxFill of 3.
    private static final int MIN_HASH_EXPECTED_SIZE = 2;

    private final IntBigArray blockPositionByHash = new IntBigArray();

    // number of dictionary entries, including the reserved null entry at position 0
    private int entryCount = 1;
    private SliceOutput sliceOutput;
    private int[] offsets;

    // rehash is triggered once entryCount reaches this value
    private int maxFill;
    private int hashMask;

    /**
     * Creates an empty dictionary sized for roughly {@code expectedSize} entries.
     *
     * @param expectedSize expected number of distinct values; must not be negative
     * @throws IllegalArgumentException if {@code expectedSize} is negative
     */
    public DictionaryBuilder(int expectedSize)
    {
        checkArgument(expectedSize >= 0, "expectedSize must not be negative");

        // todo we can do better
        int expectedEntries = min(expectedSize, DEFAULT_MAX_PAGE_SIZE_IN_BYTES / EXPECTED_BYTES_PER_ENTRY);
        // it is guaranteed expectedEntries * EXPECTED_BYTES_PER_ENTRY will not overflow
        int expectedBytes = expectedEntries * EXPECTED_BYTES_PER_ENTRY;
        sliceOutput = new DynamicSliceOutput(min(expectedBytes, MAX_ARRAY_SIZE));

        // Clamp tiny sizes: with expectedSize of 0 or 1 the table would have 2 slots and
        // maxFill 1, leaving offsets with only 2 elements, and the very first insert
        // (which writes offsets[2]) would fail with ArrayIndexOutOfBoundsException.
        int hashSize = arraySize(Math.max(expectedSize, MIN_HASH_EXPECTED_SIZE), FILL_RATIO);
        this.maxFill = calculateMaxFill(hashSize);
        this.hashMask = hashSize - 1;

        this.offsets = new int[maxFill + 1];

        blockPositionByHash.ensureCapacity(hashSize);
        blockPositionByHash.fill(EMPTY_SLOT);
    }

    /**
     * Returns the number of value bytes written so far plus the size of the offsets array.
     */
    public long getSizeInBytes()
    {
        return sliceOutput.size() + sizeOf(offsets);
    }

    /**
     * Returns the retained size of this builder: the instance itself, the value
     * buffer, the offsets array and the hash table.
     */
    public long getRetainedSizeInBytes()
    {
        return INSTANCE_SIZE +
                sliceOutput.getRetainedSize() +
                sizeOf(offsets) +
                blockPositionByHash.sizeOf();
    }

    /**
     * Returns the dictionary as a block with {@link #getEntryCount()} positions,
     * where position {@code NULL_POSITION} is marked null.
     * <p>
     * The builder's current offsets array is handed to the block as-is.
     */
    public VariableWidthBlock getElementBlock()
    {
        boolean[] isNull = new boolean[entryCount];
        isNull[NULL_POSITION] = true;
        return new VariableWidthBlock(entryCount, sliceOutput.slice(), offsets, Optional.of(isNull));
    }

    /**
     * Removes all entries except the reserved null entry. The hash table and offsets
     * array keep their current capacity; the value buffer is replaced by a new one
     * sized at 1.25x the bytes previously written (capped at {@code MAX_ARRAY_SIZE}).
     */
    public void clear()
    {
        blockPositionByHash.fill(EMPTY_SLOT);

        int initialSize = min((int) (sliceOutput.size() * 1.25), MAX_ARRAY_SIZE);
        sliceOutput = new DynamicSliceOutput(initialSize);
        entryCount = 1;
        Arrays.fill(offsets, 0);
    }

    /**
     * Returns the dictionary position of the value at {@code position} of {@code block},
     * adding the value to the dictionary first if it is not already present.
     *
     * @param block block holding the value; must not be null
     * @param position position of the value within {@code block}
     * @return {@code NULL_POSITION} if the value is null, otherwise the (non-zero)
     *         dictionary position of the value
     */
    public int putIfAbsent(VariableWidthBlock block, int position)
    {
        requireNonNull(block, "block must not be null");

        if (block.isNull(position)) {
            return NULL_POSITION;
        }

        long hashPosition = getHashPositionOfElement(block, position);
        int blockPosition = blockPositionByHash.get(hashPosition);
        if (blockPosition == EMPTY_SLOT) {
            blockPosition = addNewElement(hashPosition, block, position);
        }
        verify(blockPosition != NULL_POSITION);
        return blockPosition;
    }

    /**
     * Returns the number of dictionary entries, including the reserved null entry.
     */
    public int getEntryCount()
    {
        return entryCount;
    }

    /**
     * Get slot position of the element at {@code position} of {@code block}.
     * <p>
     * The returned slot either already refers to an equal value, or is the empty
     * slot where the value would be inserted.
     */
    private long getHashPositionOfElement(VariableWidthBlock block, int position)
    {
        checkArgument(!block.isNull(position), "position is null");
        Slice rawSlice = block.getRawSlice();
        int rawSliceOffset = block.getRawSliceOffset(position);
        int length = block.getSliceLength(position);

        long hashPosition = getMaskedHash(XxHash64.hash(rawSlice, rawSliceOffset, length));
        while (true) {
            int entryPosition = blockPositionByHash.get(hashPosition);
            if (entryPosition == EMPTY_SLOT) {
                // Doesn't have this element
                return hashPosition;
            }
            int entryOffset = offsets[entryPosition];
            int entryLength = offsets[entryPosition + 1] - entryOffset;
            if (rawSlice.equals(rawSliceOffset, length, sliceOutput.getUnderlyingSlice(), entryOffset, entryLength)) {
                // Already has this element
                return hashPosition;
            }

            // slot is taken by a different value; probe the next slot (wrapping via the mask)
            hashPosition = getMaskedHash(hashPosition + 1);
        }
    }

    /**
     * Appends the value at {@code position} of {@code block} to the dictionary and
     * records it in the hash table at {@code hashPosition}.
     *
     * @return the dictionary position assigned to the new value
     */
    private int addNewElement(long hashPosition, VariableWidthBlock block, int position)
    {
        checkArgument(!block.isNull(position), "position is null");

        int newElementPositionInBlock = entryCount;

        sliceOutput.writeBytes(block.getRawSlice(), block.getRawSliceOffset(position), block.getSliceLength(position));
        entryCount++;
        // end offset of the new entry; its start offset is the previous entry's end
        offsets[entryCount] = sliceOutput.size();

        blockPositionByHash.set(hashPosition, newElementPositionInBlock);

        // increase capacity, if necessary
        if (entryCount >= maxFill) {
            rehash(maxFill * 2);
        }

        return newElementPositionInBlock;
    }

    /**
     * Grows the hash table and offsets array to hold at least {@code size} entries
     * and reinserts every existing non-null entry into the new table.
     */
    private void rehash(int size)
    {
        int newHashSize = arraySize(size + 1, FILL_RATIO);
        hashMask = newHashSize - 1;
        maxFill = calculateMaxFill(newHashSize);

        // offsets are not changed during rehashing, but we grow them to hold the maxFill
        offsets = Arrays.copyOf(offsets, maxFill + 1);

        blockPositionByHash.ensureCapacity(newHashSize);
        blockPositionByHash.fill(EMPTY_SLOT);

        // the first element of elementBlock is always null
        for (int entryPosition = 1; entryPosition < entryCount; entryPosition++) {
            int entryOffset = offsets[entryPosition];
            int entryLength = offsets[entryPosition + 1] - entryOffset;
            long entryHashCode = XxHash64.hash(sliceOutput.getUnderlyingSlice(), entryOffset, entryLength);

            // values are already distinct, so just find the first empty slot
            long hashPosition = getMaskedHash(entryHashCode);
            while (true) {
                int hashEntryIndex = blockPositionByHash.get(hashPosition);
                if (hashEntryIndex == EMPTY_SLOT) {
                    blockPositionByHash.set(hashPosition, entryPosition);
                    break;
                }

                hashPosition = getMaskedHash(hashPosition + 1);
            }
        }
    }

    /**
     * Returns the entry count at which a table of {@code hashSize} slots is rehashed:
     * {@code ceil(hashSize * FILL_RATIO)}, reduced by one if that would equal
     * {@code hashSize} so that at least one slot always stays empty.
     */
    private static int calculateMaxFill(int hashSize)
    {
        int maxFill = (int) Math.ceil(hashSize * FILL_RATIO);
        if (maxFill == hashSize) {
            maxFill--;
        }
        return maxFill;
    }

    /**
     * Reduces {@code rawHash} to a slot index by masking it with {@code hashMask}.
     */
    private long getMaskedHash(long rawHash)
    {
        return rawHash & hashMask;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy