org.dinky.shaded.paimon.utils.ZOrderByteUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of dinky-shaded-paimon Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/* This file is based on source code from the Iceberg Project (http://iceberg.apache.org/), licensed by the Apache
 * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
 * additional information regarding copyright ownership. */

package org.dinky.shaded.paimon.utils;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

/**
 * Within Z-Ordering the byte representations of objects being compared must be ordered, this
 * requires several types to be transformed when converted to bytes. The goal is to map object's
 * whose byte representation are not lexicographically ordered into representations that are
 * lexicographically ordered. Bytes produced should be compared lexicographically as unsigned bytes,
 * big-endian.
 *
 * All types except for String are stored within an 8 Byte Buffer
 *
 * 
Most of these techniques are derived from
 * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/
 */
public class ZOrderByteUtils {

    public static final int PRIMITIVE_BUFFER_SIZE = 8;
    public static final byte[] NULL_BYTES = new byte[PRIMITIVE_BUFFER_SIZE];

    private static final ThreadLocal ENCODER = new ThreadLocal<>();

    static {
        Arrays.fill(NULL_BYTES, (byte) 0x00);
    }

    private ZOrderByteUtils() {}

    static ByteBuffer allocatePrimitiveBuffer() {
        return ByteBuffer.allocate(PRIMITIVE_BUFFER_SIZE);
    }

    /**
     * Signed ints do not have their bytes in magnitude order because of the sign bit. To fix this,
     * flip the sign bit so that all negatives are ordered before positives. This essentially shifts
     * the 0 value so that we don't break our ordering when we cross the new 0 value.
     */
    public static ByteBuffer intToOrderedBytes(int val, ByteBuffer reuse) {
        ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE);
        bytes.putLong(((long) val) ^ 0x8000000000000000L);
        return bytes;
    }

    /**
     * Signed longs are treated the same as the signed ints in {@link #intToOrderedBytes(int,
     * ByteBuffer)}.
     */
    public static ByteBuffer longToOrderedBytes(long val, ByteBuffer reuse) {
        ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE);
        bytes.putLong(val ^ 0x8000000000000000L);
        return bytes;
    }

    /**
     * Signed shorts are treated the same as the signed ints in {@link #intToOrderedBytes(int,
     * ByteBuffer)}.
     */
    public static ByteBuffer shortToOrderedBytes(short val, ByteBuffer reuse) {
        ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE);
        bytes.putLong(((long) val) ^ 0x8000000000000000L);
        return bytes;
    }

    /**
     * Signed tiny ints are treated the same as the signed ints in {@link #intToOrderedBytes(int,
     * ByteBuffer)}.
     */
    public static ByteBuffer tinyintToOrderedBytes(byte val, ByteBuffer reuse) {
        ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE);
        bytes.putLong(((long) val) ^ 0x8000000000000000L);
        return bytes;
    }

    /**
     * IEEE 754 : “If two floating-point numbers in the same format are ordered (say, x {@literal <}
     * y), they are ordered the same way when their bits are reinterpreted as sign-magnitude
     * integers.”
     *
     * Which means floats can be treated as sign magnitude integers which can then be converted
     * into lexicographically comparable bytes.
     */
    public static ByteBuffer floatToOrderedBytes(float val, ByteBuffer reuse) {
        ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE);
        long lval = Double.doubleToLongBits(val);
        lval ^= ((lval >> (Integer.SIZE - 1)) | Long.MIN_VALUE);
        bytes.putLong(lval);
        return bytes;
    }

    /**
     * Doubles are treated the same as floats in {@link #floatToOrderedBytes(float, ByteBuffer)}.
     */
    public static ByteBuffer doubleToOrderedBytes(double val, ByteBuffer reuse) {
        ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE);
        long lval = Double.doubleToLongBits(val);
        lval ^= ((lval >> (Integer.SIZE - 1)) | Long.MIN_VALUE);
        bytes.putLong(lval);
        return bytes;
    }

    /**
     * Strings are lexicographically sortable BUT if different byte array lengths will ruin the
     * Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every
     * time). This implementation just uses a set size to for all output byte representations.
     * Truncating longer strings and right padding 0 for shorter strings.
     */
    @SuppressWarnings("ByteBufferBackingArray")
    public static ByteBuffer stringToOrderedBytes(String val, int length, ByteBuffer reuse) {
        CharsetEncoder encoder = ENCODER.get();
        if (encoder == null) {
            encoder = StandardCharsets.UTF_8.newEncoder();
            ENCODER.set(encoder);
        }

        ByteBuffer bytes = reuse(reuse, length);
        Arrays.fill(bytes.array(), 0, length, (byte) 0x00);
        if (val != null) {
            CharBuffer inputBuffer = CharBuffer.wrap(val);
            encoder.encode(inputBuffer, bytes, true);
        }
        return bytes;
    }

    /**
     * Return a bytebuffer with the given bytes truncated to length, or filled with 0's to length
     * depending on whether the given bytes are larger or smaller than the given length.
     */
    @SuppressWarnings("ByteBufferBackingArray")
    public static ByteBuffer byteTruncateOrFill(byte[] val, int length, ByteBuffer reuse) {
        ByteBuffer bytes = reuse(reuse, length);
        if (val.length < length) {
            bytes.put(val, 0, val.length);
            Arrays.fill(bytes.array(), val.length, length, (byte) 0x00);
        } else {
            bytes.put(val, 0, length);
        }
        return bytes;
    }

    public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize) {
        return interleaveBits(columnsBinary, interleavedSize, ByteBuffer.allocate(interleavedSize));
    }

    /**
     * Interleave bits using a naive loop. Variable length inputs are allowed but to get a
     * consistent ordering it is required that every column contribute the same number of bytes in
     * each invocation. Bits are interleaved from all columns that have a bit available at that
     * position. Once a Column has no more bits to produce it is skipped in the interleaving.
     *
     * @param columnsBinary an array of ordered byte representations of the columns being ZOrdered
     * @param interleavedSize the number of bytes to use in the output
     * @return the columnbytes interleaved
     */
    // NarrowingCompoundAssignment is intended here. See
    // https://github.com/apache/iceberg/pull/5200#issuecomment-1176226163
    @SuppressWarnings({"ByteBufferBackingArray", "NarrowingCompoundAssignment"})
    public static byte[] interleaveBits(
            byte[][] columnsBinary, int interleavedSize, ByteBuffer reuse) {
        byte[] interleavedBytes = reuse.array();
        Arrays.fill(interleavedBytes, 0, interleavedSize, (byte) 0x00);

        int sourceColumn = 0;
        int sourceByte = 0;
        int sourceBit = 7;
        int interleaveByte = 0;
        int interleaveBit = 7;

        while (interleaveByte < interleavedSize) {
            // Take the source bit from source byte and move it to the output bit position
            interleavedBytes[interleaveByte] |=
                    (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit)
                            >>> sourceBit
                            << interleaveBit;
            --interleaveBit;

            // Check if an output byte has been completed
            if (interleaveBit == -1) {
                // Move to the next output byte
                interleaveByte++;
                // Move to the highest order bit of the new output byte
                interleaveBit = 7;
            }

            // Check if the last output byte has been completed
            if (interleaveByte == interleavedSize) {
                break;
            }

            // Find the next source bit to interleave
            do {
                // Move to next column
                ++sourceColumn;
                if (sourceColumn == columnsBinary.length) {
                    // If the last source column was used, reset to next bit of first column
                    sourceColumn = 0;
                    --sourceBit;
                    if (sourceBit == -1) {
                        // If the last bit of the source byte was used, reset to the highest bit of
                        // the next
                        // byte
                        sourceByte++;
                        sourceBit = 7;
                    }
                }
            } while (columnsBinary[sourceColumn].length <= sourceByte);
        }
        return interleavedBytes;
    }

    public static ByteBuffer reuse(ByteBuffer reuse, int length) {
        reuse.position(0);
        reuse.limit(length);
        return reuse;
    }
}