All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.dinky.shaded.paimon.utils.ZOrderByteUtils Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/* This file is based on source code from the Iceberg Project (http://iceberg.apache.org/), licensed by the Apache
 * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
 * additional information regarding copyright ownership. */

package org.dinky.shaded.paimon.utils;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

/**
 * Within Z-Ordering the byte representations of objects being compared must be ordered, this
 * requires several types to be transformed when converted to bytes. The goal is to map object's
 * whose byte representation are not lexicographically ordered into representations that are
 * lexicographically ordered. Bytes produced should be compared lexicographically as unsigned bytes,
 * big-endian.
 *
 * 

All types except for String are stored within an 8 Byte Buffer * *

Most of these techniques are derived from * https://aws.amazon.com/blogs/database/z-order-indexing-for-multifaceted-queries-in-amazon-dynamodb-part-2/ */ public class ZOrderByteUtils { public static final int PRIMITIVE_BUFFER_SIZE = 8; public static final byte[] NULL_BYTES = new byte[PRIMITIVE_BUFFER_SIZE]; private static final ThreadLocal ENCODER = new ThreadLocal<>(); static { Arrays.fill(NULL_BYTES, (byte) 0x00); } private ZOrderByteUtils() {} static ByteBuffer allocatePrimitiveBuffer() { return ByteBuffer.allocate(PRIMITIVE_BUFFER_SIZE); } /** * Signed ints do not have their bytes in magnitude order because of the sign bit. To fix this, * flip the sign bit so that all negatives are ordered before positives. This essentially shifts * the 0 value so that we don't break our ordering when we cross the new 0 value. */ public static ByteBuffer intToOrderedBytes(int val, ByteBuffer reuse) { ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE); bytes.putLong(((long) val) ^ 0x8000000000000000L); return bytes; } /** * Signed longs are treated the same as the signed ints in {@link #intToOrderedBytes(int, * ByteBuffer)}. */ public static ByteBuffer longToOrderedBytes(long val, ByteBuffer reuse) { ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE); bytes.putLong(val ^ 0x8000000000000000L); return bytes; } /** * Signed shorts are treated the same as the signed ints in {@link #intToOrderedBytes(int, * ByteBuffer)}. */ public static ByteBuffer shortToOrderedBytes(short val, ByteBuffer reuse) { ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE); bytes.putLong(((long) val) ^ 0x8000000000000000L); return bytes; } /** * Signed tiny ints are treated the same as the signed ints in {@link #intToOrderedBytes(int, * ByteBuffer)}. */ public static ByteBuffer tinyintToOrderedBytes(byte val, ByteBuffer reuse) { ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE); bytes.putLong(((long) val) ^ 0x8000000000000000L); return bytes; } /** * IEEE 754 : “If two floating-point numbers in the same format are ordered (say, x {@literal <} * y), they are ordered the same way when their bits are reinterpreted as sign-magnitude * integers.” * *

Which means floats can be treated as sign magnitude integers which can then be converted * into lexicographically comparable bytes. */ public static ByteBuffer floatToOrderedBytes(float val, ByteBuffer reuse) { ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE); long lval = Double.doubleToLongBits(val); lval ^= ((lval >> (Integer.SIZE - 1)) | Long.MIN_VALUE); bytes.putLong(lval); return bytes; } /** * Doubles are treated the same as floats in {@link #floatToOrderedBytes(float, ByteBuffer)}. */ public static ByteBuffer doubleToOrderedBytes(double val, ByteBuffer reuse) { ByteBuffer bytes = reuse(reuse, PRIMITIVE_BUFFER_SIZE); long lval = Double.doubleToLongBits(val); lval ^= ((lval >> (Integer.SIZE - 1)) | Long.MIN_VALUE); bytes.putLong(lval); return bytes; } /** * Strings are lexicographically sortable BUT if different byte array lengths will ruin the * Z-Ordering. (ZOrder requires that a given column contribute the same number of bytes every * time). This implementation just uses a set size to for all output byte representations. * Truncating longer strings and right padding 0 for shorter strings. */ @SuppressWarnings("ByteBufferBackingArray") public static ByteBuffer stringToOrderedBytes(String val, int length, ByteBuffer reuse) { CharsetEncoder encoder = ENCODER.get(); if (encoder == null) { encoder = StandardCharsets.UTF_8.newEncoder(); ENCODER.set(encoder); } ByteBuffer bytes = reuse(reuse, length); Arrays.fill(bytes.array(), 0, length, (byte) 0x00); if (val != null) { CharBuffer inputBuffer = CharBuffer.wrap(val); encoder.encode(inputBuffer, bytes, true); } return bytes; } /** * Return a bytebuffer with the given bytes truncated to length, or filled with 0's to length * depending on whether the given bytes are larger or smaller than the given length. */ @SuppressWarnings("ByteBufferBackingArray") public static ByteBuffer byteTruncateOrFill(byte[] val, int length, ByteBuffer reuse) { ByteBuffer bytes = reuse(reuse, length); if (val.length < length) { bytes.put(val, 0, val.length); Arrays.fill(bytes.array(), val.length, length, (byte) 0x00); } else { bytes.put(val, 0, length); } return bytes; } public static byte[] interleaveBits(byte[][] columnsBinary, int interleavedSize) { return interleaveBits(columnsBinary, interleavedSize, ByteBuffer.allocate(interleavedSize)); } /** * Interleave bits using a naive loop. Variable length inputs are allowed but to get a * consistent ordering it is required that every column contribute the same number of bytes in * each invocation. Bits are interleaved from all columns that have a bit available at that * position. Once a Column has no more bits to produce it is skipped in the interleaving. * * @param columnsBinary an array of ordered byte representations of the columns being ZOrdered * @param interleavedSize the number of bytes to use in the output * @return the columnbytes interleaved */ // NarrowingCompoundAssignment is intended here. See // https://github.com/apache/iceberg/pull/5200#issuecomment-1176226163 @SuppressWarnings({"ByteBufferBackingArray", "NarrowingCompoundAssignment"}) public static byte[] interleaveBits( byte[][] columnsBinary, int interleavedSize, ByteBuffer reuse) { byte[] interleavedBytes = reuse.array(); Arrays.fill(interleavedBytes, 0, interleavedSize, (byte) 0x00); int sourceColumn = 0; int sourceByte = 0; int sourceBit = 7; int interleaveByte = 0; int interleaveBit = 7; while (interleaveByte < interleavedSize) { // Take the source bit from source byte and move it to the output bit position interleavedBytes[interleaveByte] |= (columnsBinary[sourceColumn][sourceByte] & 1 << sourceBit) >>> sourceBit << interleaveBit; --interleaveBit; // Check if an output byte has been completed if (interleaveBit == -1) { // Move to the next output byte interleaveByte++; // Move to the highest order bit of the new output byte interleaveBit = 7; } // Check if the last output byte has been completed if (interleaveByte == interleavedSize) { break; } // Find the next source bit to interleave do { // Move to next column ++sourceColumn; if (sourceColumn == columnsBinary.length) { // If the last source column was used, reset to next bit of first column sourceColumn = 0; --sourceBit; if (sourceBit == -1) { // If the last bit of the source byte was used, reset to the highest bit of // the next // byte sourceByte++; sourceBit = 7; } } } while (columnsBinary[sourceColumn].length <= sourceByte); } return interleavedBytes; } public static ByteBuffer reuse(ByteBuffer reuse, int length) { reuse.position(0); reuse.limit(length); return reuse; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy