io.trino.parquet.ParquetReaderUtils Maven / Gradle / Ivy
Show all versions of trino-parquet Show documentation
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.parquet;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import io.airlift.slice.Slice;
import io.trino.parquet.metadata.ColumnChunkMetadata;
import io.trino.parquet.reader.SimpleSliceInputStream;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.EncodingStats;
import java.util.Set;
import static com.google.common.base.Verify.verify;
import static java.lang.String.format;
import static org.apache.parquet.column.Encoding.BIT_PACKED;
import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY;
import static org.apache.parquet.column.Encoding.RLE;
public final class ParquetReaderUtils
{
private ParquetReaderUtils() {}
public static ByteBufferInputStream toInputStream(Slice slice)
{
return ByteBufferInputStream.wrap(slice.toByteBuffer());
}
public static ByteBufferInputStream toInputStream(DictionaryPage page)
{
return toInputStream(page.getSlice());
}
/**
* Reads an integer formatted in ULEB128 variable-width format described in
* ...
*/
public static int readUleb128Int(SimpleSliceInputStream input)
{
byte[] inputBytes = input.getByteArray();
int offset = input.getByteArrayOffset();
// Manual loop unrolling shows improvements in BenchmarkReadUleb128Int
int inputByte = inputBytes[offset];
int value = inputByte & 0x7F;
if ((inputByte & 0x80) == 0) {
input.skip(1);
return value;
}
inputByte = inputBytes[offset + 1];
value |= (inputByte & 0x7F) << 7;
if ((inputByte & 0x80) == 0) {
input.skip(2);
return value;
}
inputByte = inputBytes[offset + 2];
value |= (inputByte & 0x7F) << 14;
if ((inputByte & 0x80) == 0) {
input.skip(3);
return value;
}
inputByte = inputBytes[offset + 3];
value |= (inputByte & 0x7F) << 21;
if ((inputByte & 0x80) == 0) {
input.skip(4);
return value;
}
inputByte = inputBytes[offset + 4];
verify((inputByte & 0x80) == 0, "ULEB128 variable-width integer should not be longer than 5 bytes");
input.skip(5);
return value | inputByte << 28;
}
public static long readUleb128Long(SimpleSliceInputStream input)
{
byte[] inputBytes = input.getByteArray();
int offset = input.getByteArrayOffset();
// Manual loop unrolling shows improvements in BenchmarkReadUleb128Long
long inputByte = inputBytes[offset];
long value = inputByte & 0x7F;
if ((inputByte & 0x80) == 0) {
input.skip(1);
return value;
}
inputByte = inputBytes[offset + 1];
value |= (inputByte & 0x7F) << 7;
if ((inputByte & 0x80) == 0) {
input.skip(2);
return value;
}
inputByte = inputBytes[offset + 2];
value |= (inputByte & 0x7F) << 14;
if ((inputByte & 0x80) == 0) {
input.skip(3);
return value;
}
inputByte = inputBytes[offset + 3];
value |= (inputByte & 0x7F) << 21;
if ((inputByte & 0x80) == 0) {
input.skip(4);
return value;
}
inputByte = inputBytes[offset + 4];
value |= (inputByte & 0x7F) << 28;
if ((inputByte & 0x80) == 0) {
input.skip(5);
return value;
}
inputByte = inputBytes[offset + 5];
value |= (inputByte & 0x7F) << 35;
if ((inputByte & 0x80) == 0) {
input.skip(6);
return value;
}
inputByte = inputBytes[offset + 6];
value |= (inputByte & 0x7F) << 42;
if ((inputByte & 0x80) == 0) {
input.skip(7);
return value;
}
inputByte = inputBytes[offset + 7];
value |= (inputByte & 0x7F) << 49;
if ((inputByte & 0x80) == 0) {
input.skip(8);
return value;
}
inputByte = inputBytes[offset + 8];
value |= (inputByte & 0x7F) << 56;
if ((inputByte & 0x80) == 0) {
input.skip(9);
return value;
}
inputByte = inputBytes[offset + 9];
verify((inputByte & 0x80) == 0, "ULEB128 variable-width long should not be longer than 10 bytes");
input.skip(10);
return value | inputByte << 63;
}
public static int readFixedWidthInt(SimpleSliceInputStream input, int bytesWidth)
{
return switch (bytesWidth) {
case 0 -> 0;
case 1 -> input.readByte() & 0xFF;
case 2 -> input.readShort() & 0xFFFF;
case 3 -> {
int value = input.readShort() & 0xFFFF;
yield ((input.readByte() & 0xFF) << 16) | value;
}
case 4 -> input.readInt();
default -> throw new IllegalArgumentException(format("Encountered bytesWidth (%d) that requires more than 4 bytes", bytesWidth));
};
}
/**
* For storing signed values (not the deltas themselves) in DELTA_BINARY_PACKED encoding, zigzag encoding
* (...)
* is used to map negative values to positive ones and then apply ULEB128 on the result.
*/
public static long zigzagDecode(long value)
{
return (value >>> 1) ^ -(value & 1);
}
/**
* Returns the result of arguments division rounded up.
*
* Works only for positive numbers.
* The sum of dividend and divisor cannot exceed Integer.MAX_VALUE
*/
public static int ceilDiv(int dividend, int divisor)
{
return (dividend + divisor - 1) / divisor;
}
/**
* Propagate the sign bit in values that are shorter than 8 bytes.
*
* When the value of less than 8 bytes in put into a long variable, the padding bytes on the
* left side of the number should be all zeros for a positive number or all ones for negatives.
* This method does this padding using signed bit shift operator without branches.
*
* @param value Value to trim
* @param bitsToPad Number of bits to pad
* @return Value with correct padding
*/
public static long propagateSignBit(long value, int bitsToPad)
{
return value << bitsToPad >> bitsToPad;
}
/**
* Method simulates a cast from boolean to byte value. Despite using
* a ternary (?) operator, the just-in-time compiler usually figures out
* that this is a cast and turns that into a no-op.
*
* Method may be used to avoid branches that may be CPU costly due to
* branch misprediction.
* The following code:
*
* boolean[] flags = ...
* int sum = 0;
* for (int i = 0; i < length; i++){
* if (flags[i])
* sum++;
* }
*
* will perform better when rewritten to
*
* boolean[] flags = ...
* int sum = 0;
* for (int i = 0; i < length; i++){
* sum += castToByte(flags[i]);
* }
*
*/
public static byte castToByte(boolean value)
{
return (byte) (value ? 1 : 0);
}
/**
* Works the same as {@link io.trino.parquet.ParquetReaderUtils#castToByte(boolean)} and negates the boolean value
*/
public static byte castToByteNegate(boolean value)
{
return (byte) (value ? 0 : 1);
}
public static short toShortExact(long value)
{
if ((short) value != value) {
throw new ArithmeticException("short overflow");
}
return (short) value;
}
public static short toShortExact(int value)
{
if ((short) value != value) {
throw new ArithmeticException(format("Value %d exceeds short range", value));
}
return (short) value;
}
public static byte toByteExact(long value)
{
if ((byte) value != value) {
throw new ArithmeticException("byte overflow");
}
return (byte) value;
}
public static byte toByteExact(int value)
{
if ((byte) value != value) {
throw new ArithmeticException(format("Value %d exceeds byte range", value));
}
return (byte) value;
}
@SuppressWarnings("deprecation")
public static boolean isOnlyDictionaryEncodingPages(ColumnChunkMetadata columnMetaData)
{
// Files written with newer versions of Parquet libraries (e.g. parquet-mr 1.9.0) will have EncodingStats available
// Otherwise, fallback to v1 logic
EncodingStats stats = columnMetaData.getEncodingStats();
if (stats != null) {
return stats.hasDictionaryPages() && !stats.hasNonDictionaryEncodedPages();
}
Set encodings = columnMetaData.getEncodings();
if (encodings.contains(PLAIN_DICTIONARY)) {
// PLAIN_DICTIONARY was present, which means at least one page was
// dictionary-encoded and 1.0 encodings are used
// The only other allowed encodings are RLE and BIT_PACKED which are used for repetition or definition levels
return Sets.difference(encodings, ImmutableSet.of(PLAIN_DICTIONARY, RLE, BIT_PACKED)).isEmpty();
}
return false;
}
}