All Downloads are FREE. Search and download functionalities are using the official Maven repository.

parquet.column.values.delta.DeltaBinaryPackingValuesWriter Maven / Gradle / Ivy

/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.column.values.delta;

import parquet.bytes.BytesInput;
import parquet.bytes.BytesUtils;
import parquet.bytes.CapacityByteArrayOutputStream;
import parquet.column.Encoding;
import parquet.column.values.ValuesWriter;
import parquet.column.values.bitpacking.BytePacker;
import parquet.column.values.bitpacking.Packer;
import parquet.io.ParquetEncodingException;

import java.io.IOException;

/**
 * Write integers with delta encoding and binary packing
 * The format is as follows:
 * 

*

 *   {@code
 *     delta-binary-packing:  *
 *     page-header :=    
 *     block :=   
 *
 *     min delta : zig-zag var int encoded
 *     bitWidthsOfMiniBlock : 1 byte little endian
 *     blockSizeInValues,blockSizeInValues,totalValueCount,firstValue : unsigned varint
 *   }
 * 
* * The algorithm and format is inspired by D. Lemire's paper: http://lemire.me/blog/archives/2012/09/12/fast-integer-compression-decoding-billions-of-integers-per-second/ * * @author Tianshuo Deng */ public class DeltaBinaryPackingValuesWriter extends ValuesWriter { /** * max bitwidth for a mini block, it is used to allocate miniBlockByteBuffer which is * reused between flushes. */ public static final int MAX_BITWIDTH = 32; public static final int DEFAULT_NUM_BLOCK_VALUES = 128; public static final int DEFAULT_NUM_MINIBLOCKS = 4; private final CapacityByteArrayOutputStream baos; /** * stores blockSizeInValues, miniBlockNumInABlock and miniBlockSizeInValues */ private final DeltaBinaryPackingConfig config; /** * bit width for each mini block, reused between flushes */ private final int[] bitWidths; private int totalValueCount = 0; /** * a pointer to deltaBlockBuffer indicating the end of deltaBlockBuffer * the number of values in the deltaBlockBuffer that haven't flushed to baos * it will be reset after each flush */ private int deltaValuesToFlush = 0; /** * stores delta values starting from the 2nd value written(1st value is stored in header). * It's reused between flushes */ private int[] deltaBlockBuffer; /** * bytes buffer for a mini block, it is reused for each mini block. * Therefore the size of biggest miniblock with bitwith of MAX_BITWITH is allocated */ private byte[] miniBlockByteBuffer; /** * firstValue is written to the header of the page */ private int firstValue = 0; /** * cache previous written value for calculating delta */ private int previousValue = 0; /** * min delta is written to the beginning of each block. * it's zig-zag encoded. The deltas stored in each block is actually the difference to min delta, * therefore are all positive * it will be reset after each flush */ private int minDeltaInCurrentBlock = Integer.MAX_VALUE; public DeltaBinaryPackingValuesWriter(int slabSize) { this(DEFAULT_NUM_BLOCK_VALUES, DEFAULT_NUM_MINIBLOCKS, slabSize); } public DeltaBinaryPackingValuesWriter(int blockSizeInValues, int miniBlockNum, int slabSize) { this.config = new DeltaBinaryPackingConfig(blockSizeInValues, miniBlockNum); bitWidths = new int[config.miniBlockNumInABlock]; deltaBlockBuffer = new int[blockSizeInValues]; miniBlockByteBuffer = new byte[config.miniBlockSizeInValues * MAX_BITWIDTH]; baos = new CapacityByteArrayOutputStream(slabSize); } @Override public long getBufferedSize() { return baos.size(); } @Override public void writeInteger(int v) { totalValueCount++; if (totalValueCount == 1) { firstValue = v; previousValue = firstValue; return; } int delta = v - previousValue;//calculate delta previousValue = v; deltaBlockBuffer[deltaValuesToFlush++] = delta; if (delta < minDeltaInCurrentBlock) { minDeltaInCurrentBlock = delta; } if (config.blockSizeInValues == deltaValuesToFlush) { flushBlockBuffer(); } } private void flushBlockBuffer() { //since we store the min delta, the deltas will be converted to be the difference to min delta and all positive for (int i = 0; i < deltaValuesToFlush; i++) { deltaBlockBuffer[i] = deltaBlockBuffer[i] - minDeltaInCurrentBlock; } writeMinDelta(); int miniBlocksToFlush = getMiniBlockCountToFlush(deltaValuesToFlush); calculateBitWidthsForDeltaBlockBuffer(miniBlocksToFlush); for (int i = 0; i < config.miniBlockNumInABlock; i++) { writeBitWidthForMiniBlock(i); } for (int i = 0; i < miniBlocksToFlush; i++) { //writing i th miniblock int currentBitWidth = bitWidths[i]; BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(currentBitWidth); int miniBlockStart = i * config.miniBlockSizeInValues; for (int j = miniBlockStart; j < (i + 1) * config.miniBlockSizeInValues; j += 8) {//8 values per pack // mini block is atomic in terms of flushing // This may write more values when reach to the end of data writing to last mini block, // since it may not be aligend to miniblock, // but doesnt matter. The reader uses total count to see if reached the end. packer.pack8Values(deltaBlockBuffer, j, miniBlockByteBuffer, 0); baos.write(miniBlockByteBuffer, 0, currentBitWidth); } } minDeltaInCurrentBlock = Integer.MAX_VALUE; deltaValuesToFlush = 0; } private void writeBitWidthForMiniBlock(int i) { try { BytesUtils.writeIntLittleEndianOnOneByte(baos, bitWidths[i]); } catch (IOException e) { throw new ParquetEncodingException("can not write bitwith for miniblock", e); } } private void writeMinDelta() { try { BytesUtils.writeZigZagVarInt(minDeltaInCurrentBlock, baos); } catch (IOException e) { throw new ParquetEncodingException("can not write min delta for block", e); } } /** * iterate through values in each mini block and calculate the bitWidths of max values. * * @param miniBlocksToFlush */ private void calculateBitWidthsForDeltaBlockBuffer(int miniBlocksToFlush) { for (int miniBlockIndex = 0; miniBlockIndex < miniBlocksToFlush; miniBlockIndex++) { int mask = 0; int miniStart = miniBlockIndex * config.miniBlockSizeInValues; //The end of current mini block could be the end of current block(deltaValuesToFlush) buffer when data is not aligned to mini block int miniEnd = Math.min((miniBlockIndex + 1) * config.miniBlockSizeInValues, deltaValuesToFlush); for (int i = miniStart; i < miniEnd; i++) { mask |= deltaBlockBuffer[i]; } bitWidths[miniBlockIndex] = 32 - Integer.numberOfLeadingZeros(mask); } } private int getMiniBlockCountToFlush(double numberCount) { return (int) Math.ceil(numberCount / config.miniBlockSizeInValues); } /** * getBytes will trigger flushing block buffer, DO NOT write after getBytes() is called without calling reset() * * @return */ @Override public BytesInput getBytes() { //The Page Header should include: blockSizeInValues, numberOfMiniBlocks, totalValueCount if (deltaValuesToFlush != 0) { flushBlockBuffer(); } return BytesInput.concat( config.toBytesInput(), BytesInput.fromUnsignedVarInt(totalValueCount), BytesInput.fromZigZagVarInt(firstValue), BytesInput.from(baos)); } @Override public Encoding getEncoding() { return Encoding.DELTA_BINARY_PACKED; } @Override public void reset() { this.totalValueCount = 0; this.baos.reset(); this.deltaValuesToFlush = 0; this.minDeltaInCurrentBlock = Integer.MAX_VALUE; } @Override public long getAllocatedSize() { return baos.getCapacity(); } @Override public String memUsageString(String prefix) { return String.format("%s DeltaBinaryPacking %d bytes", prefix, getAllocatedSize()); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy