/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.column.values.delta;

import parquet.bytes.BytesUtils;
import parquet.column.values.ValuesReader;
import parquet.column.values.bitpacking.BytePacker;
import parquet.column.values.bitpacking.Packer;
import parquet.io.ParquetDecodingException;

import java.io.ByteArrayInputStream;
import java.io.IOException;
/**
* Read values written by {@link DeltaBinaryPackingValuesWriter}
*
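 * <p>A minimal usage sketch (illustrative only; {@code pageBytes}, {@code offset} and
 * {@code valueCount} are assumed to come from the surrounding page reader):
 * <pre>
 *   DeltaBinaryPackingValuesReader reader = new DeltaBinaryPackingValuesReader();
 *   reader.initFromPage(valueCount, pageBytes, offset);
 *   for (int i = 0; i &lt; valueCount; i++) {
 *     int value = reader.readInteger();
 *   }
 * </pre>
 *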
* @author Tianshuo Deng
*/
public class DeltaBinaryPackingValuesReader extends ValuesReader {
private int totalValueCount;
/**
* values read by the caller
*/
private int valuesRead;
private int minDeltaInCurrentBlock;
private byte[] page;
/**
* stores the decoded values, including the first value, which is written to the header
*/
private int[] valuesBuffer;
/**
* number of values loaded into the buffer; this can exceed totalValueCount when the
* data is not aligned to a mini block boundary, in which case the buffer ends with padding 0s
*/
private int valuesBuffered;
private ByteArrayInputStream in;
private int nextOffset;
private DeltaBinaryPackingConfig config;
private int[] bitWidths;
/**
* Eagerly loads all the data into memory.
*
* @param valueCount count of values in this page
* @param page the array to read from, containing the page data (repetition levels, definition levels, data)
* @param offset where to start reading from in the page
* @throws IOException if the page data cannot be read
*/
@Override
public void initFromPage(int valueCount, byte[] page, int offset) throws IOException {
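// page layout: <block size in values> <number of mini blocks in a block> <total value count> <first value>,
// followed by the blocks themselves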
in = new ByteArrayInputStream(page, offset, page.length - offset);
this.config = DeltaBinaryPackingConfig.readConfig(in);
this.page = page;
this.totalValueCount = BytesUtils.readUnsignedVarInt(in);
allocateValuesBuffer();
bitWidths = new int[config.miniBlockNumInABlock];
//read first value from header
valuesBuffer[valuesBuffered++] = BytesUtils.readZigZagVarInt(in);
while (valuesBuffered < totalValueCount) { //valuesBuffered could be more than totalValueCount, since we flush on a mini block basis
loadNewBlockToBuffer();
}
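// offset in the page array of the first byte after this reader's data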
this.nextOffset = page.length - in.available();
}
@Override
public int getNextOffset() {
return nextOffset;
}
/**
* The values buffer is allocated so that its size is a multiple of the mini block size,
* because data is flushed on a mini block basis when writing.
*/
private void allocateValuesBuffer() {
int totalMiniBlockCount = (int) Math.ceil((double) totalValueCount / config.miniBlockSizeInValues);
//+ 1 because first value written to header is also stored in values buffer
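//e.g. totalValueCount = 10 and miniBlockSizeInValues = 8 => 2 mini blocks => 2 * 8 + 1 = 17 slots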
valuesBuffer = new int[totalMiniBlockCount * config.miniBlockSizeInValues + 1];
}
@Override
public void skip() {
checkRead();
valuesRead++;
}
@Override
public int readInteger() {
checkRead();
return valuesBuffer[valuesRead++];
}
private void checkRead() {
if (valuesRead >= totalValueCount) {
throw new ParquetDecodingException("no more values to read, total value count is " + totalValueCount);
}
}
private void loadNewBlockToBuffer() {
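// block layout: <min delta (zigzag varint)> <bit width of each mini block (1 byte each)> <packed mini blocks>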
try {
minDeltaInCurrentBlock = BytesUtils.readZigZagVarInt(in);
} catch (IOException e) {
throw new ParquetDecodingException("cannot read min delta in current block", e);
}
readBitWidthsForMiniBlocks();
// a mini block is the atomic unit for reading: keep reading mini blocks while there are values left
int i;
for (i = 0; i < config.miniBlockNumInABlock && valuesBuffered < totalValueCount; i++) {
BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidths[i]);
unpackMiniBlock(packer);
}
//calculate values from deltas unpacked for current block
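// each stored delta was shifted by minDeltaInCurrentBlock when writing,
// so reconstruct with: value[i] = value[i - 1] + minDelta + storedDelta[i]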
int valuesUnpacked = i * config.miniBlockSizeInValues;
for (int j = valuesBuffered - valuesUnpacked; j < valuesBuffered; j++) {
  valuesBuffer[j] += minDeltaInCurrentBlock + valuesBuffer[j - 1];
}
}
/**
* A mini block has a size of 8*n values, so we unpack 8 values at a time.
*
* @param packer the packer created from the bit width of the current mini block
*/
private void unpackMiniBlock(BytePacker packer) {
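// miniBlockSizeInValues is a multiple of 8, so this loop consumes the mini block exactly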
for (int j = 0; j < config.miniBlockSizeInValues; j += 8) {
unpack8Values(packer);
}
}
private void unpack8Values(BytePacker packer) {
//calculate the position in the underlying array, because the packer API works on an array, not a stream
int pos = page.length - in.available();
packer.unpack8Values(page, pos, valuesBuffer, valuesBuffered);
this.valuesBuffered += 8;
//sync the stream position: 8 values at bitWidth bits each occupy exactly bitWidth bytes
in.skip(packer.getBitWidth());
}
private void readBitWidthsForMiniBlocks() {
for (int i = 0; i < config.miniBlockNumInABlock; i++) {
try {
bitWidths[i] = BytesUtils.readIntLittleEndianOnOneByte(in);
} catch (IOException e) {
throw new ParquetDecodingException("cannot decode bit width in block header", e);
}
}
}
}