// Source: org.apache.orc.impl.RunLengthIntegerWriterV2
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc.impl;
import java.io.IOException;
/**
* A writer that performs lightweight compression over a sequence of integers.
*
* There are four types of lightweight integer compression
*
* - SHORT_REPEAT
* - DIRECT
* - PATCHED_BASE
* - DELTA
*
* The description and format for these types are as below:
* SHORT_REPEAT: Used for short repeated integer sequences.
*
* - 1 byte header
*
* - 2 bits for encoding type
* - 3 bits for bytes required for repeating value
* - 3 bits for repeat count (MIN_REPEAT + run length)
*
*
* - Blob - repeat value (fixed bytes)
*
*
* DIRECT: Used for random integer sequences whose number of bit
* requirement doesn't vary a lot.
*
* - 2 byte header (1st byte)
*
* - 2 bits for encoding type
* - 5 bits for fixed bit width of values in blob
* - 1 bit for storing MSB of run length
*
* - 2nd byte
*
* - 8 bits for lower run length bits
*
*
* - Blob - stores the direct values using fixed bit width. The length of the
* data blob is (fixed width * run length) bits long
*
*
* PATCHED_BASE: Used for random integer sequences whose number of bit
* requirement varies beyond a threshold.
*
* - 4 bytes header (1st byte)
*
* - 2 bits for encoding type
* - 5 bits for fixed bit width of values in blob
* - 1 bit for storing MSB of run length
*
* - 2nd byte
*
* - 8 bits for lower run length bits
*
* - 3rd byte
*
* - 3 bits for bytes required to encode base value
* - 5 bits for patch width
*
* - 4th byte
*
* - 3 bits for patch gap width
* - 5 bits for patch length
*
*
* - Base value - Stored using fixed number of bytes. If MSB is set, base
* value is negative else positive. Length of base value is (base width * 8)
* bits.
* - Data blob - Base reduced values as stored using fixed bit width. Length
* of data blob is (fixed width * run length) bits.
* - Patch blob - Patch blob is a list of gap and patch value. Each entry in
* the patch list is (patch width + patch gap width) bits long. Gap between the
* subsequent elements to be patched are stored in upper part of entry whereas
* patch values are stored in lower part of entry. Length of patch blob is
* ((patch width + patch gap width) * patch length) bits.
*
*
* DELTA: Used for monotonically increasing or decreasing sequences,
* sequences with fixed delta values or long repeated sequences.
*
* - 2 bytes header (1st byte)
*
* - 2 bits for encoding type
* - 5 bits for fixed bit width of values in blob
* - 1 bit for storing MSB of run length
*
* - 2nd byte
*
* - 8 bits for lower run length bits
*
* - Base value - zigzag encoded value written as varint
* - Delta base - zigzag encoded value written as varint
* - Delta blob - only positive values. monotonicity and orderness are decided
* based on the sign of the base value and delta base
*
*/
public class RunLengthIntegerWriterV2 implements IntegerWriter {
/**
* The sub-encodings this writer can choose between for a buffered run.
*
* The declaration order is significant: getOpcode() shifts the constant's
* ordinal() into the two most significant bits of the first header byte, so
* reordering or inserting constants would change the bytes that are written.
*/
public enum EncodingType {
SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA
}
// Capacity of every per-run buffer below; write() emits the buffered run as
// soon as the fixed or the variable run length reaches this value.
static final int MAX_SCOPE = 512;
// Shortest run of identical values that is handled as a repeat. It is also
// subtracted from the run length stored in the SHORT_REPEAT header.
static final int MIN_REPEAT = 3;
// Longest fixed run written as SHORT_REPEAT; longer fixed runs are written
// as DELTA with isFixedDelta set.
private static final int MAX_SHORT_REPEAT_LENGTH = 10;
// Difference between the two most recently buffered values.
private long prevDelta = 0;
// Length of the run of identical values currently being tracked.
private int fixedRunLength = 0;
// Length of the run of non-repeating values currently being tracked.
private int variableRunLength = 0;
// Values passed to write() that have not been emitted yet.
private final long[] literals = new long[MAX_SCOPE];
// Destination of all encoded bytes.
private final PositionedOutputStream output;
// When true, values are passed through utils.zigzagEncode() before their
// widths are measured and the DELTA base value is written with writeVslong.
private final boolean signed;
// Encoding selected for the buffered run; null after clear().
private EncodingType encoding;
// Number of valid entries in literals.
private int numLiterals;
// Filled by computeZigZagLiterals(); written by DIRECT encoding.
private final long[] zigzagLiterals = new long[MAX_SCOPE];
// literals[i] - min, filled by determineEncoding() for PATCHED_BASE.
private final long[] baseRedLiterals = new long[MAX_SCOPE];
// adjDeltas[0] is the signed first delta; later entries are absolute
// differences between neighbouring literals (see determineEncoding()).
private final long[] adjDeltas = new long[MAX_SCOPE];
// Common delta written by DELTA encoding when isFixedDelta is true.
private long fixedDelta;
// Bit widths reported by utils.percentileBits() for the zigzag values at
// the 0.9 and 1.0 percentiles.
private int zzBits90p;
private int zzBits100p;
// Bit widths reported by utils.percentileBits() for the base reduced values
// at the 0.95 and 1.0 percentiles.
private int brBits95p;
private int brBits100p;
// Bit width used to pack the DELTA blob (width of the largest adjDeltas
// entry after index 0).
private int bitsDeltaMax;
// PATCHED_BASE patch list description, computed by preparePatchedBlob().
private int patchWidth;
private int patchGapWidth;
private int patchLength;
// Each entry holds a gap in the bits above patchWidth and a patch value in
// the low patchWidth bits.
private long[] gapVsPatchList;
// Smallest buffered literal; used as the PATCHED_BASE base value.
private long min;
// True when every delta between neighbouring literals is the same.
private boolean isFixedDelta;
// Helper providing bit-width computations and the low level writers.
private SerializationUtils utils;
// When true, DIRECT and DELTA widths are passed through
// utils.getClosestAlignedFixedBits() before use.
private boolean alignedBitpacking;
/**
* Creates a writer with aligned bit packing enabled by delegating to the
* three-argument constructor with alignedBitpacking set to true.
*
* @param output stream that receives the encoded bytes
* @param signed passed through unchanged to the three-argument constructor
*/
RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) {
this(output, signed, true);
}
/**
 * Creates a writer that encodes values onto the given stream.
 *
 * @param output stream that receives the encoded bytes
 * @param signed when true, values are zigzag encoded before their bit
 *               widths are measured
 * @param alignedBitpacking when true, DIRECT and DELTA bit widths are
 *               passed through getClosestAlignedFixedBits() before use
 */
public RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed,
    boolean alignedBitpacking) {
  this.utils = new SerializationUtils();
  this.alignedBitpacking = alignedBitpacking;
  this.signed = signed;
  this.output = output;
  // begin with an empty buffer and no encoding selected
  clear();
}
/**
 * Emits the buffered run with the writer that matches the currently
 * selected encoding and then resets the per-run state. Does nothing when no
 * literals are buffered.
 *
 * @throws IOException if the underlying writer fails
 */
private void writeValues() throws IOException {
  if (numLiterals == 0) {
    return;
  }
  switch (encoding) {
    case SHORT_REPEAT:
      writeShortRepeatValues();
      break;
    case DIRECT:
      writeDirectValues();
      break;
    case PATCHED_BASE:
      writePatchedBaseValues();
      break;
    default:
      // DELTA is the only remaining encoding
      writeDeltaValues();
      break;
  }
  // the run is on the stream; forget everything about it
  clear();
}
/**
 * Writes the buffered run using DELTA encoding: a two byte header, the
 * first literal, the first delta and, unless the delta is fixed, the bit
 * packed remaining deltas.
 *
 * @throws IOException if writing to the output fails
 */
private void writeDeltaValues() throws IOException {
  int bitWidth = bitsDeltaMax;
  if (alignedBitpacking) {
    bitWidth = utils.getClosestAlignedFixedBits(bitWidth);
  }

  final int runLength;
  int encodedWidth = 0;
  if (!isFixedDelta) {
    // an encoded width of 0 is reserved for fixed-delta runs, so a delta
    // blob that would need a single bit is written with two bits instead
    if (bitWidth == 1) {
      bitWidth = 2;
    }
    encodedWidth = utils.encodeBitWidth(bitWidth) << 1;
    runLength = variableRunLength - 1;
    variableRunLength = 0;
  } else if (fixedRunLength > MIN_REPEAT) {
    // run of identical values (delta 0), ex. sequence: 2 2 2 2 2 2 2 2
    runLength = fixedRunLength - 1;
    fixedRunLength = 0;
  } else {
    // constant non-zero delta, ex. sequence: 4 6 8 10 12 14 16
    runLength = variableRunLength - 1;
    variableRunLength = 0;
  }

  // header: opcode, encoded width and 9th run-length bit in the first byte,
  // the low 8 run-length bits in the second byte
  final int ninthBit = (runLength & 0x100) >>> 8;
  final int firstHeaderByte = getOpcode() | encodedWidth | ninthBit;
  final int secondHeaderByte = runLength & 0xff;
  output.write(firstHeaderByte);
  output.write(secondHeaderByte);

  // base value: the first buffered literal
  if (signed) {
    utils.writeVslong(output, literals[0]);
  } else {
    utils.writeVulong(output, literals[0]);
  }

  if (isFixedDelta) {
    // a single delta describes the whole run; no delta blob is needed
    utils.writeVslong(output, fixedDelta);
    return;
  }

  // the first delta carries the sign and is written on its own
  utils.writeVslong(output, adjDeltas[0]);
  // n literals have n - 1 deltas; one was just written, so numLiterals - 2
  // remain and are bit packed starting at index 1
  utils.writeInts(adjDeltas, 1, numLiterals - 2, bitWidth, output);
}
/**
 * Writes the buffered run using PATCHED_BASE encoding: a four byte header,
 * the base value, the bit packed base reduced literals and the patch list
 * prepared by preparePatchedBlob().
 *
 * Aligned bit packing is not applied here because a patch supplies the most
 * significant bits of a value (actual_value = patch << width | base_value);
 * widening the base value would make that reconstruction impossible.
 *
 * @throws IOException if writing to the output fails
 */
private void writePatchedBaseValues() throws IOException {
  final int valueWidth = brBits95p;
  final int encodedWidth = utils.encodeBitWidth(valueWidth) << 1;

  // the header stores run length - 1
  variableRunLength -= 1;
  final int ninthBit = (variableRunLength & 0x100) >>> 8;
  final int firstHeaderByte = getOpcode() | encodedWidth | ninthBit;
  final int secondHeaderByte = variableRunLength & 0xff;

  // the base is stored as magnitude plus a sign bit
  final boolean negativeBase = min < 0;
  if (negativeBase) {
    min = -min;
  }

  // one extra bit for the sign, then round up to whole bytes
  final int baseBits = utils.findClosestNumBits(min) + 1;
  final int baseByteCount = (baseBits + 7) / 8;
  if (negativeBase) {
    // flag a negative base in the most significant bit of the stored bytes
    min |= (1L << ((baseByteCount * 8) - 1));
  }

  // third byte: 3 bits for (base bytes - 1), 5 bits for the patch width
  final int thirdHeaderByte =
      ((baseByteCount - 1) << 5) | utils.encodeBitWidth(patchWidth);
  // fourth byte: 3 bits for (patch gap width - 1), 5 bits for patch length
  final int fourthHeaderByte = ((patchGapWidth - 1) << 5) | patchLength;

  output.write(firstHeaderByte);
  output.write(secondHeaderByte);
  output.write(thirdHeaderByte);
  output.write(fourthHeaderByte);

  // base value, most significant byte first
  for (int shift = (baseByteCount - 1) * 8; shift >= 0; shift -= 8) {
    final byte baseByte = (byte) ((min >>> shift) & 0xff);
    output.write(baseByte);
  }

  // bit packed base reduced literals
  final int dataBits = utils.getClosestFixedBits(valueWidth);
  utils.writeInts(baseRedLiterals, 0, numLiterals, dataBits, output);

  // bit packed gap/patch entries
  final int entryBits = utils.getClosestFixedBits(patchGapWidth + patchWidth);
  utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, entryBits, output);

  variableRunLength = 0;
}
/**
 * Returns the first header byte with only the encoding type filled in: the
 * ordinal of the current encoding placed in the two most significant bits.
 *
 * @return opcode
 */
private int getOpcode() {
  final int typeId = encoding.ordinal();
  return typeId << 6;
}
/**
 * Writes the buffered run using DIRECT encoding: a two byte header followed
 * by the zigzag literals bit packed at a single fixed width.
 *
 * @throws IOException if writing to the output fails
 */
private void writeDirectValues() throws IOException {
  // width that fits every zigzag literal, optionally aligned
  final int bitWidth = alignedBitpacking
      ? utils.getClosestAlignedFixedBits(zzBits100p)
      : zzBits100p;
  final int encodedWidth = utils.encodeBitWidth(bitWidth) << 1;

  // the header stores run length - 1
  variableRunLength -= 1;
  final int ninthBit = (variableRunLength & 0x100) >>> 8;
  final int firstHeaderByte = getOpcode() | encodedWidth | ninthBit;
  final int secondHeaderByte = variableRunLength & 0xff;

  output.write(firstHeaderByte);
  output.write(secondHeaderByte);

  utils.writeInts(zigzagLiterals, 0, numLiterals, bitWidth, output);

  variableRunLength = 0;
}
/**
 * Writes the buffered fixed run using SHORT_REPEAT encoding: a one byte
 * header followed by the repeated value in as few whole bytes as it needs.
 *
 * @throws IOException if writing to the output fails
 */
private void writeShortRepeatValues() throws IOException {
  // every literal in the run is the same, so the first one is enough
  final long repeated = signed ? utils.zigzagEncode(literals[0]) : literals[0];

  // bits needed for the value, rounded up to whole bytes
  final int valueBits = utils.findClosestNumBits(repeated);
  final int valueBytes = (valueBits + 7) >>> 3;

  // header layout: 2 bits type | 3 bits (bytes - 1) | 3 bits (run - MIN_REPEAT)
  int header = getOpcode() | ((valueBytes - 1) << 3);
  fixedRunLength -= MIN_REPEAT;
  header |= fixedRunLength;
  output.write(header);

  // value bytes, most significant first
  for (int shift = (valueBytes - 1) * 8; shift >= 0; shift -= 8) {
    final int valueByte = (int) ((repeated >>> shift) & 0xff);
    output.write(valueByte);
  }

  fixedRunLength = 0;
}
/**
 * Inspects the buffered literals and sets {@code encoding} to DIRECT, DELTA
 * or PATCHED_BASE, filling in the fields the matching writer needs
 * (zigzag literals and widths, adjacent deltas, min, fixed delta and, for
 * PATCHED_BASE, the base reduced literals and patch list).
 */
private void determineEncoding() {
  // DIRECT is the fallback on every early exit, so the zigzag values and
  // their full width are always computed up front
  computeZigZagLiterals();
  zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0);

  // analysing very short runs is not worth the effort
  if (numLiterals <= MIN_REPEAT) {
    encoding = EncodingType.DIRECT;
    return;
  }

  // single pass collecting min/max, monotonicity and the adjacent deltas
  boolean nonDecreasing = true;
  boolean nonIncreasing = true;
  isFixedDelta = true;
  min = literals[0];
  long max = literals[0];
  final long firstDelta = literals[1] - literals[0];
  long lastDelta = 0;
  long largestDelta = 0;
  adjDeltas[0] = firstDelta;

  for (int idx = 1; idx < numLiterals; idx++) {
    final long previous = literals[idx - 1];
    final long current = literals[idx];
    lastDelta = current - previous;
    min = Math.min(min, current);
    max = Math.max(max, current);

    nonDecreasing &= (previous <= current);
    nonIncreasing &= (previous >= current);
    isFixedDelta &= (lastDelta == firstDelta);

    // slot 0 keeps the signed first delta; the rest store magnitudes only
    if (idx > 1) {
      final long magnitude = Math.abs(lastDelta);
      adjDeltas[idx - 1] = magnitude;
      largestDelta = Math.max(largestDelta, magnitude);
    }
  }

  // if max - min overflows, skip the DELTA and PATCHED_BASE checks; DIRECT
  // is faster and has less overhead than PATCHED_BASE
  if (!utils.isSafeSubtract(max, min)) {
    encoding = EncodingType.DIRECT;
    return;
  }
  // from here on, subtracting any literal from any other cannot overflow

  // all values identical: a fixed run too long for SHORT_REPEAT
  if (min == max) {
    assert isFixedDelta : min + "==" + max +
        ", isFixedDelta cannot be false";
    assert lastDelta == 0 : min + "==" + max + ", currDelta should be zero";
    fixedDelta = 0;
    encoding = EncodingType.DELTA;
    return;
  }

  // constant non-zero delta
  if (isFixedDelta) {
    assert lastDelta == firstDelta
        : "currDelta should be equal to initialDelta for fixed delta encoding";
    encoding = EncodingType.DELTA;
    fixedDelta = lastDelta;
    return;
  }

  // a zero first delta cannot tell whether the run rises or falls, so DELTA
  // is only considered when the first delta is non-zero
  if (firstDelta != 0) {
    // width needed to bit pack the delta blob
    bitsDeltaMax = utils.findClosestNumBits(largestDelta);
    if (nonDecreasing || nonIncreasing) {
      encoding = EncodingType.DELTA;
      return;
    }
  }

  // PATCHED_BASE only pays off when the widest zigzag values need more than
  // one bit beyond the 90th percentile width
  zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9);
  if (zzBits100p - zzBits90p <= 1) {
    encoding = EncodingType.DIRECT;
    return;
  }

  // patching works on base reduced values: subtract the minimum
  for (int idx = 0; idx < numLiterals; idx++) {
    baseRedLiterals[idx] = literals[idx] - min;
  }
  // values wider than the 95th percentile width get patched
  brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
  // the full width bounds the patch width
  brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0);

  // the trigger above used zigzag widths, but patches are built from the
  // base reduced values; with equal widths there is nothing to patch
  if (brBits100p == brBits95p) {
    encoding = EncodingType.DIRECT;
    return;
  }

  encoding = EncodingType.PATCHED_BASE;
  preparePatchedBlob();
}
/**
 * Fills the first {@code numLiterals} slots of {@code zigzagLiterals}: the
 * zigzag encoding of each literal for signed writers, or the literal itself
 * for unsigned writers.
 */
private void computeZigZagLiterals() {
  for (int idx = 0; idx < numLiterals; idx++) {
    final long literal = literals[idx];
    zigzagLiterals[idx] = signed ? utils.zigzagEncode(literal) : literal;
  }
}
/**
* Builds the PATCHED_BASE patch list from the base reduced literals.
*
* Every base reduced literal above the brBits95p mask is truncated to
* brBits95p bits in place, and its upper bits together with the distance to
* the previously patched index are recorded. On return patchWidth,
* patchGapWidth, patchLength and gapVsPatchList describe the patch list.
* Reads brBits95p, brBits100p, numLiterals and baseRedLiterals; may lower
* brBits95p when the patch width would otherwise be 64.
*/
private void preparePatchedBlob() {
// mask will be max value beyond which patch will be generated
long mask = (1L << brBits95p) - 1;
// since we are considering only 95 percentile, the size of gap and
// patch array can contain only be 5% values
// NOTE(review): the arrays below are sized from this estimate; presumably
// percentileBits guarantees no more than 5% of the values exceed the mask
// (also after the patchWidth == 64 adjustment) - confirm.
patchLength = (int) Math.ceil((numLiterals * 0.05));
int[] gapList = new int[patchLength];
long[] patchList = new long[patchLength];
// #bit for patch
patchWidth = brBits100p - brBits95p;
patchWidth = utils.getClosestFixedBits(patchWidth);
// if patch bit requirement is 64 then it will not possible to pack
// gap and patch together in a long. To make sure gap and patch can be
// packed together adjust the patch width
if (patchWidth == 64) {
patchWidth = 56;
brBits95p = 8;
mask = (1L << brBits95p) - 1;
}
int gapIdx = 0;
int patchIdx = 0;
int prev = 0;
int gap = 0;
int maxGap = 0;
for(int i = 0; i < numLiterals; i++) {
// if value is above mask then create the patch and record the gap
if (baseRedLiterals[i] > mask) {
gap = i - prev;
if (gap > maxGap) {
maxGap = gap;
}
// gaps are relative, so store the previous patched value index
prev = i;
gapList[gapIdx++] = gap;
// extract the most significant bits that are over mask bits
long patch = baseRedLiterals[i] >>> brBits95p;
patchList[patchIdx++] = patch;
// strip off the MSB to enable safe bit packing
baseRedLiterals[i] &= mask;
}
}
// adjust the patch length to number of entries in gap list
patchLength = gapIdx;
// if the element to be patched is the first and only element then
// max gap will be 0, but to store the gap as 0 we need atleast 1 bit
if (maxGap == 0 && patchLength != 0) {
patchGapWidth = 1;
} else {
patchGapWidth = utils.findClosestNumBits(maxGap);
}
// special case: if the largest gap needs more than 8 bits (gap > 255) it
// cannot be described by the 3 bit gap width field in the header. Such a
// gap is split into filler entries that carry a gap of 255 and a patch
// value of 0, followed by an entry with the remaining gap and the actual
// patch value:
// 255 gap => 0 for patch value
// actual gap - 255 => actual patch value
// A gap of 511 (element to be patched is the last element in the scope)
// needs two filler entries, i.e. 3 entries in the patch list:
// 255 gap => 0 for patch value
// 255 gap => 0 for patch value
// 1 gap => actual patch value
if (patchGapWidth > 8) {
patchGapWidth = 8;
// for gap = 511, we need two additional entries in patch list
if (maxGap == 511) {
patchLength += 2;
} else {
patchLength += 1;
}
}
// create gap vs patch list
gapIdx = 0;
patchIdx = 0;
gapVsPatchList = new long[patchLength];
for(int i = 0; i < patchLength; i++) {
long g = gapList[gapIdx++];
long p = patchList[patchIdx++];
// emit filler entries (gap 255, patch 0) until the remaining gap fits;
// each filler consumes one output slot, hence the extra i++
while (g > 255) {
gapVsPatchList[i++] = (255L << patchWidth);
g -= 255;
}
// store patch value in LSBs and gap in MSBs
gapVsPatchList[i] = (g << patchWidth) | p;
}
}
/**
 * Resets the per-run encoder state after a run has been written. The
 * fixed and variable run lengths are not touched here; the individual
 * writers and write() maintain them.
 */
private void clear() {
  // buffered run and chosen encoding
  numLiterals = 0;
  encoding = null;
  min = 0;

  // delta tracking
  prevDelta = 0;
  fixedDelta = 0;
  isFixedDelta = true;
  bitsDeltaMax = 0;

  // percentile bit widths
  zzBits90p = 0;
  zzBits100p = 0;
  brBits95p = 0;
  brBits100p = 0;

  // patch list description
  patchWidth = 0;
  patchGapWidth = 0;
  patchLength = 0;
  gapVsPatchList = null;
}
/**
 * Writes any buffered values and then flushes the underlying stream.
 *
 * A pending variable run goes through determineEncoding(). A pending fixed
 * run shorter than MIN_REPEAT is converted to a variable run first; one of
 * at most MAX_SHORT_REPEAT_LENGTH values is written as SHORT_REPEAT, and a
 * longer one as fixed-delta DELTA.
 *
 * @throws IOException if writing or flushing fails
 */
@Override
public void flush() throws IOException {
  if (numLiterals != 0 && (variableRunLength != 0 || fixedRunLength != 0)) {
    if (variableRunLength == 0 && fixedRunLength < MIN_REPEAT) {
      // too short to be a repeat: hand it over to the variable run path
      variableRunLength = fixedRunLength;
      fixedRunLength = 0;
    }

    if (variableRunLength != 0) {
      determineEncoding();
    } else if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
      encoding = EncodingType.SHORT_REPEAT;
    } else {
      encoding = EncodingType.DELTA;
      isFixedDelta = true;
    }
    writeValues();
  }
  output.flush();
}
@Override
public void write(long val) throws IOException {
if (numLiterals == 0) {
initializeLiterals(val);
} else {
if (numLiterals == 1) {
prevDelta = val - literals[0];
literals[numLiterals++] = val;
// if both values are same count as fixed run else variable run
if (val == literals[0]) {
fixedRunLength = 2;
variableRunLength = 0;
} else {
fixedRunLength = 0;
variableRunLength = 2;
}
} else {
long currentDelta = val - literals[numLiterals - 1];
if (prevDelta == 0 && currentDelta == 0) {
// fixed delta run
literals[numLiterals++] = val;
// if variable run is non-zero then we are seeing repeating
// values at the end of variable run in which case keep
// updating variable and fixed runs
if (variableRunLength > 0) {
fixedRunLength = 2;
}
fixedRunLength += 1;
// if fixed run met the minimum condition and if variable
// run is non-zero then flush the variable run and shift the
// tail fixed runs to start of the buffer
if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
numLiterals -= MIN_REPEAT;
variableRunLength -= MIN_REPEAT - 1;
// copy the tail fixed runs
long[] tailVals = new long[MIN_REPEAT];
System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT);
// determine variable encoding and flush values
determineEncoding();
writeValues();
// shift tail fixed runs to beginning of the buffer
for(long l : tailVals) {
literals[numLiterals++] = l;
}
}
// if fixed runs reached max repeat length then write values
if (fixedRunLength == MAX_SCOPE) {
determineEncoding();
writeValues();
}
} else {
// variable delta run
// if fixed run length is non-zero and if it satisfies the
// short repeat conditions then write the values as short repeats
// else use delta encoding
if (fixedRunLength >= MIN_REPEAT) {
if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
encoding = EncodingType.SHORT_REPEAT;
writeValues();
} else {
encoding = EncodingType.DELTA;
isFixedDelta = true;
writeValues();
}
}
// if fixed run length is 0 && fixedRunLength < MIN_REPEAT) {
if (val != literals[numLiterals - 1]) {
variableRunLength = fixedRunLength;
fixedRunLength = 0;
}
}
// after writing values re-initialize the variables
if (numLiterals == 0) {
initializeLiterals(val);
} else {
// keep updating variable run lengths
prevDelta = val - literals[numLiterals - 1];
literals[numLiterals++] = val;
variableRunLength += 1;
// if variable run length reach the max scope, write it
if (variableRunLength == MAX_SCOPE) {
determineEncoding();
writeValues();
}
}
}
}
}
}
/**
 * Stores {@code val} in the next free slot of the literal buffer and marks
 * both the fixed and the variable run as having length one.
 *
 * @param val the value to buffer
 */
private void initializeLiterals(long val) {
  literals[numLiterals++] = val;
  variableRunLength = 1;
  fixedRunLength = 1;
}
/**
* Records the current position: first the position of the underlying
* output stream, then the number of literals that are buffered but not yet
* written.
*
* @param recorder receives the position entries
* @throws IOException if the output stream cannot report its position
*/
@Override
public void getPosition(PositionRecorder recorder) throws IOException {
output.getPosition(recorder);
recorder.addPosition(numLiterals);
}
/**
* Returns the buffer size reported by the underlying output stream. The
* fixed-size arrays held by this writer are not included.
*
* @return the value of output.getBufferSize()
*/
@Override
public long estimateMemory() {
return output.getBufferSize();
}
}